summary refs log tree commit diff stats
path: root/third_party/dav1d/src/x86/mc16_avx2.asm
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:44:51 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:44:51 +0000
commit 9e3c08db40b8916968b9f30096c7be3f00ce9647 (patch)
tree a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/dav1d/src/x86/mc16_avx2.asm
parent Initial commit. (diff)
downloadthunderbird-upstream.tar.xz
thunderbird-upstream.zip
Adding upstream version 1:115.7.0. (tags: upstream/1%115.7.0, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/mc16_avx2.asm')
-rw-r--r-- third_party/dav1d/src/x86/mc16_avx2.asm 5879
1 files changed, 5879 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
new file mode 100644
index 0000000000..61eeaa1007
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -0,0 +1,5879 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+; dav1d_obmc_masks[] * -512 (64 signed words; every entry is a multiple of -512, e.g. -9728 = 19 * -512)
+const obmc_masks_avx2
+ dw 0, 0, -9728, 0, -12800, -7168, -2560, 0 ; NOTE(review): layout looks indexed by block width (w=2 data at word 2, w=4 data at words 4..7) - confirm against dav1d_obmc_masks
+ dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0
+ dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120
+ dw -4096, -3072, -2048, -1536, 0, 0, 0, 0
+ dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
+ dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608
+ dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024
+ dw 0, 0, 0, 0, 0, 0, 0, 0
+
+deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 ; dword indices alternating between the low half (0-3) and the high half (4-7)
+subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 ; byte shuffle giving word pairs 0 1 | 1 2 | 2 3 | 3 4
+subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 ; byte shuffle giving word pairs 2 3 | 3 4 | 4 5 | 5 6
+subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 ; words 0 1 2 3 | 1 2 3 4
+subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 ; words 0-3 repeated in both qwords
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ; interleaves the words of the low and high qword: 0 4 1 5 2 6 3 7
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 ; word 0 five times, then words 1-3
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 ; words 4-7, then word 7 four more times
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 ; word 0 x4, word 1 x4
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ; swaps the two words inside every dword
+bdct_lb_q: times 8 db 0 ; 32 bytes: byte indices 0, 4, 8, 12, each repeated 8 times
+ times 8 db 4
+ times 8 db 8
+ times 8 db 12
+
+prep_mul: dw 16, 16, 4, 4 ; two dword entries {16,16} and {4,4}; prep_bilin .prep picks one with bitdepth_max >> 11
+put_bilin_h_rnd: dw 8, 8, 10, 10 ; two dword entries; put_bilin .h picks one with bitdepth_max >> 11
+put_8tap_h_rnd: dd 34, 40 ; two dword entries; put_8tap .h picks one with bitdepth_max >> 11
+s_8tap_h_rnd: dd 2, 8
+s_8tap_h_sh: dd 2, 4
+put_s_8tap_v_rnd: dd 512, 128 ; first dword also serves as pd_512 (see the %define below)
+put_s_8tap_v_sh: dd 10, 8
+prep_8tap_1d_rnd: dd 8 - (8192 << 4)
+prep_8tap_2d_rnd: dd 32 - (8192 << 5)
+warp8x8t_rnd: dd 16384 - (8192 << 15)
+warp8x8_shift: dd 5, 3
+warp8x8_rnd: dw 4096, 4096, 16384, 16384
+bidir_rnd: dw -16400, -16400, -16388, -16388
+bidir_mul: dw 2048, 2048, 8192, 8192
+
+%define pw_16 prep_mul
+%define pd_512 put_s_8tap_v_rnd
+
+pw_2: times 2 dw 2 ; each "times 2 dw" constant fills one dword, so it can be loaded with vpbroadcastd
+pw_64: times 2 dw 64
+pw_2048: times 2 dw 2048
+pw_8192: times 2 dw 8192
+pw_27615: times 2 dw 27615
+pw_32766: times 2 dw 32766
+pw_m512: times 2 dw -512
+pd_32: dd 32
+pd_63: dd 63
+pd_64: dd 64
+pd_32768: dd 32768
+pd_65538: dd 65538
+pd_m524256: dd -524256 ; (-8192 << 6) + 32
+pd_0x3ff: dd 0x3ff
+pq_0x40000000: dq 0x40000000
+ dd 0
+
+%macro BIDIR_JMP_TABLE 2-* ; fn, isa, widths... - emits a table of 32-bit offsets to the .w<N> labels of <fn>_16bpc_<isa>
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table: ; first real entry; the exported <fn>_<isa>_table symbol sits 2*<first width> bytes below it
+ %rep %0 - 2 ; one entry per width argument
+ dd %%prefix %+ .w%3 - %%base ; offset of .w<N> relative to the biased table symbol
+ %rotate 1 ; make the next width %3
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128
+
+%macro BASE_JMP_TABLE 3-* ; fn, isa, widths... - emits 16-bit offsets of <fn>_<isa>_w<N> relative to <fn>_<isa>
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table: ; the exported table symbol is biased by <first width> bytes so that [table + tzcnt(w)*2] is the entry for w
+ %rep %0 - 2 ; one entry per width argument
+ dw %%base %+ _w%3 - %%base ; with %%base = put_avx2/prep_avx2 this resolves to the .put_w<N> / .prep_w<N> local label
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-* ; fn, filter type, isa, table bitmask (1 = h, 2 = v, 4 = hv), widths...
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1 ; horizontal table requested
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4 ; one 16-bit entry per width
+ dw %%prefix %+ .h_w%5 - %%base ; offset of .h_w<N> relative to <fn>_<isa> (e.g. put_avx2)
+ %rotate 1
+ %endrep
+ %rotate 4 ; finish the full rotation so %5 is the first width again
+ %endif
+ %if %%types & 2 ; vertical table requested
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base ; offset of .v_w<N>
+ %rotate 1
+ %endrep
+ %rotate 4 ; restore the original argument order
+ %endif
+ %if %%types & 4 ; 2-D (hv) table requested
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base ; offset of .hv_w<N>
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-* ; fn, isa, widths... - emits three 16-bit offset tables: .w<N>, .dy1_w<N> and .dy2_w<N>
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
+%%table:
+ %rep %0 - 2 ; one entry per width
+ dw %%base %+ .w%3 - %%base ; offset of .w<N> relative to the function entry point
+ %rotate 1
+ %endrep
+ %rotate 2 ; finish the full rotation so %3 is the first width again
+ %%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base ; offset of .dy1_w<N>
+ %rotate 1
+ %endrep
+ %rotate 2 ; restore the original argument order
+ %%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base ; offset of .dy2_w<N>
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+cextern mc_subpel_filters ; referenced through subpel_filters below, which is this symbol biased by -8 bytes
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter ; external table, not referenced in this part of the file
+cextern resize_filter ; external table, not referenced in this part of the file
+
+SECTION .text
+
+INIT_XMM avx2
+cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy ; 16 bpc bilinear put: picks the copy (.put), horizontal (.h), vertical (.v) or 2-D (.hv) path from mx/my
+ mov mxyd, r6m ; mx
+ lea r7, [put_avx2] ; base address that the 16-bit jump-table offsets are relative to
+%if UNIX64
+ DECLARE_REG_TMP 8
+ %define org_w r8d
+ mov r8d, wd ; keep the pixel width; wd is replaced by its tzcnt below
+%else
+ DECLARE_REG_TMP 7
+ %define org_w wm
+%endif
+ tzcnt wd, wm ; table index = trailing-zero count of the width
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h ; mx != 0: horizontal or 2-D filtering
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v ; mx == 0, my != 0: vertical filtering only
+.put: ; mx == my == 0: plain copy (also the base label of the put jump tables)
+ movzx wd, word [r7+wq*2+table_offset(put,)] ; offset of .put_w<N> from put_avx2
+ add wq, r7
+ jmp wq
+.put_w2: ; copy 4 bytes (2 pixels) per row, two rows per iteration
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1] ; r7 is reused as scratch: the table base is no longer needed
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4: ; copy 8 bytes (4 pixels) per row, two rows per iteration
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8: ; copy one xmm register (16 bytes) per row; unaligned load, aligned store
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+INIT_YMM avx2
+.put_w16: ; from here on m<N> are ymm registers: 32 bytes (16 pixels) per row
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32: ; 64 bytes per row, two rows per iteration
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64: ; 128 bytes per row, one row per iteration
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128: ; 256 bytes per row, copied as two batches of four registers
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ movu m0, [srcq+32*4]
+ movu m1, [srcq+32*5]
+ movu m2, [srcq+32*6]
+ movu m3, [srcq+32*7]
+ add srcq, ssq
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h: ; horizontal filter: dst = (a*(16-mx) + b*mx + rnd) >> 4, where b is the pixel to the right of a
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5 ; m5 = mx in every word
+ psubw m4, m5 ; m4 = 16 - mx in every word
+ test mxyd, mxyd
+ jnz .hv ; my != 0 as well: 2-D filtering
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ mov r6d, r8m ; bitdepth_max
+ add wq, r7
+ shr r6d, 11 ; index of the dword entry in put_bilin_h_rnd
+ vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4] ; m3 = rounding constant added before the >> 4
+ jmp wq
+.h_w2:
+ movq xm1, [srcq+ssq*0] ; row 0 in the low qword
+ movhps xm1, [srcq+ssq*1] ; row 1 in the high qword
+ lea srcq, [srcq+ssq*2]
+ pmullw xm0, xm4, xm1
+ psrlq xm1, 16 ; shift every qword down by one word: each lane now holds its right neighbour
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 4
+ movd [dstq+dsq*0], xm0 ; dword 0 = the two output pixels of row 0
+ pextrd [dstq+dsq*1], xm0, 2 ; dword 2 = the two output pixels of row 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ movq xm1, [srcq+ssq*0+2] ; same rows, one pixel (2 bytes) to the right
+ movhps xm1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw xm0, xm4
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 4
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8: ; two rows per iteration, one row per 128-bit lane
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ movu xm1, [srcq+ssq*0+2]
+ vinserti128 m1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16: ; two full-width rows per iteration, multiplying directly from memory
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+ssq*1]
+ pmullw m2, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32: ; one 32-pixel row (two registers) per iteration
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m5, [srcq+32*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m5, [srcq+32*1+2]
+ add srcq, ssq
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+.h_w128: ; shared path: each row is walked right-to-left in 32-pixel chunks
+ movifnidn t0d, org_w ; t0d = width in pixels
+.h_w64_loop0:
+ mov r6d, t0d ; r6 = pixels still to process in this row
+.h_w64_loop:
+ pmullw m0, m4, [srcq+r6*2-32*1]
+ pmullw m1, m5, [srcq+r6*2-32*1+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+r6*2-32*2]
+ pmullw m2, m5, [srcq+r6*2-32*2+2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+r6*2-32*1], m0
+ mova [dstq+r6*2-32*2], m1
+ sub r6d, 32
+ jg .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64_loop0
+ RET
+.v: ; vertical filter: dst = a + (((b - a)*my + 8) >> 4), where b is the row below a
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ shl mxyd, 11 ; my << 11, so pmulhrsw (x*c + 0x4000) >> 15 evaluates (x*my + 8) >> 4
+ movd xm5, mxyd
+ add wq, r7
+ vpbroadcastw m5, xm5
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0] ; xm0 carries the last loaded row into the next iteration
+.v_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq xm2, xm0, xm1 ; rows 0 1
+ movd xm0, [srcq+ssq*0]
+ punpckldq xm1, xm0 ; rows 1 2
+ psubw xm1, xm2
+ pmulhrsw xm1, xm5
+ paddw xm1, xm2
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xm0, [srcq+ssq*0]
+.v_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq xm2, xm0, xm1 ; rows 0 1
+ movq xm0, [srcq+ssq*0]
+ punpcklqdq xm1, xm0 ; rows 1 2
+ psubw xm1, xm2
+ pmulhrsw xm1, xm5
+ paddw xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xm0, [srcq+ssq*0]
+.v_w8_loop:
+ vbroadcasti128 m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m2, m0, m1, 0xf0 ; rows 0 1 (one row per lane)
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m1, m0, 0xf0 ; rows 1 2
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w32: ; m0/m1 hold the two halves of the previous row, m2/m3 the next one
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+.v_w32_loop:
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m4, m2, m0
+ pmulhrsw m4, m5
+ paddw m4, m0
+ movu m0, [srcq+ssq*0+32*0]
+ mova [dstq+dsq*0+32*0], m4
+ psubw m4, m3, m1
+ pmulhrsw m4, m5
+ paddw m4, m1
+ movu m1, [srcq+ssq*0+32*1]
+ mova [dstq+dsq*0+32*1], m4
+ psubw m4, m0, m2
+ pmulhrsw m4, m5
+ paddw m4, m2
+ mova [dstq+dsq*1+32*0], m4
+ psubw m4, m1, m3
+ pmulhrsw m4, m5
+ paddw m4, m3
+ mova [dstq+dsq*1+32*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w16:
+.v_w64:
+.v_w128: ; shared path: the block is processed as 16-pixel (32-byte) wide column strips
+ movifnidn t0d, org_w
+ add t0d, t0d ; t0d = width in bytes
+ mov r4, srcq ; r4 = source start of the current strip
+ lea r6d, [hq+t0*8-256] ; r6d = h + ((w/16 - 1) << 8): low byte = h, upper bits = strips left (assumes h fits in 8 bits)
+ mov r7, dstq ; r7 = destination start of the current strip (table base no longer needed)
+.v_w16_loop0:
+ movu m0, [srcq+ssq*0]
+.v_w16_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m5
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m5
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r4, 32 ; step to the next 16-pixel strip
+ add r7, 32
+ movzx hd, r6b ; reload h from the low byte of the packed counter
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8 ; one strip done
+ jg .v_w16_loop0
+ RET
+.hv: ; 2-D filter: horizontal pass per row, then vertical interpolation between consecutive filtered rows
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8 ; x86inc macro; m6/m7 are used from here on
+ shl mxyd, 11 ; my << 11, used as the pmulhw multiplier in the vertical step
+ vpbroadcastd m3, [pw_2] ; rounding term of the horizontal pass (before >> 2)
+ movd xm6, mxyd
+ vpbroadcastd m7, [pw_8192] ; final pmulhrsw factor: (x + 2) >> 2
+ add wq, r7
+ vpbroadcastw m6, xm6
+ test dword r8m, 0x800 ; bit 11 of bitdepth_max selects the 12 bpc variant
+ jnz .hv_12bpc
+ psllw m4, 2 ; otherwise scale both horizontal coefficients by 4 ...
+ psllw m5, 2
+ vpbroadcastd m7, [pw_2048] ; ... and make the final step (x + 8) >> 4 instead
+.hv_12bpc:
+ jmp wq
+.hv_w2:
+ vpbroadcastq xm1, [srcq+ssq*0] ; row 0 in both qwords
+ pmullw xm0, xm4, xm1
+ psrlq xm1, 16 ; right neighbours
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2 ; xm0 = horizontally filtered row 0
+.hv_w2_loop:
+ movq xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm2, [srcq+ssq*0]
+ pmullw xm1, xm4, xm2
+ psrlq xm2, 16
+ pmullw xm2, xm5
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2 ; 1 _ 2 _
+ shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _
+ mova xm0, xm1 ; keep the newest rows for the next iteration
+ psubw xm1, xm2
+ paddw xm1, xm1 ; 2*diff, so pmulhw by my << 11 yields (diff*my) >> 4
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7 ; final rounding shift
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ pmullw xm0, xm4, [srcq+ssq*0-8] ; load starts 8 bytes before the row so row 0 lands in the high qword (low qword is discarded by shufpd)
+ pmullw xm1, xm5, [srcq+ssq*0-6]
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ movq xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*0+2]
+ pmullw xm1, xm4
+ pmullw xm2, xm5
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2 ; 1 2
+ shufpd xm2, xm0, xm1, 0x01 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+ssq*0]
+ pmullw xm1, xm5, [srcq+ssq*0+2]
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+ vinserti128 m0, xm0, 1 ; copy filtered row 0 into the upper lane, where vperm2i128 picks it up
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, [srcq+ssq*0], 1
+ vinserti128 m2, [srcq+ssq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+.hv_w32:
+.hv_w64:
+.hv_w128: ; shared path: 16-pixel wide column strips, as in .v_w16
+%if UNIX64
+ lea r6d, [r8*2-32] ; r8 still holds the pixel width saved at function entry
+%else
+ mov r6d, wm
+ lea r6d, [r6*2-32]
+%endif
+ mov r4, srcq ; r4 = source start of the current strip
+ lea r6d, [hq+r6*8] ; r6d = h + ((w/16 - 1) << 8): low byte = h, upper bits = strips left
+ mov r7, dstq ; r7 = destination start of the current strip
+.hv_w16_loop0:
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2 ; m0 = horizontally filtered first row of the strip
+.hv_w16_loop:
+ pmullw m1, m4, [srcq+ssq*1]
+ pmullw m2, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2
+ psubw m2, m1, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ pmulhrsw m2, m7
+ mova [dstq+dsq*0], m2
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m1
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m1
+ pmulhrsw m2, m7
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r4, 32 ; step to the next 16-pixel strip
+ add r7, 32
+ movzx hd, r6b ; reload h from the packed counter
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ RET
+
+cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ; 16 bpc bilinear prep: writes intermediate words to tmp; path (.prep/.h/.v/.hv) chosen from mx/my
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep_avx2] ; base address that the 16-bit jump-table offsets are relative to
+%if UNIX64
+ DECLARE_REG_TMP 7
+ %define org_w r7d
+%else
+ DECLARE_REG_TMP 6
+ %define org_w r5m
+%endif
+ mov org_w, wd ; keep the pixel width; wd is replaced by its tzcnt below
+ tzcnt wd, wm ; table index = trailing-zero count of the width
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h ; mx != 0: horizontal or 2-D filtering
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v ; mx == 0, my != 0: vertical filtering only
+.prep: ; mx == my == 0: out = pixel * prep_mul - 8192
+ movzx wd, word [r6+wq*2+table_offset(prep,)] ; offset of .prep_w<N> from prep_avx2
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [r6-prep_avx2+pw_8192] ; m5 = 8192, subtracted from every scaled pixel
+ add wq, r6
+ shr r5d, 11 ; index of the dword entry in prep_mul
+ vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] ; m4 = multiplier (16 or 4)
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4: ; four 4-pixel rows packed into one register per iteration
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m0, m1, 0x30 ; row 2 into dwords 4-5
+ vpblendd m0, m2, 0xc0 ; row 3 into dwords 6-7
+ pmullw m0, m4
+ psubw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32 ; tmp is written contiguously
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8: ; four rows per iteration, two rows per register
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16: ; four rows per iteration, one row per register
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+ pmullw m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32: ; two rows per iteration, two registers per row
+ pmullw m0, m4, [srcq+strideq*0+32*0]
+ pmullw m1, m4, [srcq+strideq*0+32*1]
+ pmullw m2, m4, [srcq+strideq*1+32*0]
+ pmullw m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64: ; one row (four registers) per iteration
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m4, [srcq+32*2]
+ pmullw m3, m4, [srcq+32*3]
+ add srcq, strideq
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128: ; one row per iteration, handled as two batches of four registers
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m4, [srcq+32*2]
+ pmullw m3, m4, [srcq+32*3]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmullw m0, m4, [srcq+32*4]
+ pmullw m1, m4, [srcq+32*5]
+ pmullw m2, m4, [srcq+32*6]
+ pmullw m3, m4, [srcq+32*7]
+ add tmpq, 32*8 ; tmp is advanced early, so the second batch is stored at negative offsets
+ add srcq, strideq
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h: ; horizontal filter: out = (a*c0 + b*c1 - 32766) >> 2 (arithmetic), c0/c1 derived from 16-mx / mx
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5 ; m5 = mx in every word
+ vpbroadcastd m3, [pw_32766] ; bias subtracted before the shift
+ psubw m4, m5 ; m4 = 16 - mx in every word
+ test dword r7m, 0x800 ; bit 11 of bitdepth_max selects the 12 bpc variant
+ jnz .h_12bpc
+ psllw m4, 2 ; otherwise both coefficients are scaled by 4
+ psllw m5, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv ; my != 0 as well: 2-D filtering
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4: ; four rows per iteration
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*2], 1 ; m1 = rows 0 | 2
+ movu xm2, [srcq+strideq*1]
+ vinserti128 m2, [srcq+stride3q ], 1 ; m2 = rows 1 | 3
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq m0, m1, m2 ; left pixels: rows 0 1 | 2 3
+ psrldq m1, 2 ; rows 0/2 shifted by one pixel, in the low qwords
+ pslldq m2, 6 ; rows 1/3 shifted by one pixel, moved into the high qwords
+ pmullw m0, m4
+ vpblendd m1, m2, 0xcc ; right pixels: rows 0 1 | 2 3
+ pmullw m1, m5
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4
+ RET
+.h_w8: ; two rows per iteration, one row per 128-bit lane
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*0+2]
+ vinserti128 m1, [srcq+strideq*1+2], 1
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16: ; two full-width rows per iteration
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+.h_w64:
+.h_w128: ; shared path: each row is walked right-to-left in 32-pixel chunks
+ movifnidn t0d, org_w ; t0d = width in pixels
+.h_w32_loop0:
+ mov r3d, t0d ; r3 = pixels still to process in this row
+.h_w32_loop:
+ pmullw m0, m4, [srcq+r3*2-32*1]
+ pmullw m1, m5, [srcq+r3*2-32*1+2]
+ psubw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+r3*2-32*2]
+ pmullw m2, m5, [srcq+r3*2-32*2+2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+r3*2-32*1], m0
+ mova [tmpq+r3*2-32*2], m1
+ sub r3d, 32
+ jg .h_w32_loop
+ add srcq, strideq
+ lea tmpq, [tmpq+t0*2] ; advance tmp by one row (w words)
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v: ; vertical filter: out = (a*c0 + b*c1 - 32766) >> 2 (arithmetic), where b is the row below a
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ movd xm5, mxyd
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5 ; m5 = my in every word
+ vpbroadcastd m3, [pw_32766]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ psubw m4, m5 ; m4 = 16 - my in every word
+ test dword r7m, 0x800 ; bit 11 of bitdepth_max selects the 12 bpc variant
+ jnz .v_12bpc
+ psllw m4, 2 ; otherwise both coefficients are scaled by 4
+ psllw m5, 2
+.v_12bpc:
+ jmp wq
+.v_w4: ; four output rows per iteration; m0 carries the last loaded row across iterations
+ movq xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq xm1, [srcq+strideq*1]
+ vpblendd m2, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0xf0 ; 1 1 3 3
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m1, m2, 0x33 ; 0 1 2 3
+ vpblendd m0, m2, 0x0c ; 4 2 4 4
+ punpckhqdq m2, m1, m0 ; 1 2 3 4
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8: ; two output rows per iteration
+ movu xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m1, m0, m2, 0xf0 ; 0 1
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vpblendd m2, m0, 0xf0 ; 1 2
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16: ; two output rows per iteration; m0 is the upper row, m2 the row below it
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5, m2
+ psubw m0, m3
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m4
+ mova [tmpq+32*0], m1
+ pmullw m1, m5, m0
+ psubw m2, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128: ; shared path: the block is processed as 16-pixel (32-byte) wide column strips
+%if WIN64
+ PUSH r7 ; r7 is outside the 7 GPRs declared by cglobal, so it is saved by hand here
+%endif
+ movifnidn r7d, org_w
+ add r7d, r7d ; r7 = width in bytes = distance between tmp rows
+ mov r3, srcq ; r3 = source start of the current strip
+ lea r6d, [hq+r7*8-256] ; r6d = h + ((w/16 - 1) << 8): low byte = h, upper bits = strips left (assumes h fits in 8 bits)
+ mov r5, tmpq ; r5 = tmp start of the current strip
+.v_w32_loop0:
+ movu m0, [srcq+strideq*0]
+.v_w32_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5, m2
+ psubw m0, m3
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m4
+ mova [tmpq+r7*0], m1
+ pmullw m1, m5, m0
+ psubw m2, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+r7*1], m1
+ lea tmpq, [tmpq+r7*2]
+ sub hd, 2
+ jg .v_w32_loop
+ add r3, 32 ; step to the next 16-pixel strip
+ add r5, 32
+ movzx hd, r6b ; reload h from the low byte of the packed counter
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8 ; one strip done
+ jg .v_w32_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+.hv: ; 2-D filter: horizontal pass as in .h, then out = a + (((b - a)*my + 8) >> 4) between consecutive filtered rows
+ WIN64_SPILL_XMM 7 ; x86inc macro; m6 is used from here on
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11 ; my << 11, so pmulhrsw (x*c + 0x4000) >> 15 evaluates (x*my + 8) >> 4
+ movd xm6, mxyd
+ add wq, r6
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w4:
+ movu xm1, [srcq+strideq*0]
+%if WIN64
+ movaps [rsp+24], xmm7 ; m7 is used as scratch below; saved by hand (NOTE(review): presumably because WIN64_SPILL_XMM 7 does not cover xmm7 - confirm)
+%endif
+ pmullw xm0, xm4, xm1
+ psrldq xm1, 2 ; right neighbours of row 0
+ pmullw xm1, xm5
+ psubw xm0, xm3
+ paddw xm0, xm1
+ psraw xm0, 2
+ vpbroadcastq m0, xm0 ; filtered row 0 replicated to every qword
+.hv_w4_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+stride3q ], 1 ; m1 = rows 1 | 3
+ movu xm2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1 ; m2 = rows 2 | 4
+ punpcklqdq m7, m1, m2 ; left pixels: rows 1 2 | 3 4
+ psrldq m1, 2
+ pslldq m2, 6
+ pmullw m7, m4
+ vpblendd m1, m2, 0xcc ; right pixels: rows 1 2 | 3 4
+ pmullw m1, m5
+ psubw m7, m3
+ paddw m1, m7
+ psraw m1, 2 ; 1 2 3 4
+ vpblendd m0, m1, 0x3f
+ vpermq m2, m0, q2103 ; 0 1 2 3
+ mova m0, m1 ; keep the newest rows for the next iteration
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+%if WIN64
+ movaps xmm7, [rsp+24] ; restore the register saved at .hv_w4
+%endif
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+strideq*0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm3
+ paddw xm0, xm1
+ psraw xm0, 2
+ vinserti128 m0, xm0, 1 ; copy filtered row 0 into the upper lane, where vperm2i128 picks it up
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ movu xm2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m1, [srcq+strideq*0], 1
+ vinserti128 m2, [srcq+strideq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+.hv_w32:
+.hv_w64:
+.hv_w128: ; shared path: 16-pixel wide column strips, as in .v_w32
+%if WIN64
+ PUSH r7 ; r7 is outside the 7 GPRs declared by cglobal, so it is saved by hand here
+%endif
+ movifnidn r7d, org_w
+ add r7d, r7d ; r7 = width in bytes = distance between tmp rows
+ mov r3, srcq ; r3 = source start of the current strip
+ lea r6d, [hq+r7*8-256] ; r6d = h + ((w/16 - 1) << 8): low byte = h, upper bits = strips left
+ mov r5, tmpq ; r5 = tmp start of the current strip
+.hv_w16_loop0:
+ pmullw m0, m4, [srcq]
+ pmullw m1, m5, [srcq+2]
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2 ; m0 = horizontally filtered first row of the strip
+.hv_w16_loop:
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+r7*0], m2
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ psubw m0, m3
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+r7*1], m2
+ lea tmpq, [tmpq+r7*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r3, 32 ; step to the next 16-pixel strip
+ add r5, 32
+ movzx hd, r6b ; reload h from the packed counter
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
+%macro FN 4 ; prefix, type, type_h, type_v - emits the entry point <prefix>_<type>_16bpc
+cglobal %1_%2_16bpc
+ mov t0d, FILTER_%3 ; t0d = horizontal filter selector, consumed by the shared <prefix>_16bpc function
+%ifidn %3, %4
+ mov t1d, t0d ; same filter type in both directions
+%else
+ mov t1d, FILTER_%4 ; t1d = vertical filter selector
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ sub srcq, 2
+ mova xm2, [subpel_h_shuf2]
+ vpbroadcastd xm3, [base+subpel_filters+mxq*8+2]
+ pmovsxbw xm3, xm3
+.h_w2_loop:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm2
+ pshufb xm1, xm2
+ pmaddwd xm0, xm3
+ pmaddwd xm1, xm3
+ phaddd xm0, xm1
+ paddd xm0, xm4
+ psrad xm0, 6
+ packusdw xm0, xm0
+ pminsw xm0, xm5
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xm3, [base+subpel_filters+mxq*8]
+ WIN64_SPILL_XMM 8
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ pshufd xm3, xm3, q2211
+ vpbroadcastq m2, xm3
+ vpermq m3, m3, q1111
+.h_w4_loop:
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ paddd m0, m4
+ paddd m0, m1
+ psrad m0, 6
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1
+ pminsw xm0, xm5
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m5, r8m
+ shr r7d, 11
+ vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
+ cmp wd, 4
+ je .h_w4
+ jl .h_w2
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 13
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 8
+ jg .h_w16
+.h_w8:
+%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m4
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m4
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3
+ psrad m%1, 6
+ psrad m%2, 6
+ packusdw m%1, m%2
+ pminsw m%1, m5
+%endmacro
+ movu xm0, [srcq+ssq*0+ 0]
+ vinserti128 m0, [srcq+ssq*1+ 0], 1
+ movu xm2, [srcq+ssq*0+16]
+ vinserti128 m2, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ shufpd m1, m0, m2, 0x05
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6*2-32]
+ movu m1, [srcq+r6*2-24]
+ movu m2, [srcq+r6*2-16]
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+r6*2-32], m0
+ sub r6d, 16
+ jg .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastd m6, [pd_32]
+ vpbroadcastw m7, r8m
+ lea r6, [ssq*3]
+ sub srcq, r6
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+ movd xm3, [srcq+ssq*0]
+ vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm0, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklwd xm3, xm1 ; 45 56
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xm4, [srcq+ssq*0]
+ pmaddwd xm5, xm8, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm9 ; a1 b1
+ paddd xm5, xm6
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm10 ; a2 b2
+ paddd xm5, xm3
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklwd xm3, xm4 ; 67 78
+ pmaddwd xm4, xm11, xm3 ; a3 b3
+ paddd xm5, xm4
+ psrad xm5, 6
+ packusdw xm5, xm5
+ pminsw xm5, xm7
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m0, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ vpbroadcastq m4, [srcq+r6 ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpbroadcastq m5, [srcq+ssq*1]
+ vpblendd m1, m0, 0x30
+ vpblendd m0, m2, 0x30
+ punpcklwd m1, m0 ; 01 12
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m3, 0x30
+ punpcklwd m2, m4 ; 23 34
+ vpblendd m3, m5, 0x30
+ vpblendd m5, m0, 0x30
+ punpcklwd m3, m5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*0]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m6
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m0, 0x30
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 6
+ vextracti128 xm4, m5, 1
+ packusdw xm5, xm4
+ pminsw xm5, xm7
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ shl wd, 5
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [hq+wq-256]
+.v_w8_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ vbroadcasti128 m0, [srcq+r6 ]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ lea srcq, [srcq+ssq*4]
+ vbroadcasti128 m1, [srcq+ssq*0]
+ vbroadcasti128 m2, [srcq+ssq*1]
+ vbroadcasti128 m3, [srcq+ssq*2]
+ add srcq, r6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklwd m3, m6, m0 ; 23
+ punpckhwd m6, m0 ; 56
+.v_w8_loop:
+ vbroadcasti128 m14, [srcq+ssq*0]
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ vbroadcasti128 m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ paddd m13, m6
+ shufpd m6, m0, m14, 0x0d
+ shufpd m0, m14, m5, 0x0c
+ punpcklwd m5, m6, m0 ; 67
+ punpckhwd m6, m0 ; 78
+ pmaddwd m14, m11, m5 ; a3
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 5
+ psrad m13, 5
+ packusdw m12, m13
+ pxor m13, m13
+ pavgw m12, m13
+ pminsw m12, m7
+ vpermq m12, m12, q3120
+ mova [dstq+dsq*0], xm12
+ vextracti128 [dstq+dsq*1], m12, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ add r7, 16
+ add r8, 16
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .v_w8_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vpbroadcastw m15, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ vpbroadcastd m6, [pd_512]
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_10bit
+ psraw m7, 2
+ psllw m1, 2
+.hv_10bit:
+ pshufd m11, m1, q0000
+ pshufd m12, m1, q1111
+ pshufd m13, m1, q2222
+ pshufd m14, m1, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m9, [subpel_h_shuf2]
+ vbroadcasti128 m1, [srcq+r6 ] ; 3 3
+ movu xm3, [srcq+ssq*2]
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m3, [srcq+ssq*0], 1 ; 2 4
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 5
+ vinserti128 m2, [srcq+ssq*2], 1 ; 1 6
+ add srcq, r6
+ pshufb m1, m9
+ pshufb m3, m9
+ pshufb m0, m9
+ pshufb m2, m9
+ pmaddwd m1, m7
+ pmaddwd m3, m7
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ phaddd m1, m3
+ phaddd m0, m2
+ paddd m1, m6
+ paddd m0, m6
+ psrad m1, 10
+ psrad m0, 10
+ packssdw m1, m0 ; 3 2 0 1
+ vextracti128 xm0, m1, 1 ; 3 4 5 6
+ pshufd xm2, xm1, q1301 ; 2 3 1 2
+ pshufd xm3, xm0, q2121 ; 4 5 4 5
+ punpckhwd xm1, xm2 ; 01 12
+ punpcklwd xm2, xm0 ; 23 34
+ punpckhwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm4, xm9
+ pshufb xm5, xm9
+ pmaddwd xm4, xm7
+ pmaddwd xm5, xm7
+ phaddd xm4, xm5
+ pmaddwd xm5, xm11, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm12 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm13 ; a2 b2
+ paddd xm5, xm3
+ paddd xm4, xm6
+ psrad xm4, 10
+ packssdw xm4, xm4
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm14, xm3 ; a3 b3
+ paddd xm5, xm6
+ paddd xm5, xm4
+ psrad xm5, 10
+ packusdw xm5, xm5
+ pminsw xm5, xm15
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ vbroadcasti128 m9, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ pshufd m8, m7, q1111
+ pshufd m7, m7, q0000
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*1], 1 ; 0 1
+ vbroadcasti128 m0, [srcq+r6 ]
+ vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 4
+ movu xm3, [srcq+ssq*1]
+ vinserti128 m3, [srcq+ssq*2], 1 ; 5 6
+ add srcq, r6
+ pshufb m4, m1, m9
+ pshufb m1, m10
+ pmaddwd m4, m7
+ pmaddwd m1, m8
+ pshufb m5, m2, m9
+ pshufb m2, m10
+ pmaddwd m5, m7
+ pmaddwd m2, m8
+ paddd m4, m6
+ paddd m1, m4
+ pshufb m4, m0, m9
+ pshufb m0, m10
+ pmaddwd m4, m7
+ pmaddwd m0, m8
+ paddd m5, m6
+ paddd m2, m5
+ pshufb m5, m3, m9
+ pshufb m3, m10
+ pmaddwd m5, m7
+ pmaddwd m3, m8
+ paddd m4, m6
+ paddd m4, m0
+ paddd m5, m6
+ paddd m5, m3
+ vperm2i128 m0, m1, m2, 0x21
+ psrld m1, 10
+ psrld m2, 10
+ vperm2i128 m3, m4, m5, 0x21
+ pslld m4, 6
+ pslld m5, 6
+ pblendw m2, m4, 0xaa ; 23 34
+ pslld m0, 6
+ pblendw m1, m0, 0xaa ; 01 12
+ psrld m3, 10
+ pblendw m3, m5, 0xaa ; 45 56
+ psrad m0, m5, 16
+.hv_w4_loop:
+ movu xm4, [srcq+ssq*0]
+ vinserti128 m4, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m6
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ pshufb m3, m4, m9
+ pshufb m4, m10
+ pmaddwd m3, m7
+ pmaddwd m4, m8
+ paddd m3, m6
+ paddd m4, m3
+ psrad m4, 10
+ packssdw m0, m4 ; _ 7 6 8
+ vpermq m3, m0, q1122 ; _ 6 _ 7
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m4, m5
+ psrad m4, 10
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, xm15
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+myq*8]
+ shl wd, 5
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ pxor m0, m0
+ punpcklbw m0, m2
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [hq+wq-256]
+ test dword r8m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+ psllw xm1, 2
+.hv_w8_10bit:
+ pshufd m11, m0, q0000
+ pshufd m12, m0, q1111
+ pshufd m13, m0, q2222
+ pshufd m14, m0, q3333
+%if WIN64
+ %define v_mul (rsp+stack_offset+40) ; r4m
+%else
+ %define v_mul (rsp-24) ; red zone
+%endif
+ mova [v_mul], xm1
+.hv_w8_loop0:
+%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 -- horizontal pass of the 2-D filter; signed word result is left in m%1; m2/m3 are scratch, m%2/m%3 are clobbered; expects m8/m9 = shuffles, m10 = bias, m11-m14 = coefficient pairs
+ pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
+ pmaddwd m3, m12, m2 ; taps 2,3 for outputs a-d
+ pmaddwd m%1, m11 ; taps 0,1 for outputs a-d
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m3, m10 ; add the bias held in m10
+ paddd m%1, m3
+ pmaddwd m3, m14, m%2 ; taps 6,7 for outputs a-d
+ paddd m%1, m3
+ pmaddwd m3, m13, m2 ; taps 4,5 for outputs a-d
+ pshufb m%3, m9 ; a b b c c d d e
+ pmaddwd m2, m11 ; taps 0,1 for outputs e-h
+ paddd m%1, m3 ; m%1 = complete sums for outputs a-d
+ pmaddwd m3, m12, m%2 ; taps 2,3 for outputs e-h
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m14 ; taps 6,7 for outputs e-h
+ pmaddwd m%2, m13 ; taps 4,5 for outputs e-h
+ paddd m2, m10 ; same bias for the e-h half
+ paddd m2, m3
+ paddd m%3, m2
+ paddd m%2, m%3 ; m%2 = complete sums for outputs e-h
+ psrad m%1, 10
+ psrad m%2, 10
+ packssdw m%1, m%2 ; dwords -> words with signed saturation (a-d then e-h in each 128-bit lane)
+%endmacro
+ movu xm4, [srcq+r6 *1+ 0]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ movu xm6, [srcq+r6 *1+ 8]
+ vbroadcasti128 m9, [subpel_h_shufB]
+ movu xm0, [srcq+r6 *1+16]
+ vpbroadcastd m10, [pd_512]
+ movu xm5, [srcq+ssq*0+ 0]
+ vinserti128 m5, [srcq+ssq*4+ 0], 1
+ movu xm1, [srcq+ssq*0+16]
+ vinserti128 m1, [srcq+ssq*4+16], 1
+ shufpd m7, m5, m1, 0x05
+ INIT_XMM avx2
+ PUT_8TAP_HV_H 4, 6, 0 ; 3
+ INIT_YMM avx2
+ PUT_8TAP_HV_H 5, 7, 1 ; 0 4
+ movu xm0, [srcq+ssq*2+ 0]
+ vinserti128 m0, [srcq+r6 *2+ 0], 1
+ movu xm1, [srcq+ssq*2+16]
+ vinserti128 m1, [srcq+r6 *2+16], 1
+ shufpd m7, m0, m1, 0x05
+ PUT_8TAP_HV_H 0, 7, 1 ; 2 6
+ movu xm6, [srcq+ssq*1+ 0]
+ movu xm1, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m6, [srcq+ssq*1+ 0], 1
+ vinserti128 m1, [srcq+ssq*1+16], 1
+ add srcq, r6
+ shufpd m7, m6, m1, 0x05
+ PUT_8TAP_HV_H 6, 7, 1 ; 1 5
+ vpermq m4, m4, q1100
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ vpermq m7, m0, q3120
+ punpcklwd m3, m7, m4 ; 23
+ punpckhwd m4, m5 ; 34
+ punpcklwd m1, m5, m6 ; 01
+ punpckhwd m5, m6 ; 45
+ punpcklwd m2, m6, m7 ; 12
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vpbroadcastd m9, [v_mul+4*0]
+ vpbroadcastd m7, [v_mul+4*1]
+ vpbroadcastd m10, [v_mul+4*2]
+ pmaddwd m8, m9, m1 ; a0
+ pmaddwd m9, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m7 ; a1
+ pmaddwd m4, m7 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ movu xm5, [srcq+ssq*0]
+ vinserti128 m5, [srcq+ssq*1], 1
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ movu xm6, [srcq+ssq*0+16]
+ vinserti128 m6, [srcq+ssq*1+16], 1
+ vextracti128 [dstq], m0, 1
+ pshufb m0, m5, m7 ; 01
+ pshufb m5, m10 ; 23
+ pmaddwd m0, m11
+ pmaddwd m5, m12
+ paddd m0, m5
+ pshufb m5, m6, m7 ; 89
+ pshufb m6, m10 ; ab
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ paddd m6, m5
+ movu xm5, [srcq+ssq*0+8]
+ vinserti128 m5, [srcq+ssq*1+8], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m7, m5, m7
+ pshufb m5, m10
+ pmaddwd m10, m13, m7
+ pmaddwd m7, m11
+ paddd m0, m10
+ vpbroadcastd m10, [pd_512]
+ paddd m6, m7
+ pmaddwd m7, m14, m5
+ pmaddwd m5, m12
+ paddd m0, m7
+ paddd m5, m6
+ vbroadcasti128 m6, [dstq]
+ paddd m8, m10
+ paddd m9, m10
+ paddd m0, m10
+ paddd m5, m10
+ vpbroadcastd m10, [v_mul+4*3]
+ psrad m0, 10
+ psrad m5, 10
+ packssdw m0, m5
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m10, m5 ; a3
+ pmaddwd m10, m6 ; b3
+ paddd m7, m8
+ paddd m9, m10
+ psrad m7, 10
+ psrad m9, 10
+ packusdw m7, m9
+ pminsw m7, m15
+ vpermq m7, m7, q3120
+ mova [dstq+dsq*0], xm7
+ vextracti128 [dstq+dsq*1], m7, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r7, 16
+ add r8, 16
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4 ; WIN64 choice of temporaries; presumably makes t0/t1 aliases of r6/r4 (macro is defined outside this view)
+%else
+DECLARE_REG_TMP 6, 7 ; every other ABI; presumably t0/t1 = r6/r7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP ; one line per filter-type combination: name suffix plus two type constants; FN is defined outside this view (presumably it emits the entry stub that sets t0/t1)
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR ; NOTE(review): listed last, directly before prep_8tap_16bpc -- presumably so this stub can fall through; confirm in FN
+
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my ; entry: takes the .h / .v / .hv path when mx or my has a fraction, otherwise jumps through the prep width table
+%define base r7-prep_avx2
+ imul mxd, mxm, 0x010101 ; replicate mx into bytes 0, 1 and 2
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101 ; replicate my into bytes 0, 1 and 2
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx2] ; r7 = anchor for the base-relative addressing defined above
+ movifnidn hd, hm
+ test mxd, 0xf00 ; bits 8-11 carry mx per the layout noted above (assuming t0 leaves that byte clear)
+ jnz .h
+ test myd, 0xf00 ; same check for my
+ jnz .v
+ tzcnt wd, wd ; trailing-zero count of w = table index (log2(w) when w is a power of two)
+ mov r6d, r7m ; bitdepth_max
+ movzx wd, word [r7+wq*2+table_offset(prep,)] ; 16-bit table entry for this width
+ vpbroadcastd m5, [r7-prep_avx2+pw_8192]
+ shr r6d, 11 ; bitdepth_max >> 11 selects the prep_mul entry
+ add wq, r7 ; the table entry is relative to r7
+ vpbroadcastd m4, [base+prep_mul+r6*4]
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7 ; NOTE(review): presumably undoes a push made by the WIN64 prologue before leaving via jmp -- confirm against cglobal
+%endif
+ jmp wq ; continue in the code selected from the table, with m4, m5 and r6 set up above
+.h_w4: ; horizontal-only, w == 4: only taps 2-5 of the filter are used; four rows per iteration
+ movzx mxd, mxb ; low byte = 4tap_h filter index
+ sub srcq, 2 ; back up one 16-bit pixel
+ pmovsxbw xm0, [base+subpel_filters+mxq*8] ; 8 coefficient bytes sign-extended to words
+ vbroadcasti128 m3, [subpel_h_shufA]
+ vbroadcasti128 m4, [subpel_h_shufB]
+ WIN64_SPILL_XMM 8
+ pshufd xm0, xm0, q2211 ; low qword = taps 2,3 twice, high qword = taps 4,5 twice
+ test dword r7m, 0x800 ; bit 11 of bitdepth_max
+ jnz .h_w4_12bpc
+ psllw xm0, 2 ; bit clear: scale the coefficients by 4
+.h_w4_12bpc:
+ vpbroadcastq m6, xm0 ; m6 = taps 2,3 in every dword
+ vpermq m7, m0, q1111 ; m7 = taps 4,5 in every dword
+.h_w4_loop:
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*2], 1 ; m1 = rows 0 | 2
+ movu xm2, [srcq+strideq*1]
+ vinserti128 m2, [srcq+r6 ], 1 ; m2 = rows 1 | 3 (r6 = stride*3, set at .h)
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m4 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ paddd m0, m5 ; m5 = prep_8tap_1d_rnd, loaded at .h
+ paddd m0, m1
+ pshufb m1, m2, m3
+ pshufb m2, m4
+ pmaddwd m1, m6
+ pmaddwd m2, m7
+ paddd m1, m5
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1 ; low lane = rows 0,1, high lane = rows 2,3 -> rows 0-3 in memory order
+ mova [tmpq], m0
+ add tmpq, 32 ; 4 rows * 4 pixels * 2 bytes
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h: ; reached when mx has a fraction
+ test myd, 0xf00
+ jnz .hv ; my has a fraction too -> 2-D path
+ vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
+ lea r6, [strideq*3]
+ cmp wd, 4
+ je .h_w4
+ shr mxd, 16 ; bits 16 and up = 8tap_h filter index
+ sub srcq, 6 ; back up three 16-bit pixels
+ vpbroadcastq m0, [base+subpel_filters+mxq*8] ; the 8 coefficient bytes, in every qword
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800 ; bit 11 of bitdepth_max
+ jnz .h_12bpc
+ psllw m0, 2 ; bit clear: scale the coefficients by 4
+.h_12bpc:
+ pshufd m8, m0, q0000 ; taps 0,1 in every dword
+ pshufd m9, m0, q1111 ; taps 2,3
+ pshufd m10, m0, q2222 ; taps 4,5
+ pshufd m11, m0, q3333 ; taps 6,7
+ cmp wd, 8
+ jg .h_w16
+.h_w8: ; w == 8: two rows per iteration, one row per 128-bit lane
+%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] -- 8-tap horizontal filter; signed word result is left in m%1, m%2-m%5 are clobbered; uses m5-m11 as set up above
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m5 ; add prep_8tap_1d_rnd
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5 ; m%1 = complete sums for outputs a-d
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m5 ; add prep_8tap_1d_rnd to the e-h half
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3 ; m%2 = complete sums for outputs e-h
+ psrad m%1, 4
+ psrad m%2, 4
+ packssdw m%1, m%2 ; dwords -> words with signed saturation (a-d then e-h in each lane)
+%endmacro
+ movu xm0, [srcq+strideq*0+ 0]
+ vinserti128 m0, [srcq+strideq*1+ 0], 1 ; m0 = bytes 0-15 of rows 0 | 1
+ movu xm2, [srcq+strideq*0+16]
+ vinserti128 m2, [srcq+strideq*1+16], 1 ; m2 = bytes 16-31 of rows 0 | 1
+ lea srcq, [srcq+strideq*2]
+ shufpd m1, m0, m2, 0x05 ; m1 = bytes 8-23: high qword of m0 followed by low qword of m2
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq], m0
+ add tmpq, 32 ; 2 rows * 8 pixels * 2 bytes
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16: ; w > 8: 16 pixels per inner iteration, one row at a time
+ add wd, wd ; w in bytes
+.h_w16_loop0:
+ mov r6d, wd ; byte offset just past the end of the row
+.h_w16_loop:
+ movu m0, [srcq+r6-32]
+ movu m1, [srcq+r6-24]
+ movu m2, [srcq+r6-16]
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq+r6-32], m0
+ sub r6d, 32 ; step back 16 pixels: the row is walked from its end to its start
+ jg .h_w16_loop
+ add srcq, strideq
+ add tmpq, wq ; tmp rows are contiguous: advance by w*2 bytes
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v: ; vertical-only path (mx has no fraction, my has one)
+ movzx mxd, myb ; low byte = 4tap_v filter index
+ shr myd, 16 ; bits 16 and up = 8tap_v filter index
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4: use the 4tap_v index instead
+ vpbroadcastq m0, [base+subpel_filters+myq*8] ; the 8 coefficient bytes, in every qword
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastd m7, [prep_8tap_1d_rnd]
+ lea r6, [strideq*3]
+ sub srcq, r6 ; start three rows above the block
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800 ; bit 11 of bitdepth_max
+ jnz .v_12bpc
+ psllw m0, 2 ; bit clear: scale the coefficients by 4
+.v_12bpc:
+ pshufd m8, m0, q0000 ; taps 0,1 in every dword
+ pshufd m9, m0, q1111 ; taps 2,3
+ pshufd m10, m0, q2222 ; taps 4,5
+ pshufd m11, m0, q3333 ; taps 6,7
+ cmp wd, 4
+ jg .v_w8
+.v_w4: ; w <= 4: two output rows per iteration, one per 128-bit lane
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m0, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m4, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpblendd m1, m0, 0x30 ; rows 0 | 1 (low qword of each lane)
+ vpblendd m0, m2, 0x30 ; rows 1 | 2
+ punpcklwd m1, m0 ; 01 12
+ vpbroadcastq m0, [srcq+strideq*2] ; row 6, kept in m0 for the loop
+ add srcq, r6
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m3, 0x30
+ punpcklwd m2, m4 ; 23 34
+ vpblendd m3, m5, 0x30
+ vpblendd m5, m0, 0x30
+ punpcklwd m3, m5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq m4, [srcq+strideq*0] ; next row (7 on the first pass)
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2 ; slide the row-pair window down for the next iteration
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m7 ; add prep_8tap_1d_rnd
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*1] ; row after that (8 on the first pass)
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 4
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4 ; row a words followed by row b words
+ mova [tmpq], xm5
+ add tmpq, 16 ; 2 rows * 4 pixels * 2 bytes
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8: ; w >= 8: processed in columns of 8 pixels, two output rows per inner iteration
+%if WIN64
+ push r8
+%endif
+ mov r8d, wd ; r8 = w; one tmp row is r8*2 bytes
+ shl wd, 5
+ mov r5, srcq ; r5 / r7 = start of the current column in src / tmp
+ mov r7, tmpq
+ lea wd, [hq+wq-256] ; low byte = h, upper bits = columns still to do after this one (assuming h < 256)
+.v_w8_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m0, [srcq+r6 ]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m3, [srcq+strideq*2]
+ add srcq, r6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklwd m3, m6, m0 ; 23
+ punpckhwd m6, m0 ; 56
+.v_w8_loop:
+ vbroadcasti128 m14, [srcq+strideq*0]
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3 ; slide the row-pair window down for the next iteration
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m7 ; add prep_8tap_1d_rnd
+ paddd m13, m7
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ paddd m13, m6
+ shufpd m6, m0, m14, 0x0d
+ shufpd m0, m14, m5, 0x0c
+ punpcklwd m5, m6, m0 ; 67
+ punpckhwd m6, m0 ; 78
+ pmaddwd m14, m11, m5 ; a3
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 4
+ psrad m13, 4
+ packssdw m12, m13
+ vpermq m12, m12, q3120 ; regroup qwords so each lane holds one complete output row
+ mova [tmpq+r8*0], xm12
+ vextracti128 [tmpq+r8*2], m12, 1 ; second row, one tmp row (r8*2 bytes) further on
+ lea tmpq, [tmpq+r8*4] ; advance two tmp rows
+ sub hd, 2
+ jg .v_w8_loop
+ add r5, 16 ; next column: 8 pixels * 2 bytes
+ add r7, 16
+ movzx hd, wb ; reload h from the low byte of the packed counter
+ mov srcq, r5
+ mov tmpq, r7
+ sub wd, 1<<8 ; one column done
+ jg .v_w8_loop0
+%if WIN64
+ pop r8 ; matches the push at .v_w8
+%endif
+ RET
+.hv: ; both mx and my have a fraction: horizontal pass followed by vertical pass
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vpbroadcastd m15, [prep_8tap_2d_rnd] ; bias added in both passes
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb ; low byte = 4tap_h filter index
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2] ; +2: only coefficient bytes 2-5 are loaded
+ movzx mxd, myb ; low byte = 4tap_v filter index
+ shr myd, 16 ; bits 16 and up = 8tap_v filter index
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4: use the 4tap_v index instead
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 2 ; back up one 16-bit pixel
+ sub srcq, r6 ; and three rows
+ pxor m7, m7
+ punpcklbw m7, m0 ; horizontal coefficients in the high byte of each word (coef << 8)
+ punpcklbw m1, m1
+ psraw m7, 4 ; -> coef << 4, sign preserved
+ psraw m1, 8 ; vertical coefficients sign-extended to words
+ test dword r7m, 0x800 ; bit 11 of bitdepth_max
+ jz .hv_w4_10bit
+ psraw m7, 2 ; bit set: -> coef << 2
+.hv_w4_10bit:
+ pshufd m11, m1, q0000 ; vertical taps 0,1 in every dword
+ pshufd m12, m1, q1111 ; vertical taps 2,3
+ pshufd m13, m1, q2222 ; vertical taps 4,5
+ pshufd m14, m1, q3333 ; vertical taps 6,7
+.hv_w4:
+ vbroadcasti128 m9, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ pshufd m8, m7, q1111 ; second pair of horizontal taps
+ pshufd m7, m7, q0000 ; first pair of horizontal taps
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*1], 1 ; 0 1
+ vbroadcasti128 m0, [srcq+r6 ]
+ vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 4
+ movu xm3, [srcq+strideq*1]
+ vinserti128 m3, [srcq+strideq*2], 1 ; 5 6
+ add srcq, r6
+ pshufb m4, m1, m9
+ pshufb m1, m10
+ pmaddwd m4, m7
+ pmaddwd m1, m8
+ pshufb m5, m2, m9
+ pshufb m2, m10
+ pmaddwd m5, m7
+ pmaddwd m2, m8
+ paddd m4, m15
+ paddd m1, m4 ; m1 = biased horizontal sums, rows 0 | 1
+ pshufb m4, m0, m9
+ pshufb m0, m10
+ pmaddwd m4, m7
+ pmaddwd m0, m8
+ paddd m5, m15
+ paddd m2, m5 ; m2 = rows 2 | 3
+ pshufb m5, m3, m9
+ pshufb m3, m10
+ pmaddwd m5, m7
+ pmaddwd m3, m8
+ paddd m4, m15
+ paddd m4, m0 ; m4 = rows 3 | 4
+ paddd m5, m15
+ paddd m5, m3 ; m5 = rows 5 | 6
+ vperm2i128 m0, m1, m2, 0x21 ; rows 1 | 2
+ psrld m1, 6 ; >> 6 leaves the result in the low word of each dword
+ psrld m2, 6
+ vperm2i128 m3, m4, m5, 0x21 ; rows 4 | 5
+ pslld m4, 10 ; << 10 puts the same 16 result bits into the high word of each dword
+ pslld m5, 10
+ pblendw m2, m4, 0xaa ; 23 34
+ pslld m0, 10
+ pblendw m1, m0, 0xaa ; 01 12
+ psrld m3, 6
+ pblendw m3, m5, 0xaa ; 45 56
+ psrad m0, m5, 16 ; rows 5 | 6 as sign-extended dwords, consumed by packssdw in the loop
+.hv_w4_loop:
+ movu xm4, [srcq+strideq*0]
+ vinserti128 m4, [srcq+strideq*1], 1 ; two new source rows (7 | 8 on the first pass)
+ lea srcq, [srcq+strideq*2]
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2 ; slide the row-pair window down for the next iteration
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m15 ; bias for the vertical pass
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ pshufb m3, m4, m9
+ pshufb m4, m10
+ pmaddwd m3, m7
+ pmaddwd m4, m8
+ paddd m3, m15 ; bias for the horizontal pass of the new rows
+ paddd m4, m3
+ psrad m4, 6
+ packssdw m0, m4 ; _ 7 6 8
+ vpermq m3, m0, q1122 ; _ 6 _ 7
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4 ; keep the new rows (dwords) for the next iteration
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m4, m5
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5 ; row a words followed by row b words
+ mova [tmpq], xm4
+ add tmpq, 16 ; 2 rows * 4 pixels * 2 bytes
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8: ; 2-D filter for w >= 8: columns of 8 pixels, two output rows per inner iteration
+ shr mxd, 16 ; bits 16 and up = 8tap_h filter index
+ vpbroadcastq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb ; low byte = 4tap_v filter index
+ shr myd, 16 ; bits 16 and up = 8tap_v filter index
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4: use the 4tap_v index instead
+ pmovsxbw xm1, [base+subpel_filters+myq*8] ; vertical coefficients sign-extended to words
+%if WIN64
+ PUSH r8
+%endif
+ mov r8d, wd ; r8 = w; one tmp row is r8*2 bytes
+ shl wd, 5
+ lea r6, [strideq*3]
+ sub srcq, 6 ; back up three 16-bit pixels
+ sub srcq, r6 ; and three rows
+ mov r5, srcq ; r5 / r7 = start of the current column in src / tmp
+ mov r7, tmpq
+ lea wd, [hq+wq-256] ; low byte = h, upper bits = columns still to do after this one (assuming h < 256)
+ pxor m0, m0
+ punpcklbw m0, m2 ; horizontal coefficients in the high byte of each word (coef << 8)
+ mova [v_mul], xm1 ; park the vertical coefficients in the v_mul stack slot %define'd earlier in the file
+ psraw m0, 4 ; -> coef << 4, sign preserved
+ test dword r7m, 0x800 ; bit 11 of bitdepth_max
+ jz .hv_w8_10bit
+ psraw m0, 2 ; bit set: -> coef << 2
+.hv_w8_10bit:
+ pshufd m11, m0, q0000 ; horizontal taps 0,1 in every dword
+ pshufd m12, m0, q1111 ; horizontal taps 2,3
+ pshufd m13, m0, q2222 ; horizontal taps 4,5
+ pshufd m14, m0, q3333 ; horizontal taps 6,7
+.hv_w8_loop0:
+%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 -- horizontal pass; signed word result is left in m%1; m2/m3 are scratch, m%2/m%3 are clobbered; expects m8/m9 = shuffles, m15 = bias, m11-m14 = coefficient pairs
+ pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
+ pmaddwd m3, m12, m2
+ pmaddwd m%1, m11
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m3, m15
+ paddd m%1, m3
+ pmaddwd m3, m14, m%2
+ paddd m%1, m3
+ pmaddwd m3, m13, m2
+ pshufb m%3, m9 ; a b b c c d d e
+ pmaddwd m2, m11
+ paddd m%1, m3 ; m%1 = complete sums for the first four outputs of each lane
+ pmaddwd m3, m12, m%2
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m14
+ pmaddwd m%2, m13
+ paddd m2, m15
+ paddd m2, m3
+ paddd m2, m%3
+ paddd m2, m%2 ; m2 = complete sums for the last four outputs of each lane
+ psrad m%1, 6
+ psrad m2, 6
+ packssdw m%1, m2 ; dwords -> words with signed saturation
+%endmacro
+ movu xm4, [srcq+r6 + 0]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ movu xm6, [srcq+r6 + 8]
+ vbroadcasti128 m9, [subpel_h_shufB]
+ movu xm0, [srcq+r6 +16]
+ movu xm5, [srcq+strideq*0+ 0]
+ vinserti128 m5, [srcq+strideq*4+ 0], 1
+ movu xm1, [srcq+strideq*0+16]
+ vinserti128 m1, [srcq+strideq*4+16], 1
+ shufpd m7, m5, m1, 0x05 ; bytes 8-23: high qword of m5 followed by low qword of m1
+ INIT_XMM avx2
+ PREP_8TAP_HV_H 4, 6, 0 ; 3
+ INIT_YMM avx2
+ PREP_8TAP_HV_H 5, 7, 1 ; 0 4
+ movu xm0, [srcq+strideq*2+ 0]
+ vinserti128 m0, [srcq+r6 *2+ 0], 1
+ movu xm1, [srcq+strideq*2+16]
+ vinserti128 m1, [srcq+r6 *2+16], 1
+ shufpd m7, m0, m1, 0x05
+ PREP_8TAP_HV_H 0, 7, 1 ; 2 6
+ movu xm6, [srcq+strideq*1+ 0]
+ movu xm1, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m6, [srcq+strideq*1+ 0], 1
+ vinserti128 m1, [srcq+strideq*1+16], 1
+ add srcq, r6 ; srcq now points at row 7
+ shufpd m7, m6, m1, 0x05
+ PREP_8TAP_HV_H 6, 7, 1 ; 1 5
+ vpermq m4, m4, q1100
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ vpermq m7, m0, q3120
+ punpcklwd m3, m7, m4 ; 23
+ punpckhwd m4, m5 ; 34
+ punpcklwd m1, m5, m6 ; 01
+ punpckhwd m5, m6 ; 45
+ punpcklwd m2, m6, m7 ; 12
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vpbroadcastd m9, [v_mul+4*0] ; vertical taps 0,1
+ vpbroadcastd m7, [v_mul+4*1] ; vertical taps 2,3
+ vpbroadcastd m10, [v_mul+4*2] ; vertical taps 4,5
+ pmaddwd m8, m9, m1 ; a0
+ pmaddwd m9, m2 ; b0
+ mova m1, m3 ; slide the row-pair window down for the next iteration
+ mova m2, m4
+ pmaddwd m3, m7 ; a1
+ pmaddwd m4, m7 ; b1
+ paddd m8, m15 ; bias for the vertical pass
+ paddd m9, m15
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ movu xm5, [srcq+strideq*0]
+ vinserti128 m5, [srcq+strideq*1], 1 ; bytes 0-15 of the two new source rows
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ movu xm6, [srcq+strideq*0+16]
+ vinserti128 m6, [srcq+strideq*1+16], 1 ; bytes 16-31 of the two new source rows
+ vextracti128 [tmpq], m0, 1 ; spill the upper lane of m0 (most recent filtered row) into the output slot that is rewritten below
+ pshufb m0, m5, m7 ; 01
+ pshufb m5, m10 ; 23
+ pmaddwd m0, m11
+ pmaddwd m5, m12
+ paddd m0, m15 ; bias for the horizontal pass
+ paddd m0, m5
+ pshufb m5, m6, m7 ; 89
+ pshufb m6, m10 ; ab
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ paddd m5, m15
+ paddd m6, m5
+ movu xm5, [srcq+strideq*0+8]
+ vinserti128 m5, [srcq+strideq*1+8], 1 ; bytes 8-23 of the two new source rows
+ lea srcq, [srcq+strideq*2]
+ pshufb m7, m5, m7
+ pshufb m5, m10
+ pmaddwd m10, m13, m7
+ pmaddwd m7, m11
+ paddd m0, m10
+ paddd m6, m7
+ pmaddwd m7, m14, m5
+ pmaddwd m5, m12
+ paddd m0, m7
+ paddd m5, m6
+ vbroadcasti128 m6, [tmpq] ; reload the spilled row into both lanes
+ vpbroadcastd m10, [v_mul+4*3] ; vertical taps 6,7
+ psrad m0, 6
+ psrad m5, 6
+ packssdw m0, m5
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m10, m5 ; a3
+ pmaddwd m10, m6 ; b3
+ paddd m7, m8
+ paddd m9, m10
+ psrad m7, 6
+ psrad m9, 6
+ packssdw m7, m9
+ vpermq m7, m7, q3120 ; regroup qwords so each lane holds one complete output row
+ mova [tmpq+r8*0], xm7 ; also overwrites the row spilled to [tmpq] above
+ vextracti128 [tmpq+r8*2], m7, 1 ; second row, one tmp row (r8*2 bytes) further on
+ lea tmpq, [tmpq+r8*4] ; advance two tmp rows
+ sub hd, 2
+ jg .hv_w8_loop
+ add r5, 16 ; next column: 8 pixels * 2 bytes
+ add r7, 16
+ movzx hd, wb ; reload h from the low byte of the packed counter
+ mov srcq, r5
+ mov tmpq, r7
+ sub wd, 1<<8 ; one column done
+ jg .hv_w8_loop0
+%if WIN64
+ POP r8 ; matches the PUSH at .hv_w8
+%endif
+ RET
+
+%macro movifprep 2 ; dst, src -- emits "mov dst, src" only when isprep is non-zero; expands to nothing otherwise
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2 ; to, from (register numbers) -- makes the names r<to>, r<to>q and r<to>d expand to what r<from>, r<from>q and r<from>d currently expand to
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 ; prep only: renames r1..r14 so that each rN refers to what r(N-1) referred to; no effect when isprep is 0
+ %if isprep
+ %xdefine r14_save r14 ; remember the current r14 so the default mapping can be restored
+ %assign %%i 14
+ %rep 14 ; i = 14 down to 1, so every rN picks up the not-yet-changed r(N-1)
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 ; prep only: inverse of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV; no effect when isprep is 0
+ %if isprep
+ %assign %%i 1
+ %rep 13 ; i = 1 up to 13, so every rN picks up the not-yet-changed r(N+1)
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save ; r14 comes back from the saved copy
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged -- emits RET under the default register names, then (when %1 is non-zero, the default) re-applies the shifted prep mapping for the code that follows
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd -- filters two consecutive source rows at eight register-supplied offsets; packed word result is left in m%1, m%2-m%8 are clobbered, srcq advances by two rows
+ movu xm%1, [srcq+ r4*2] ; offsets are in 16-bit pixels, hence the *2; presumably one x position per output column
+ movu xm%2, [srcq+ r6*2]
+ movu xm%3, [srcq+ r7*2]
+ movu xm%4, [srcq+ r9*2]
+ vinserti128 m%1, [srcq+r10*2], 1
+ vinserti128 m%2, [srcq+r11*2], 1
+ vinserti128 m%3, [srcq+r13*2], 1
+ vinserti128 m%4, [srcq+ rX*2], 1 ; rX is %define'd by the enclosing code (r1 for put, r14 for prep)
+ add srcq, ssq
+ movu xm%5, [srcq+ r4*2] ; same eight offsets on the next row
+ movu xm%6, [srcq+ r6*2]
+ movu xm%7, [srcq+ r7*2]
+ movu xm%8, [srcq+ r9*2]
+ vinserti128 m%5, [srcq+r10*2], 1
+ vinserti128 m%6, [srcq+r11*2], 1
+ vinserti128 m%7, [srcq+r13*2], 1
+ vinserti128 m%8, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m%1, m12 ; m12-m15 are multiplied against the loads pairwise; presumably the per-position coefficient sets
+ pmaddwd m%2, m13
+ pmaddwd m%3, m14
+ pmaddwd m%4, m15
+ pmaddwd m%5, m12
+ pmaddwd m%6, m13
+ pmaddwd m%7, m14
+ pmaddwd m%8, m15
+ phaddd m%1, m%2
+ %if %9
+ mova m10, [rsp+0x00] ; load_hrnd set: reload the addend used below from the stack
+ %endif
+ phaddd m%3, m%4
+ phaddd m%5, m%6
+ phaddd m%7, m%8
+ phaddd m%1, m%3 ; m%1 = one complete dword sum per load of the first row
+ phaddd m%5, m%7 ; m%5 = the same for the second row
+ paddd m%1, m10
+ paddd m%5, m10
+ psrad m%1, xm11 ; shift count is taken from xm11
+ psrad m%5, xm11
+ packssdw m%1, m%5 ; per lane: four words of the first row, then four words of the second row
+%endmacro
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isput 1
+ %assign isprep 0
+cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %xdefine base_reg r12
+ mov r7d, pxmaxm
+%else
+ %assign isput 0
+ %assign isprep 1
+cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %define tmp_stridem qword [rsp+0xd0]
+ %xdefine base_reg r11
+%endif
+ lea base_reg, [%1_8tap_scaled_16bpc_avx2]
+%define base base_reg-%1_8tap_scaled_16bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm10, mxd
+ vpbroadcastd m10, xm10
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+ mov r6d, pxmaxm
+%else
+ vpbroadcastd m10, mxm
+ %if isput
+ vpbroadcastw m11, pxmaxm
+ %else
+ mov r6d, pxmaxm
+ %endif
+%endif
+ mov dyd, dym
+%if isput
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %define dsm [rsp+0x98]
+ %define rX r1
+ %define rXd r1d
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+0x98]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ shr r7d, 11
+ vpbroadcastd m6, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4]
+ movd xm7, [base+s_8tap_h_sh+r7*4]
+%if isput
+ vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4]
+ pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2
+%else
+ vpbroadcastd m13, [base+pd_m524256]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0,1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*2]
+ movu xm3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*0], 1 ; 0 4
+ vinserti128 m1, [srcq+ssq*1], 1 ; 1 5
+ vinserti128 m2, [srcq+ssq*2], 1 ; 2 6
+ vinserti128 m3, [srcq+ss3q ], 1 ; 3 7
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m10}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ phaddd m0, m1
+ phaddd m2, m3
+ paddd m0, m12
+ paddd m2, m12
+ psrad m0, xm7
+ psrad m2, xm7
+ packssdw m0, m2 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm14, r6q
+ pmovsxbw xm14, xm14
+ pshufd xm8, xm14, q0000
+ pshufd xm9, xm14, q1111
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pshufd xm8, xm14, q2222
+ pshufd xm14, xm14, q3333
+ paddd xm5, xm6
+ pmaddwd xm6, xm2, xm8
+ pmaddwd xm8, xm4, xm14
+ psrldq xm9, xm7, 8
+ paddd xm5, xm6
+ paddd xm5, xm13
+ paddd xm5, xm8
+ psrad xm5, xm9
+ packusdw xm5, xm5
+ pminsw xm5, xm11
+ movd [dstq], xm5
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movu xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm10
+ pmaddwd xm5, xm15
+ phaddd xm5, xm5
+ paddd xm5, xm12
+ psrad xm5, xm7
+ packssdw xm5, xm5
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movu xm6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm10
+ pshufb xm6, xm10
+ pmaddwd xm5, xm15
+ pmaddwd xm6, xm15
+ phaddd xm5, xm6
+ paddd xm5, xm12
+ psrad xm5, xm7
+ packssdw xm5, xm5 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ mova [rsp+0x00], m12
+%if isput
+ mova [rsp+0x20], xm13
+%else
+ SWAP m11, m13
+%endif
+ mova [rsp+0x30], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m0, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ pcmpeqd m6, m9
+ punpckldq m10, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pblendvb m14, m2, m10
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ movu xm1, [srcq+r4 ]
+ movu xm3, [srcq+r6 ]
+ movu xm2, [srcq+r11 ]
+ movu xm4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ vinserti128 m1, [srcq+r4 ], 1
+ vinserti128 m3, [srcq+r6 ], 1
+ vinserti128 m2, [srcq+r11 ], 1
+ vinserti128 m4, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m0
+ paddb m13, m0
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m14}, m7, m9, m8, m10
+ REPX {pshufb x, m13}, m1, m2, m3, m4
+ REPX {pmaddwd x, m15}, m1, m2, m3, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x30]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1 4 5
+ packssdw m8, m10 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm10, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm10 ; 67
+ mova [rsp+0x40], xm7
+ mova [rsp+0x50], xm8
+ mova [rsp+0x60], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r11d, 64 << 24
+ mov r13d, myd
+ shr r13d, 6
+ lea r13d, [t1+r13]
+ cmovnz r11q, [base+subpel_filters+r13*8]
+ movq xm9, r11q
+ pmovsxbw xm9, xm9
+ pshufd xm7, xm9, q0000
+ pshufd xm8, xm9, q1111
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pshufd xm7, xm9, q2222
+ pshufd xm9, xm9, q3333
+ pmaddwd xm6, xm2, xm7
+ pmaddwd xm8, xm3, xm9
+%if isput
+ mova xm7, [rsp+0x20]
+ movd xm9, [rsp+0x38]
+%else
+ SWAP m7, m11
+%endif
+ paddd xm4, xm5
+ paddd xm6, xm8
+ paddd xm4, xm6
+ paddd xm4, xm7
+%if isput
+ psrad xm4, xm9
+ packusdw xm4, xm4
+ pminuw xm4, xm11
+ movq [dstq], xm4
+ add dstq, dsq
+%else
+ SWAP m11, m7
+ psrad xm4, 6
+ packssdw xm4, xm4
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ mova xm8, [rsp+0x00]
+ movd xm9, [rsp+0x30]
+ movu xm4, [srcq]
+ movu xm5, [srcq+r4]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x40]
+ mova [rsp+0x40], xm1
+ mova xm1, [rsp+0x50]
+ mova [rsp+0x50], xm2
+ mova xm2, [rsp+0x60]
+ mova [rsp+0x60], xm3
+ pshufb xm4, xm12
+ pshufb xm5, xm13
+ pmaddwd xm4, xm14
+ pmaddwd xm5, xm15
+ phaddd xm4, xm5
+ paddd xm4, xm8
+ psrad xm4, xm9
+ packssdw xm4, xm4
+ punpcklwd xm3, xm10, xm4
+ mova xm10, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm6, [srcq+ssq*1]
+ movu xm7, [srcq+r6]
+ movu m0, [rsp+0x50]
+ pshufb xm4, xm12
+ pshufb xm6, xm12
+ pshufb xm5, xm13
+ pshufb xm7, xm13
+ pmaddwd xm4, xm14
+ pmaddwd xm6, xm14
+ pmaddwd xm5, xm15
+ pmaddwd xm7, xm15
+ mova [rsp+0x40], m0
+ phaddd xm4, xm5
+ phaddd xm6, xm7
+ paddd xm4, xm8
+ paddd xm6, xm8
+ psrad xm4, xm9
+ psrad xm6, xm9
+ packssdw xm4, xm6
+ punpcklwd xm9, xm10, xm4
+ mova [rsp+0x60], xm9
+ psrldq xm10, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm10
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+ SWAP m10, m13
+%if isprep
+ SWAP m13, m11
+%endif
+.w8:
+ mov dword [rsp+0x80], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+0x80], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+0x80], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+0x80], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+0x80], 16
+ movifprep tmp_stridem, 256
+.w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+%if isput
+ movifnidn dsm, dsq
+ mova [rsp+0xb0], xm7
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ shr t0d, 16
+ sub srcq, 6
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0x84], t0d
+ mov [rsp+0x88], srcq
+ mov [rsp+0x90], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+0x80]
+ jz .ret
+ add qword [rsp+0x90], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x40]
+ vpbroadcastd m15, [rsp+0x84]
+ pxor m9, m9
+ mov srcq, [rsp+0x88]
+ mov r0q, [rsp+0x90] ; dstq / tmpq
+.hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x40], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq [rsp+0xa0], xm1
+ movq [rsp+0xa8], xm7
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x60], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x60]
+ vbroadcasti128 m9, [base+subpel_s_shuf8]
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m9 ; 01a 01b
+ pshufb m1, m9 ; 23a 23b
+ pshufb m2, m9 ; 45a 45b
+ pshufb m3, m9 ; 67a 67b
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm9, r6q
+ punpcklqdq xm9, xm9
+ pmovsxbw m9, xm9
+ pshufd m8, m9, q0000
+ pshufd m7, m9, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m7
+ pshufd m8, m9, q2222
+ pshufd m9, m9, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m9
+%if isput
+ psrldq xm8, xm11, 8
+%endif
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, xm8
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xb0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+0x60], myd
+ mov r4d, [rsp+0xa0]
+ mov r6d, [rsp+0xa4]
+ mov r7d, [rsp+0xa8]
+ mov r9d, [rsp+0xac]
+ jz .skip_line
+ vbroadcasti128 m9, [base+wswap]
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ mov myd, [rsp+0x60]
+ mov dyd, dym
+ pshufb m0, m9
+ pshufb m1, m9
+ pshufb m2, m9
+ pshufb m3, m9
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m4, m6
+ paddd m4, m10
+ psrad m4, xm11
+ pslld m4, 16
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1
+ vbroadcasti128 m9, [base+subpel_s_shuf8]
+ mov myd, [rsp+0x60]
+ mov dyd, dym
+ pshufb m3, m9
+ jmp .vloop
+ SWAP m1, m12, m10
+ SWAP m7, m11
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0-1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*2]
+ movu xm3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*1], 1
+ vinserti128 m2, [srcq+ssq*2], 1
+ add srcq, ss3q
+ movq xm6, r4q
+ pmovsxbw xm6, xm6
+ pshufd xm8, xm6, q0000
+ pshufd xm9, xm6, q1111
+ pshufd xm14, xm6, q2222
+ pshufd xm6, xm6, q3333
+ REPX {pshufb x, m10}, m0, m1, m2
+ pshufb xm3, xm10
+ REPX {pmaddwd x, m15}, m0, m1, m2
+ pmaddwd xm3, xm15
+ phaddd m0, m1
+ phaddd m2, m3
+ paddd m0, m12
+ paddd m2, m12
+ psrad m0, xm7
+ psrad m2, xm7
+ packssdw m0, m2
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movu xm1, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm1, xm10
+ pshufb xm5, xm10
+ pmaddwd xm1, xm15
+ pmaddwd xm5, xm15
+ phaddd xm1, xm5
+ pmaddwd xm5, xm3, xm8
+ mova xm3, xm0
+ pmaddwd xm0, xm9
+ paddd xm1, xm12
+ psrad xm1, xm7
+ packssdw xm1, xm1
+ paddd xm5, xm0
+ mova xm0, xm2
+ pmaddwd xm2, xm14
+ paddd xm5, xm2
+ palignr xm2, xm1, xm4, 12
+ punpcklwd xm2, xm1 ; 67 78
+ pmaddwd xm4, xm2, xm6
+ paddd xm5, xm13
+ paddd xm5, xm4
+ mova xm4, xm1
+ psrldq xm1, xm7, 8
+ psrad xm5, xm1
+ packusdw xm5, xm5
+ pminsw xm5, xm11
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+%if isput
+ mova [rsp+0x50], xm11
+%endif
+ mova [rsp+0x00], m12
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m4, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ pcmpeqd m6, m9
+ punpckldq m10, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pblendvb m14, m2, m10
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*2]
+ lea r11, [r4+ssq*1]
+ lea r13, [r4+ss3q ]
+ movu xm0, [srcq+ssq*0]
+ movu xm7, [srcq+r4 ]
+ movu xm1, [srcq+ssq*2]
+ movu xm8, [srcq+r6 ]
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
+ vinserti128 m7, [srcq+r11 ], 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
+ vinserti128 m8, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ movu xm2, [srcq+ssq*0]
+ movu xm9, [srcq+r4 ]
+ movu xm3, [srcq+ssq*2] ; 6 _
+ movu xm10, [srcq+r6 ]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ vinserti128 m9, [srcq+r11 ], 1
+ lea srcq, [srcq+ss3q ]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m4
+ paddb m13, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x40]
+ pshufb m0, m12
+ pshufb m1, m12
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pshufb m7, m13
+ pshufb m8, m13
+ pmaddwd m7, m15
+ pmaddwd m8, m15
+ pshufb m2, m12
+ pshufb xm3, xm12
+ pmaddwd m2, m14
+ pmaddwd xm3, xm14
+ pshufb m9, m13
+ pshufb xm10, xm13
+ pmaddwd m9, m15
+ pmaddwd xm10, xm15
+ phaddd m0, m7
+ phaddd m1, m8
+ phaddd m2, m9
+ phaddd xm3, xm10
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ paddd xm3, xm5
+ psrad m0, xm6
+ psrad m1, xm6
+ psrad m2, xm6
+ psrad xm3, xm6
+ vperm2i128 m4, m0, m1, 0x21 ; 1 2
+ vperm2i128 m5, m1, m2, 0x21 ; 3 4
+ vperm2i128 m6, m2, m3, 0x21 ; 5 6
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ pslld m4, 16
+ pslld m5, 16
+ pslld m6, 16
+ pblendw m0, m4, 0xaa ; 01 12
+ pblendw m1, m5, 0xaa ; 23 34
+ pblendw m2, m6, 0xaa ; 45 56
+ movq xm10, r13q
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ movu xm6, [srcq+r4 ]
+ vinserti128 m11, [srcq+ssq*1], 1
+ vinserti128 m6, [srcq+r11 ], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pshufb m11, m12
+ pshufb m6, m13
+ pmaddwd m11, m14
+ pmaddwd m6, m15
+ paddd m4, [rsp+0x20]
+ phaddd m11, m6
+ pmaddwd m6, m2, m9
+ paddd m11, [rsp+0x00]
+ psrad m11, [rsp+0x40]
+ mova m0, m1
+ mova m1, m2
+ paddd m5, m6
+ paddd m4, m5
+ vinserti128 m2, m3, xm11, 1
+ pslld m3, m11, 16
+ pblendw m2, m3, 0xaa ; 67 78
+ pmaddwd m5, m2, m10
+ vextracti128 xm3, m11, 1
+ paddd m4, m5
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0x50]
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m10, m13
+.dy1_w8:
+ mov dword [rsp+0xa0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+0xa0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+0xa0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+0xa0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+0xa0], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+ mov myd, mym
+%if isput
+ %define dsm [rsp+0xb8]
+ movifnidn dsm, dsq
+ mova [rsp+0xc0], xm7
+%else
+ %if UNIX64
+ %define hm [rsp+0xb8]
+ %endif
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm11
+ shr t0d, 16
+ sub srcq, 6
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0xa4], t0d
+ mov [rsp+0xa8], srcq
+ mov [rsp+0xb0], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+0xa0]
+ jz .ret
+ add qword [rsp+0xb0], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x60]
+ vpbroadcastd m15, [rsp+0xa4]
+ pxor m9, m9
+ mov srcq, [rsp+0xa8]
+ mov r0q, [rsp+0xb0] ; dstq / tmpq
+ mova m10, [rsp+0x00]
+ mova xm11, [rsp+0x40]
+.dy1_hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x60], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x80], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x80]
+ vbroadcasti128 m7, [base+subpel_s_shuf8]
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m10, [rsp+0x58]
+ vpbroadcastd m11, [rsp+0x5c]
+ pshufb m0, m7 ; 01a 01b
+ pshufb m1, m7 ; 23a 23b
+ pshufb m2, m7 ; 45a 45b
+ pshufb m3, m7 ; 67a 67b
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xc0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ vbroadcasti128 m7, [base+wswap]
+ pshufb m0, m7
+ pshufb m1, m7
+ pshufb m2, m7
+ pshufb m3, m7
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m4, m6
+ paddd m4, [rsp+0x00]
+ psrad m4, [rsp+0x40]
+ pslld m4, 16
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
+ SWAP m1, m12, m10
+ SWAP m7, m11
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0-1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*2]
+ movu xm2, [srcq+ssq*4]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m0, m10
+ pshufb m1, m10
+ pshufb m2, m10
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+ pmaddwd m2, m15
+ movq xm6, r4q
+ pmovsxbw xm6, xm6
+ phaddd m0, m1
+ phaddd m1, m2
+ paddd m0, m12
+ paddd m1, m12
+ psrad m0, xm7
+ psrad m1, xm7
+ packssdw m0, m1 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m0, 1
+ pshufd xm8, xm6, q0000
+ pshufd xm9, xm6, q1111
+ pshufd xm14, xm6, q2222
+ pshufd xm6, xm6, q3333
+ punpcklwd xm2, xm0, xm1 ; 01 23
+ punpckhwd xm1, xm0, xm1 ; 23 45
+.dy2_w2_loop:
+ movu xm3, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m3, [srcq+ssq*1], 1 ; 6 7
+ vinserti128 m5, [srcq+ss3q ], 1 ; 8 9
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm2, xm8
+ pmaddwd xm1, xm9
+ pshufb m3, m10
+ pshufb m5, m10
+ pmaddwd m3, m15
+ pmaddwd m5, m15
+ phaddd m3, m5
+ paddd xm4, xm1
+ paddd m3, m12
+ psrad m3, xm7
+ packssdw m3, m3
+ pshufd m3, m3, q2100
+ palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9
+ vextracti128 xm1, m0, 1
+ punpcklwd xm2, xm0, xm1 ; 45 67
+ punpckhwd xm1, xm0, xm1 ; 67 89
+ pmaddwd xm3, xm2, xm14
+ pmaddwd xm5, xm1, xm6
+ paddd xm4, xm13
+ paddd xm4, xm3
+ psrldq xm3, xm7, 8
+ paddd xm4, xm5
+ psrad xm4, xm3
+ packusdw xm4, xm4
+ pminsw xm4, xm11
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+%if isput
+ mova [rsp+0x50], xm11
+%endif
+ mova [rsp+0x00], m12
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m4, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ pcmpeqd m6, m9
+ punpckldq m11, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ movq xm10, r13q
+ pblendvb m14, m2, m11
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu xm0, [srcq+ssq*0]
+ movu xm7, [srcq+r4 ]
+ movu xm1, [srcq+ssq*1]
+ movu xm8, [srcq+r6 ]
+ vinserti128 m0, [srcq+ssq*2], 1 ; 0 2
+ vinserti128 m7, [srcq+r11 ], 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 1 3
+ vinserti128 m8, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ movu xm2, [srcq+ssq*0]
+ movu xm9, [srcq+r4 ]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ vinserti128 m9, [srcq+r6 ], 1
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m4
+ paddb m13, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x40]
+ pshufb m0, m12
+ pshufb m1, m12
+ pshufb m2, m12
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pshufb m7, m13
+ pshufb m8, m13
+ pshufb m9, m13
+ pmaddwd m7, m15
+ pmaddwd m8, m15
+ pmaddwd m9, m15
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ phaddd m0, m7
+ phaddd m1, m8
+ phaddd m2, m9
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ psrad m0, xm6
+ psrad m1, xm6
+ psrad m2, xm6
+ vperm2i128 m3, m0, m2, 0x21 ; 2 4
+ vperm2i128 m2, m1, 0x13 ; 3 5
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ packssdw m0, m3 ; 0 2 2 4
+ packssdw m1, m2 ; 1 3 3 5
+ punpckhwd m2, m0, m1 ; 23 45
+ punpcklwd m0, m1 ; 01 23
+.dy2_w4_loop:
+ movu xm1, [srcq+ssq*0]
+ movu xm6, [srcq+r4 ]
+ movu xm3, [srcq+ssq*1]
+ movu xm11, [srcq+r6 ]
+ vinserti128 m1, [srcq+ssq*2], 1 ; 6 8
+ vinserti128 m6, [srcq+r11 ], 1
+ vinserti128 m3, [srcq+ss3q ], 1 ; 7 9
+ vinserti128 m11, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m2, m8
+ pshufb m1, m12
+ pshufb m3, m12
+ pmaddwd m1, m14
+ pmaddwd m3, m14
+ mova m0, [rsp+0x00]
+ pshufb m6, m13
+ pshufb m11, m13
+ pmaddwd m6, m15
+ pmaddwd m11, m15
+ paddd m4, m5
+ movd xm5, [rsp+0x40]
+ phaddd m1, m6
+ phaddd m3, m11
+ paddd m1, m0
+ paddd m3, m0
+ psrad m1, xm5
+ psrad m3, xm5
+ pslld m3, 16
+ pblendw m1, m3, 0xaa ; 67 89
+ vperm2i128 m0, m2, m1, 0x21 ; 45 67
+ paddd m4, [rsp+0x20]
+ mova m2, m1
+ pmaddwd m5, m0, m9
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0x50]
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m10, m13
+.dy2_w8:
+ mov dword [rsp+0xa0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+0xa0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+0xa0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+0xa0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+0xa0], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+ mov myd, mym
+%if isput
+ movifnidn dsm, dsq
+ mova [rsp+0xc0], xm7
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm11
+ shr t0d, 16
+ sub srcq, 6
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0xa4], t0d
+ mov [rsp+0xa8], srcq
+ mov [rsp+0xb0], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+0xa0]
+ jz .ret
+ add qword [rsp+0xb0], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x60]
+ vpbroadcastd m15, [rsp+0xa4]
+ pxor m9, m9
+ mov srcq, [rsp+0xa8]
+ mov r0q, [rsp+0xb0] ; dstq / tmpq
+ mova m10, [rsp+0x00]
+ mova xm11, [rsp+0x40]
+.dy2_hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x60], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x80], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x80]
+ vbroadcasti128 m7, [base+subpel_s_shuf8]
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m10, [rsp+0x58]
+ vpbroadcastd m11, [rsp+0x5c]
+ pshufb m0, m7 ; 01a 01b
+ pshufb m1, m7 ; 23a 23b
+ pshufb m2, m7 ; 45a 45b
+ pshufb m3, m7 ; 67a 67b
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xc0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movu xm3, [srcq+ r4*2]
+ movu xm4, [srcq+ r6*2]
+ movu xm5, [srcq+ r7*2]
+ movu xm6, [srcq+ r9*2]
+ vinserti128 m3, [srcq+r10*2], 1
+ vinserti128 m4, [srcq+r11*2], 1
+ vinserti128 m5, [srcq+r13*2], 1
+ vinserti128 m6, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m3, m12
+ pmaddwd m4, m13
+ pmaddwd m5, m14
+ pmaddwd m6, m15
+ phaddd m3, m4
+ phaddd m5, m6
+ phaddd m3, m5
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ mova m5, [rsp+0x00]
+ movd xm7, [rsp+0x40]
+ phaddd m4, m6
+ paddd m3, m5
+ paddd m4, m5
+ psrad m3, xm7
+ psrad m4, xm7
+ pslld m4, 16
+ pblendw m3, m4, 0xaa
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isput
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1 ; %1 = name prefix; this file expands the macro with put and with prep
+cglobal %1_bilin_scaled_16bpc ; entry point <prefix>_bilin_scaled_16bpc: only loads t0/t1, then reuses the 8-tap scaled code
+ mov t0d, (5*15 << 16) | 5*15 ; 5*15 in both 16-bit halves: the scaled code reads t0b on its w2/w4 paths and t0d>>16 on the wider paths, adding it to the mx-derived subpel_filters index
+ mov t1d, t0d ; t1 is added to the my-derived subpel_filters index; NOTE(review): 5*15 presumably selects the bilinear rows of subpel_filters - confirm against the table
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) ; jmp rather than call: <prefix>_8tap_scaled_16bpc returns straight to our caller
+%endmacro
+
+%if WIN64 ; the temporaries used by the put code differ per ABI
+DECLARE_REG_TMP 6, 5 ; NOTE(review): presumably binds t0/t1 to r6/r5 (DECLARE_REG_TMP is defined outside this chunk) - confirm
+%else
+DECLARE_REG_TMP 6, 8 ; non-WIN64: presumably t0/t1 = r6/r8
+%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+BILIN_SCALED_FN put ; emits put_bilin_scaled_16bpc, which jumps to put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP ; one FN expansion per filter pair: name suffix, then two filter-type names
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH ; NOTE(review): FN is defined outside this chunk; presumably each expansion emits an entry point that sets t0/t1 for the pair - confirm
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR ; last entry, directly before the shared body; keep this ordering (presumably relied on by FN - confirm)
+MC_8TAP_SCALED put ; expand the shared scaled-MC macro with %1 = put (its %if isput paths write pixels to dstq)
+
+%if WIN64 ; the prep variant uses a different pair of temporaries than put
+DECLARE_REG_TMP 5, 4 ; NOTE(review): presumably binds t0/t1 to r5/r4 (DECLARE_REG_TMP is defined outside this chunk) - confirm
+%else
+DECLARE_REG_TMP 6, 7 ; non-WIN64: presumably t0/t1 = r6/r7
+%endif
+
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+BILIN_SCALED_FN prep ; emits prep_bilin_scaled_16bpc, which jumps to prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP ; one FN expansion per filter pair: name suffix, then two filter-type names
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH ; NOTE(review): FN is defined outside this chunk; presumably each expansion emits an entry point that sets t0/t1 for the pair - confirm
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR ; last entry, directly before the shared body; keep this ordering (presumably relied on by FN - confirm)
+MC_8TAP_SCALED prep ; expand the shared scaled-MC macro with %1 = prep (its non-isput paths store 16-bit intermediates to tmpq)
+
+%macro WARP_V 5 ; dst, 01, 23, 45, 67 - register numbers: %1 = output sum, %2-%5 = inputs multiplied by tap pairs 0/1, 2/3, 4/5, 6/7; clobbers m0, m8, m9, tmp1, tmp2 and advances my
+ lea tmp1d, [myq+deltaq*4] ; tmp1 = my + 4*delta (position e)
+ lea tmp2d, [myq+deltaq*1] ; tmp2 = my + 1*delta (position b)
+ shr myd, 10 ; position >> 10 = index of an 8-byte entry in the filter table
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8] ; low lane = 8 coefficient bytes for a
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4] ; tmp1 = my + 5*delta (position f)
+ lea myd, [tmp2q+deltaq*1] ; my = original my + 2*delta (position c)
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4] ; tmp1 = original my + 6*delta (position g)
+ lea tmp2d, [myq+deltaq*1] ; tmp2 = original my + 3*delta (position d)
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4] ; tmp1 = original my + 7*delta (position h)
+ lea myd, [tmp2q+gammaq] ; my += gamma (computed from tmp2, i.e. entry my + 3*delta + gamma; NOTE(review): gammaq presumably pre-adjusted by the caller - confirm)
+ punpcklwd m8, m0 ; interleave 16-bit tap pairs of a/e with b/f
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0 ; interleave 16-bit tap pairs of c/g with d/h
+ punpckldq m9, m8, m0 ; taps 0-3 of a b c d (low lane) / e f g h (high lane)
+ punpckhdq m0, m8, m0 ; taps 4-7 of the same columns
+ punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 (m11 supplies the low bytes; .main zeroes m11)
+ punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
+ pmaddwd m%2, m8 ; input 01 * taps 0/1 (overwrites m%2, which is reloaded from m%3 below)
+ pmaddwd m9, m%3 ; input 23 * taps 2/3
+ punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
+ punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
+ pmaddwd m8, m%4 ; input 45 * taps 4/5
+ pmaddwd m0, m%5 ; input 67 * taps 6/7
+ paddd m9, m%2 ; taps 0-3 partial sum
+ mova m%2, m%3 ; slide the input window: 01 <- 23
+ paddd m0, m8 ; taps 4-7 partial sum
+ mova m%3, m%4 ; 23 <- 45
+ mova m%4, m%5 ; 45 <- 67 (m%5 itself is left unchanged for the caller to refill)
+ paddd m%1, m0, m9 ; dst = full 8-tap sum as 32-bit values
+%endmacro
+
+cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts ; warp variant that stores signed 16-bit rows to tmp; reuses .main/.main2 of warp_affine_8x8_16bpc
+ mov r6d, r7m ; same stack slot the 8x8 variant broadcasts as pixel_max
+ lea r9, [$$] ; r9 is the register named filterq in the shared .main code
+ shr r6d, 11 ; table index = value >> 11
+ vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4] ; per-dword left shift applied at the end of .h
+ vpbroadcastd m14, [warp8x8t_rnd] ; rounding constant added before the >> 15 below
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main ; setup + first two output rows in m7 and m0
+ jmp .start
+.loop:
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2 ; next two output rows
+ lea tmpq, [tmpq+tsq*4] ; advance two rows (one row is tsq*2 bytes, see the stores below)
+.start:
+ paddd m7, m14
+ paddd m0, m14
+ psrad m7, 15
+ psrad m0, 15
+ packssdw m7, m0 ; signed saturation to 16 bits
+ vpermq m7, m7, q3120 ; undo the per-lane interleave of the pack: one row per 128-bit half
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d ; r4d is set to 4 inside .main: 4 iterations of 2 rows
+ jg .loop
+.end:
+ RET
+
+cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
+ alpha, beta, filter, tmp1, delta, \
+ my, gamma ; affine warp of one 8x8 block; output rows are clamped to [0, pixel_max]
+ mov r6d, r7m
+ lea filterq, [$$] ; section base, so the constants below are addressed relative to one register
+ shr r6d, 11 ; table index = pixel_max >> 11
+ vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4] ; per-dword left shift applied at the end of .h
+ vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4] ; pmulhrsw multiplier applied before the final clamp
+ vpbroadcastw m15, r7m ; pixel_max
+ call .main ; setup + first two output rows in m7 and m0
+ jmp .start
+.loop:
+ call .main2 ; next two output rows
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m7, 16 ; keep the upper 16 bits of every dword
+ psrad m0, 16
+ packusdw m7, m0 ; unsigned saturation: negative values become 0
+ pmulhrsw m7, m14
+ pminsw m7, m15 ; clamp to pixel_max
+ vpermq m7, m7, q3120 ; undo the per-lane interleave of the pack: one row per 128-bit half
+ mova [dstq+dsq*0], xm7
+ vextracti128 [dstq+dsq*1], m7, 1
+ dec r4d ; r4d is set to 4 inside .main: 4 iterations of 2 rows
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main: ; one-time setup plus the first seven horizontally filtered rows; falls through into .main2
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+%endif
+ movsx alphad, word [abcdq+2*0] ; abcd[0], sign-extended
+ movsx betad, word [abcdq+2*1] ; abcd[1], sign-extended
+ vpbroadcastd m12, [pd_32768] ; rounding constant added at the end of .h
+ pxor m11, m11 ; zero; interleaved below the filter bytes so each tap becomes (coef << 8)
+ add filterq, mc_warp_filter-$$ ; filterq now points at the warp filter table
+ lea tmp1q, [ssq*3]
+ add mxd, 512+(64<<10) ; +512 rounds the later >> 10, +(64<<10) offsets the resulting table index by 64
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3
+ sub betad, tmp2d ; beta -= alpha*3
+ mov myd, r7m ; shifted by one slot because of the call (see the note at the top of .main)
+ call .h ; row 0
+ psrld m1, m0, 16 ; move the row into the low word of each dword
+ call .h ; row 1
+ pblendw m1, m0, 0xaa ; 01
+ psrld m2, m0, 16
+ call .h ; row 2
+ pblendw m2, m0, 0xaa ; 12
+ psrld m3, m0, 16
+ call .h ; row 3
+ pblendw m3, m0, 0xaa ; 23
+ psrld m4, m0, 16
+ call .h ; row 4
+ pblendw m4, m0, 0xaa ; 34
+ psrld m5, m0, 16
+ call .h ; row 5
+ pblendw m5, m0, 0xaa ; 45
+ psrld m6, m0, 16
+ call .h ; row 6
+ pblendw m6, m0, 0xaa ; 56
+ movsx deltad, word [abcdq+2*2] ; abcd[2], sign-extended
+ movsx gammad, word [abcdq+2*3] ; abcd[3], sign-extended
+ add myd, 512+(64<<10) ; same rounding/index bias as applied to mx
+ mov r4d, 4 ; loop counter consumed by the callers' "dec r4d / jg .loop" (abcdq is no longer needed)
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+.main2: ; produces two output rows: m7 (first) and m0 (second)
+ call .h
+ psrld m7, m6, 16
+ pblendw m7, m0, 0xaa ; 67
+ WARP_V 7, 1, 3, 5, 7 ; uses pairs 01 23 45 67, then slides them to 23 45 67
+ call .h
+ psrld m10, m5, 16 ; m5 holds pair 67 after the WARP_V above
+ pblendw m10, m0, 0xaa ; 78
+ WARP_V 0, 2, 4, 6, 10 ; uses pairs 12 34 56 78, then slides them to 34 56 78
+ ret
+ALIGN function_align
+.h: ; horizontal pass for one source row: 8 outputs, each with its own filter; result in m0, then src advances one row
+ lea tmp1d, [mxq+alphaq*4] ; mx + 4*alpha
+ lea tmp2d, [mxq+alphaq*1] ; mx + 1*alpha
+ movu xm10, [srcq-6] ; source window for output 0
+ vinserti128 m10, [srcq+2], 1 ; source window for output 4
+ shr mxd, 10 ; 0
+ shr tmp1d, 10 ; 4
+ movq xm0, [filterq+mxq *8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4] ; mx + 5*alpha
+ lea mxd, [tmp2q+alphaq*1] ; mx + 2*alpha
+ movu xm8, [srcq-4] ; source window for output 1
+ vinserti128 m8, [srcq+4], 1 ; source window for output 5
+ shr tmp2d, 10 ; 1
+ shr tmp1d, 10 ; 5
+ movq xm9, [filterq+tmp2q*8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4] ; mx + 6*alpha
+ lea tmp2d, [mxq+alphaq*1] ; mx + 3*alpha
+ shr mxd, 10 ; 2
+ shr tmp1d, 10 ; 6
+ punpcklbw m0, m11, m0 ; filter bytes -> words holding (coef << 8)
+ pmaddwd m0, m10
+ movu xm10, [srcq-2] ; source window for output 2
+ vinserti128 m10, [srcq+6], 1 ; source window for output 6
+ punpcklbw m9, m11, m9
+ pmaddwd m9, m8
+ movq xm8, [filterq+mxq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4] ; mx + 7*alpha
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ phaddd m0, m9 ; 0 1 4 5
+ movu xm9, [srcq+0] ; source window for output 3
+ vinserti128 m9, [srcq+8], 1 ; source window for output 7
+ shr tmp2d, 10 ; 3
+ shr tmp1d, 10 ; 7
+ punpcklbw m8, m11, m8
+ pmaddwd m8, m10
+ movq xm10, [filterq+tmp2q*8]
+ vinserti128 m10, [filterq+tmp1q*8], 1
+ punpcklbw m10, m11, m10
+ pmaddwd m9, m10
+ add srcq, ssq ; next source row
+ phaddd m8, m9 ; 2 3 6 7
+ phaddd m0, m8 ; 0 1 2 3 4 5 6 7
+ vpsllvd m0, m13 ; shift chosen per pixel_max in the entry point
+ paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword
+ ret
+
+%macro BIDIR_FN 0 ; store loops used by the avg/w_avg/mask functions in this file: .main fills m0-m3, wq holds the per-width handler address
+ call .main ; first batch of 4 registers (64 pixels)
+ lea stride3q, [strideq*3]
+ jmp wq ; dispatch to the .wN label picked by the caller
+.w4: ; each register holds four 4-pixel rows (8 bytes per row)
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 4
+ je .ret ; h == 4: done after m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ cmp hd, 8
+ je .ret ; h == 8: done after m1
+ lea dstq, [dstq+strideq*4] ; otherwise rows 8-15 come from m2 and m3
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.ret:
+ RET
+.w8: ; each register holds two 8-pixel rows, so one .main call covers 8 rows
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ cmp hd, 4
+ jne .w8_loop_start ; h > 4: also store the m2/m3 rows
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+.w8_loop_start:
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ sub hd, 8 ; 8 rows consumed per pass
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16: ; one register per 16-pixel row, 4 rows per .main call
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32: ; two registers per 32-pixel row, 2 rows per .main call
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64: ; four registers per 64-pixel row, 1 row per .main call
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128: ; eight registers per 128-pixel row: two .main calls per row
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ call .main ; second half of the row
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 5 ; register behind the t0 alias used by avg_16bpc/w_avg_16bpc; on Win64 this is the same register number as the h argument
+%else
+DECLARE_REG_TMP 7 ; register behind the t0 alias used by avg_16bpc/w_avg_16bpc on other ABIs
+%endif
+
+cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx2_table
+ ; Combines the two intermediate buffers tmp1/tmp2 into dst.
+ ; m4 = bidir_rnd entry, m5 = bidir_mul entry, both selected by pixel_max >> 11.
+ mov t0d, r6m ; pixel_max
+ lea r6, [avg_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ shr t0d, 11
+ movsxd wq, [r6+wq*4] ; table entries are offsets relative to the table
+ add wq, r6 ; wq = address of the .wN handler used by BIDIR_FN
+ vpbroadcastd m4, [base+bidir_rnd+t0*4]
+ vpbroadcastd m5, [base+bidir_mul+t0*4]
+ movifnidn hd, hm ; keep after the last t0 use: t0 shares the h register number on Win64
+ BIDIR_FN
+ALIGN function_align
+.main: ; processes 64 values: m0-m3 = result, tmp1/tmp2 advance by 128 bytes
+ mova m0, [tmp1q+32*0]
+ mova m1, [tmp1q+32*1]
+ mova m2, [tmp1q+32*2]
+ mova m3, [tmp1q+32*3]
+ add tmp1q, 32*4
+ paddsw m0, [tmp2q+32*0] ; saturating tmp1 + tmp2
+ paddsw m1, [tmp2q+32*1]
+ paddsw m2, [tmp2q+32*2]
+ paddsw m3, [tmp2q+32*3]
+ add tmp2q, 32*4
+ ; per register: max(sum, rnd) - rnd, then high half of the product with mul
+ pmaxsw m0, m4
+ psubsw m0, m4
+ pmulhw m0, m5
+ pmaxsw m1, m4
+ psubsw m1, m4
+ pmulhw m1, m5
+ pmaxsw m2, m4
+ psubsw m2, m4
+ pmulhw m2, m5
+ pmaxsw m3, m4
+ psubsw m3, m4
+ pmulhw m3, m5
+ ret
+
+cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3 ; weighted average of tmp1/tmp2: tmp1*weight + tmp2*(16-weight), clamped to pixel_max
+ lea r6, [w_avg_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ mov t0d, r6m ; weight
+ vpbroadcastw m8, r7m ; pixel_max
+ vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538]
+ movsxd wq, [r6+wq*4]
+ paddw m7, m8 ; per-word pd_65538 + pixel_max; shifted below and added to every product in .main
+ add wq, r6 ; wq = address of the .wN handler used by BIDIR_FN
+ lea r6d, [t0-16] ; weight - 16
+ shl t0d, 16
+ sub t0d, r6d ; 16-weight, weight
+ pslld m7, 7
+ rorx r6d, t0d, 30 ; << 2
+ test dword r7m, 0x800 ; bit 11 of pixel_max
+ cmovz r6d, t0d ; bit clear: use the unshifted weights
+ movifnidn hd, hm ; keep after the last t0 use: t0 shares the h register number on Win64
+ movd xm6, r6d
+ vpbroadcastd m6, xm6 ; m6 = weight pair in every dword (low word 16-weight, high word weight)
+ BIDIR_FN
+ALIGN function_align
+.main: ; processes 64 values: m0-m3 = result, tmp1/tmp2 advance by 128 bytes
+ mova m4, [tmp1q+32*0]
+ mova m0, [tmp2q+32*0]
+ punpckhwd m5, m0, m4 ; dwords of (tmp2 low word, tmp1 high word)
+ punpcklwd m0, m4
+ mova m4, [tmp1q+32*1]
+ mova m1, [tmp2q+32*1]
+ pmaddwd m5, m6 ; tmp2*(16-weight) + tmp1*weight
+ pmaddwd m0, m6
+ paddd m5, m7
+ paddd m0, m7
+ psrad m5, 8
+ psrad m0, 8
+ packusdw m0, m5 ; unsigned saturation: negative values become 0
+ punpckhwd m5, m1, m4
+ punpcklwd m1, m4
+ mova m4, [tmp1q+32*2]
+ mova m2, [tmp2q+32*2]
+ pmaddwd m5, m6
+ pmaddwd m1, m6
+ paddd m5, m7
+ paddd m1, m7
+ psrad m5, 8
+ psrad m1, 8
+ packusdw m1, m5
+ punpckhwd m5, m2, m4
+ punpcklwd m2, m4
+ mova m4, [tmp1q+32*3]
+ mova m3, [tmp2q+32*3]
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ pmaddwd m5, m6
+ pmaddwd m2, m6
+ paddd m5, m7
+ paddd m2, m7
+ psrad m5, 8
+ psrad m2, 8
+ packusdw m2, m5
+ punpckhwd m5, m3, m4
+ punpcklwd m3, m4
+ pmaddwd m5, m6
+ pmaddwd m3, m6
+ paddd m5, m7
+ paddd m3, m7
+ psrad m5, 8
+ psrad m3, 8
+ packusdw m3, m5
+ pminsw m0, m8 ; clamp to pixel_max
+ pminsw m1, m8
+ pminsw m2, m8
+ pminsw m3, m8
+ ret
+
+cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 ; per-pixel masked blend of tmp1/tmp2: tmp1*m + tmp2*(64-m), one mask byte per pixel
+%define base r7-mask_avx2_table
+ lea r7, [mask_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ mov r6d, r7m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11 ; table index = pixel_max >> 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_64] ; 64, used to form 64-m
+ vpbroadcastd m9, [base+bidir_rnd+r6*4]
+ vpbroadcastd m10, [base+bidir_mul+r6*4]
+ mov maskq, maskmp ; mask pointer is a stack argument; loaded after r6d (same register) is no longer needed
+ add wq, r7 ; wq = address of the .wN handler used by BIDIR_FN
+ BIDIR_FN
+ALIGN function_align
+.main: ; processes 64 values: m0-m3 = result, mask advances 64 bytes, tmp1/tmp2 advance 128 bytes
+%macro MASK 1 ; register/block index: blends block %1 (16 pixels) into m%1
+ pmovzxbw m5, [maskq+16*%1] ; 16 mask bytes -> words
+ mova m%1, [tmp1q+32*%1]
+ mova m6, [tmp2q+32*%1]
+ punpckhwd m4, m%1, m6 ; dwords of (tmp1, tmp2)
+ punpcklwd m%1, m6
+ psubw m7, m8, m5 ; 64-m
+ punpckhwd m6, m5, m7 ; m, 64-m
+ punpcklwd m5, m7
+ pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m%1, m5
+ psrad m4, 5
+ psrad m%1, 5
+ packssdw m%1, m4
+ pmaxsw m%1, m9 ; max(x, rnd) - rnd, then high half of the product with mul (same tail as avg_16bpc)
+ psubsw m%1, m9
+ pmulhw m%1, m10
+%endmacro
+ MASK 0
+ MASK 1
+ MASK 2
+ MASK 3
+ add maskq, 16*4
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ ret
+
+cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 ; blends tmp1/tmp2 with a mask derived from their difference; writes one mask byte per 2x2 pixels
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ mov r6d, r8m ; pixel_max
+ movd xm0, r7m ; sign
+ movifnidn hd, hm
+ shr r6d, 11 ; table index = pixel_max >> 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+bidir_rnd+r6*4]
+ vpbroadcastd m13, [base+bidir_mul+r6*4]
+ movd xm14, [base+pw_2]
+ mov maskq, maskmp ; mask pointer is a stack argument; loaded after r6d (same register) is no longer needed
+ psubw xm14, xm0 ; 2 - sign
+ vpbroadcastw m14, xm14 ; added to every 2x2 mask sum before the >> 2
+ add wq, r7 ; wq = address of the .wN handler
+ call .main ; m0-m3 = 64 blended pixels, m4/m5 = horizontally paired mask sums
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4: ; single .main call covers up to 16 rows of 4 pixels
+ phaddd m4, m5
+ paddw m4, m14
+ psrlw m4, 2 ; (sum + 2 - sign) >> 2
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+ punpcklwd xm4, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ mova [maskq], xm4 ; always stores 16 mask bytes, even for small h
+ cmp hd, 8
+ jl .w4_end ; h == 4
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end ; flags still come from "cmp hd, 8": the stores and lea above do not modify them
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8: ; one .main call covers 8 rows of 8 pixels and 16 mask bytes
+ vperm2i128 m6, m4, m5, 0x21
+ vpblendd m4, m5, 0xf0
+ paddw m4, m14
+ paddw m4, m6 ; add the sums of vertically adjacent rows
+ psrlw m4, 2
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ mova [maskq], xm4
+ sub hd, 8
+ jl .w8_end ; h was 4: rows 4-7 are not stored
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ jg .w8_loop ; flags still come from "sub hd, 8"
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16: ; one .main call covers 4 rows of 16 pixels and 16 mask bytes
+ punpcklqdq m6, m4, m5
+ punpckhqdq m4, m5
+ paddw m6, m14
+ paddw m4, m6 ; add the sums of vertically adjacent rows
+ psrlw m4, 2
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ pshufd xm4, xm4, q3120
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ mova [maskq], xm4
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w32: ; two .main calls per iteration: 4 rows of 32 pixels and 32 mask bytes
+ paddw m4, m14
+ paddw m4, m5 ; row 0 + row 1 sums
+ psrlw m15, m4, 2 ; kept in m15 across the second .main call
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ call .main ; rows 2 and 3
+ mova m6, [deint_shuf]
+ paddw m4, m14
+ paddw m4, m5
+ psrlw m4, 2
+ packuswb m15, m4
+ vpermd m4, m6, m15 ; restore linear byte order after the in-lane pack
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m2
+ mova [dstq+stride3q +32*1], m3
+ mova [maskq], m4
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w64: ; two .main calls per iteration: 2 rows of 64 pixels and 32 mask bytes
+ paddw m4, m14
+ paddw m15, m14, m5
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [maskq], m4 ; no available registers
+ call .main ; second row
+ paddw m4, [maskq] ; add the row 0 sums parked in the mask buffer
+ mova m6, [deint_shuf]
+ paddw m5, m15
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5 ; 0 2 4 6 1 3 5 7
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ mova [maskq], m4 ; final mask bytes overwrite the scratch data
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 64
+.w128: ; four .main calls per iteration: 2 rows of 128 pixels and 64 mask bytes
+ paddw m4, m14
+ paddw m5, m14
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [maskq+32*0], m4 ; row 0 sums parked in the mask buffer ...
+ mova [dstq+strideq], m5 ; ... and in the start of dst row 1, which is overwritten with pixels further down
+ call .main ; second half of row 0
+ paddw m4, m14
+ paddw m15, m14, m5
+ mova [dstq+strideq*0+32*4], m0
+ mova [dstq+strideq*0+32*5], m1
+ mova [dstq+strideq*0+32*6], m2
+ mova [dstq+strideq*0+32*7], m3
+ mova [maskq+32*1], m4
+ call .main ; first half of row 1
+ paddw m4, [maskq+32*0]
+ paddw m5, [dstq+strideq] ; read back the parked sums before row 1 is stored
+ mova m6, [deint_shuf]
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ mova [maskq+32*0], m4
+ call .main ; second half of row 1
+ paddw m4, [maskq+32*1]
+ mova m6, [deint_shuf]
+ paddw m5, m15
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*4], m0
+ mova [dstq+strideq*1+32*5], m1
+ mova [dstq+strideq*1+32*6], m2
+ mova [dstq+strideq*1+32*7], m3
+ mova [maskq+32*1], m4
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main: ; processes 64 values: m0-m3 = blended pixels, m4/m5 = mask values summed over horizontal pairs
+%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul
+ mova m%1, [tmp1q+32*%1]
+ mova m%2, [tmp2q+32*%1]
+ punpcklwd m8, m%2, m%1 ; dwords of (tmp2, tmp1)
+ punpckhwd m9, m%2, m%1
+ psubsw m%1, m%2
+ pabsw m%1, m%1 ; |tmp1 - tmp2|
+ psubusw m7, m10, m%1 ; unsigned saturating subtract from the m10 constant
+ psrlw m7, 10 ; 64-m
+ psubw m%2, m%3, m7 ; m
+ punpcklwd m%1, m7, m%2 ; dwords of (64-m, m), matching the (tmp2, tmp1) order above
+ punpckhwd m7, m%2
+ pmaddwd m%1, m8 ; tmp2*(64-m) + tmp1*m
+ pmaddwd m7, m9
+ psrad m%1, 5
+ psrad m7, 5
+ packssdw m%1, m7
+ pmaxsw m%1, m%4 ; max(x, rnd) - rnd, then high half of the product with mul
+ psubsw m%1, m%4
+ pmulhw m%1, m%5
+%endmacro
+ W_MASK 0, 4
+ W_MASK 1, 5
+ phaddw m4, m5 ; sum horizontally adjacent mask values
+ W_MASK 2, 5
+ W_MASK 3, 6
+ phaddw m5, m6
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ ret
+
+cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 ; like w_mask_420_16bpc, but writes one mask byte per horizontal pixel pair (done inside .main)
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ mov r6d, r8m ; pixel_max
+ vpbroadcastb m14, r7m ; sign
+ movifnidn hd, hm
+ shr r6d, 11 ; table index = pixel_max >> 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615]
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+bidir_rnd+r6*4]
+ vpbroadcastd m13, [base+bidir_mul+r6*4]
+ mova m15, [base+deint_shuf] ; dword permutation applied to the packed mask bytes in .main
+ mov maskq, maskmp ; mask pointer is a stack argument; loaded after r6d (same register) is no longer needed
+ add wq, r7 ; wq = address of the .wN handler
+ call .main ; m0-m3 = 64 blended pixels; 32 mask bytes already stored
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4: ; single .main call covers up to 16 rows of 4 pixels
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8
+ jl .w4_end ; h == 4
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end ; flags still come from "cmp hd, 8": the stores and lea above do not modify them
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8: ; one .main call covers 8 rows of 8 pixels
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 8
+ jl .w8_end ; h was 4: rows 4-7 are not stored
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ jg .w8_loop ; flags still come from "sub hd, 8"
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16: ; 4 rows of 16 pixels per .main call
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32: ; 2 rows of 32 pixels per .main call
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64: ; 1 row of 64 pixels per .main call
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128: ; two .main calls per 128-pixel row
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ call .main ; second half of the row
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main: ; processes 64 values: m0-m3 = blended pixels, stores 32 mask bytes and advances maskq
+ W_MASK 0, 4 ; W_MASK is the macro defined inside w_mask_420_16bpc above
+ W_MASK 1, 5
+ phaddw m4, m5 ; sum horizontally adjacent mask values
+ W_MASK 2, 5
+ W_MASK 3, 6
+ phaddw m5, m6
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ packuswb m4, m5
+ pxor m5, m5
+ psubb m4, m14 ; pair sum - sign
+ pavgb m4, m5 ; average with 0: (pair sum - sign + 1) >> 1
+ vpermd m4, m15, m4 ; restore linear byte order after the in-lane pack
+ mova [maskq], m4
+ add maskq, 32
+ ret
+
+cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 ; like w_mask_420_16bpc, but writes one mask byte per pixel (done inside .main)
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11 ; table index = pixel_max >> 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615]
+ vpbroadcastd m4, [base+pw_64] ; passed to W_MASK as its pw_64 argument
+ vpbroadcastd m5, [base+bidir_rnd+r6*4] ; passed to W_MASK as rnd
+ vpbroadcastd m6, [base+bidir_mul+r6*4] ; passed to W_MASK as mul
+ mov maskq, maskmp ; mask pointer is a stack argument; loaded after r6d (same register) is no longer needed
+ add wq, r7 ; wq = address of the .wN handler
+ call .main ; m0/m1 = 32 blended pixels; 32 mask bytes already stored
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4: ; each .main call covers 8 rows of 4 pixels
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8
+ jl .w4_end ; h == 4
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end ; flags still come from "cmp hd, 8": the stores and lea above do not modify them
+ call .main ; rows 8-15
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8: ; 4 rows of 8 pixels per .main call
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16: ; 2 rows of 16 pixels per .main call
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32: ; 1 row of 32 pixels per .main call
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64: ; two .main calls per 64-pixel row
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128: ; four .main calls per 128-pixel row
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ call .main
+ mova [dstq+32*6], m0
+ mova [dstq+32*7], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main: ; processes 32 values: m0/m1 = blended pixels, stores 32 mask bytes and advances maskq
+ W_MASK 0, 2, 4, 5, 6 ; W_MASK is the macro defined inside w_mask_420_16bpc above; m2 receives the mask words
+ W_MASK 1, 3, 4, 5, 6
+ packuswb m2, m3 ; mask words -> bytes
+ vpermq m2, m2, q3120 ; restore linear order after the in-lane pack
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ mova [maskq], m2
+ add maskq, 32
+ ret
+
+; (a * (64 - m) + b * m + 32) >> 6
+; = (((b - a) * m + 32) >> 6) + a
+; = (((b - a) * (m << 9) + 16384) >> 15) + a
+; except m << 9 overflows int16_t when m == 64 (which is possible),
+; but if we negate m it works out (-64 << 9 == -32768).
+; = (((a - b) * (m * -512) + 16384) >> 15) + a
+cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask ; in-place blend of dst with tmp using one mask byte per pixel (formula in the comment above)
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ vpbroadcastd m6, [base+pw_m512] ; -512: turns the mask m into m * -512 for pmulhrsw
+ add wq, r6 ; wq = address of the .wN handler
+ lea r6, [dsq*3] ; r6 is reused as 3*stride once the table base is no longer needed
+ jmp wq
+.w4: ; 4 rows of 4 pixels per iteration
+ pmovzxbw m3, [maskq] ; 16 mask bytes -> words
+ movq xm0, [dstq+dsq*0]
+ movhps xm0, [dstq+dsq*1]
+ vpbroadcastq m1, [dstq+dsq*2]
+ vpbroadcastq m2, [dstq+r6 ]
+ vpblendd m0, m1, 0x30 ; row 2 into the third qword
+ vpblendd m0, m2, 0xc0 ; row 3 into the fourth qword
+ psubw m1, m0, [tmpq] ; a - b (dst - tmp)
+ add maskq, 16
+ add tmpq, 32
+ pmullw m3, m6 ; m * -512
+ pmulhrsw m1, m3 ; ((a - b) * (m * -512) + 16384) >> 15
+ paddw m0, m1 ; + a
+ vextracti128 xm1, m0, 1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8: ; 4 rows of 8 pixels per iteration
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ mova xm1, [dstq+dsq*2]
+ vinserti128 m1, [dstq+r6 ], 1
+ psubw m2, m0, [tmpq+32*0]
+ psubw m3, m1, [tmpq+32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ mova [dstq+dsq*2], xm1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16: ; 2 rows of 16 pixels per iteration
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32: ; 1 row of 32 pixels per iteration
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova m0, [dstq+32*0]
+ psubw m2, m0, [tmpq+32*0]
+ mova m1, [dstq+32*1]
+ psubw m3, m1, [tmpq+32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .w32
+ RET
+
+INIT_XMM avx2
+cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h ; in-place blend of dst with tmp using per-column weights from obmc_masks_avx2 (same weights on every row)
+%define base r5-blend_v_avx2_table
+ lea r5, [blend_v_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5 ; wq = address of the .wN handler
+ jmp wq
+.w2:
+ vpbroadcastd m2, [base+obmc_masks_avx2+2*2] ; 2 weights, repeated for both rows
+.w2_loop: ; 2 rows of 2 pixels per iteration
+ movd m0, [dstq+dsq*0]
+ pinsrd m0, [dstq+dsq*1], 1
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ psubw m1, m0, m1 ; dst - tmp
+ pmulhrsw m1, m2 ; scaled by the weight; same pmulhrsw form as blend_16bpc, which implies pre-scaled negative weights
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ pextrd [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ vpbroadcastq m2, [base+obmc_masks_avx2+4*2] ; 4 weights, repeated for both rows
+.w4_loop: ; 2 rows of 4 pixels per iteration
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ psubw m1, m0, [tmpq]
+ add tmpq, 8*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+INIT_YMM avx2
+.w8:
+ vbroadcasti128 m2, [base+obmc_masks_avx2+8*2] ; 8 weights, repeated for both rows
+.w8_loop: ; 2 rows of 8 pixels per iteration
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ psubw m1, m0, [tmpq]
+ add tmpq, 16*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ mova m4, [base+obmc_masks_avx2+16*2] ; 16 weights
+.w16_loop: ; 2 rows of 16 pixels per iteration
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add tmpq, 32*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ movaps [rsp+ 8], xmm6 ; cglobal declares only 6 vector registers, so xmm6/xmm7 are saved by hand
+ movaps [rsp+24], xmm7 ; NOTE(review): presumably the caller-provided shadow space above the return address -- confirm
+%endif
+ mova m6, [base+obmc_masks_avx2+32*2] ; weights for columns 0-15
+ vbroadcasti128 m7, [base+obmc_masks_avx2+32*3] ; weights for columns 16-23, repeated for both rows
+.w32_loop: ; 2 rows per iteration; only columns 0-23 are loaded, blended and stored
+ mova m0, [dstq+dsq*0+32*0]
+ psubw m3, m0, [tmpq +32*0]
+ mova xm2, [dstq+dsq*0+32*1] ; columns 16-23 of row 0
+ mova xm5, [tmpq +32*1]
+ mova m1, [dstq+dsq*1+32*0]
+ psubw m4, m1, [tmpq +32*2] ; tmp rows are 64 bytes (32 pixels) apart
+ vinserti128 m2, [dstq+dsq*1+32*1], 1 ; columns 16-23 of row 1
+ vinserti128 m5, [tmpq +32*3], 1
+ add tmpq, 32*4
+ psubw m5, m2, m5
+ pmulhrsw m3, m6
+ pmulhrsw m4, m6
+ pmulhrsw m5, m7
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*1+32*0], m1
+ mova [dstq+dsq*0+32*1], xm2
+ vextracti128 [dstq+dsq*1+32*1], m2, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+%if WIN64
+ movaps xmm6, [rsp+ 8] ; restore the callee-saved registers before returning
+ movaps xmm7, [rsp+24]
+%endif
+ RET
+
+%macro BLEND_H_ROW 2-3 0 ; dst_off, tmp_off, inc_tmp
+ ; Blends 32 pixels (two registers) of the current dst row with tmp in place:
+ ; dst += pmulhrsw(dst - tmp, m4), where m4 holds the row weight in every word.
+ ; %1/%2 are offsets in 32-byte units; a non-zero %3 advances tmpq by %3*32
+ ; bytes after the tmp loads. Clobbers m0-m3.
+ mova m0, [dstq+32*(%1+0)]
+ mova m1, [dstq+32*(%1+1)]
+ psubw m2, m0, [tmpq+32*(%2+0)]
+ psubw m3, m1, [tmpq+32*(%2+1)]
+%if %3 != 0
+ add tmpq, 32*%3
+%endif
+ pmulhrsw m2, m4
+ paddw m0, m2
+ mova [dstq+32*(%1+0)], m0
+ pmulhrsw m3, m4
+ paddw m1, m3
+ mova [dstq+32*(%1+1)], m1
+%endmacro
+
+INIT_XMM avx2
+cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask ; in-place blend of dst with tmp using one obmc_masks_avx2 weight per row; only the first h*3/4 rows are processed
+%define base r5-blend_h_avx2_table
+ lea r5, [blend_h_avx2_table]
+ tzcnt wd, wm ; log2(w) indexes the jump table
+ mov hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5 ; wq = address of the .wN handler
+ lea maskq, [base+obmc_masks_avx2+hq*2] ; start of the weights for this block height
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2] ; point one past the last weight that is used
+ neg hq ; negative row index: counts up to 0, so [maskq+hq*2] walks the weights
+ jmp wq
+.w2: ; 2 rows of 2 pixels per iteration
+ movd m0, [dstq+dsq*0]
+ pinsrd m0, [dstq+dsq*1], 1
+ movd m2, [maskq+hq*2] ; weights of the two rows
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ punpcklwd m2, m2 ; duplicate each row weight across its 2 pixels
+ psubw m1, m0, m1 ; dst - tmp
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ pextrd [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova m3, [blend_shuf] ; pshufb pattern that spreads the two row weights over the two rows
+.w4_loop: ; 2 rows of 4 pixels per iteration
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ movd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 8*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+INIT_YMM avx2
+.w8:
+ vbroadcasti128 m3, [blend_shuf]
+ shufpd m3, m3, 0x0c ; rearranges the shuffle pattern per 128-bit lane so each lane covers one 8-pixel row
+.w8_loop: ; 2 rows of 8 pixels per iteration
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ vpbroadcastd m2, [maskq+hq*2] ; weights of the two rows in every dword
+ psubw m1, m0, [tmpq]
+ add tmpq, 16*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+.w16: ; 2 rows of 16 pixels per iteration
+ vpbroadcastw m4, [maskq+hq*2] ; weight of the first row
+ vpbroadcastw m5, [maskq+hq*2+2] ; weight of the second row
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add tmpq, 32*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16
+ RET
+.w32: ; 1 row of 32 pixels per iteration
+ vpbroadcastw m4, [maskq+hq*2] ; weight of this row
+ BLEND_H_ROW 0, 0, 2
+ add dstq, dsq
+ inc hq
+ jl .w32
+ RET
+.w64: ; 1 row of 64 pixels per iteration
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 4 ; tmp advances by the whole 64-pixel row here
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128: ; 1 row of 128 pixels per iteration
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 8 ; tmp advances by the whole 128-pixel row here ...
+ BLEND_H_ROW 4, -4 ; ... so the second half of the row is addressed with negative tmp offsets
+ BLEND_H_ROW 6, -2
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+; emu_edge_16bpc: builds a bw x bh block in dst from the iw x ih image in src for a
+; block positioned at (x, y), replicating the nearest valid pixel wherever the block
+; reaches outside the image. Sizes and positions are in pixels (2 bytes each); both
+; strides are added to the pointers directly, i.e. they are byte strides.
+; The block is assembled in three steps:
+;   1. center rows: optional left extension, copied body, optional right extension
+;   2. bottom extension: the last center row is repeated bottom_ext times
+;   3. top extension: the first center row is repeated top_ext times
+; All loops work in 16-pixel (32-byte) steps and may write past the requested width;
+; the aligned stores additionally require dst rows to be 32-byte aligned.
+cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d ; r12 = 0, the lower clamp bound used throughout
+ lea r10, [ihq-1]
+ cmp yq, ihq
+ cmovs r10, yq ; y < ih: use y
+ test yq, yq
+ cmovs r10, r12 ; y < 0: use 0
+ imul r10, sstrideq
+ add srcq, r10
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1]
+ cmp xq, iwq
+ cmovs r10, xq
+ test xq, xq
+ cmovs r10, r12
+ lea srcq, [srcq+r10*2] ; 2 bytes per pixel
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1] ; r3 = bh - 1 (ih is no longer needed); lea keeps the flags of the sub
+ cmovs bottomextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq ; topext takes over the register that held y
+ cmovs topextq, r12
+ cmp bottomextq, bhq
+ cmovns bottomextq, r3
+ cmp topextq, bhq
+ ; >= (not >): -y == bh must clamp to bh - 1 as well, like the other three
+ ; clamps; otherwise center_h becomes 0 and the row loop below, which always
+ ; runs at least once, writes a row below the end of the block
+ cmovge topextq, r3
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1] ; r2 = bw - 1 (iw is no longer needed); lea keeps the flags of the sub
+ cmovs rightextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq ; leftext takes over the register that held x
+ cmovs leftextq, r12
+ cmp rightextq, bwq
+ cmovns rightextq, r2
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3 ; centerh takes over the register that held bh
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq ; remember the first center row; the top extension copies from it
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3: ; one center row per iteration
+%if %1
+ ; left extension
+ xor r3, r3
+ vpbroadcastw m0, [srcq] ; replicate the first valid pixel of the row
+.left_loop_%3:
+ mova [dstq+r3*2], m0
+ add r3, 16
+ cmp r3, leftextq
+ jl .left_loop_%3
+
+ ; body
+ lea r12, [dstq+leftextq*2] ; r12 = write position of the copied pixels
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3*2]
+%if %1
+ movu [r12+r3*2], m0
+%else
+ movu [dstq+r3*2], m0
+%endif
+ add r3, 16
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+%if %1
+ lea r12, [r12+centerwq*2]
+%else
+ lea r12, [dstq+centerwq*2]
+%endif
+ xor r3, r3
+ vpbroadcastw m0, [srcq+centerwq*2-2] ; replicate the last valid pixel of the row
+.right_loop_%3:
+ movu [r12+r3*2], m0
+ add r3, 16
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+
+ ; pick the row loop specialised for the extensions that are actually needed
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq ; last center row written above
+ xor r1, r1 ; r1 (centerh) is free now: reused as the column index
+.bottom_x_loop: ; column-wise: one 16-pixel strip is copied down all bottom_ext rows
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, bottomextq ; r4 (leftext) is free now: reused as the row counter
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 16
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m ; first center row, saved before the row loop
+ mov dstq, dstm ; original dst argument, i.e. the first row of the block
+ xor r1, r1
+.top_x_loop: ; column-wise: one 16-pixel strip is copied down all top_ext rows
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, topextq
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 16
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+ sub dword mx0m, 4<<14 ; adjust the start position in place on the stack (positions carry 14 fractional bits, see the >> 14 below)
+ sub dword src_wm, 8 ; src_w - 8, the upper bound for the clipped position
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pd_64] ; constant the filter sum is subtracted from before the >> 7
+ vpbroadcastw xm7, pxmaxm
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pslld m5, 3 ; dx*8
+ pslld m6, 14 ; (src_w-8) in the same fixed-point format as mx
+ paddd m8, m2 ; mx+[0..7]*dx
+.loop_y: ; one output row per iteration
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x: ; 8 output pixels per iteration
+ vpbroadcastd m10, [base+pd_63]
+ pxor m2, m2
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ pand m9, m10 ; filter offset (masked)
+ ; load source pixels
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vextracti128 xm0, m0, 1
+ movu xm10, [srcq+r8*2] ; 8 source pixels for output pixel 0
+ movu xm11, [srcq+r9*2]
+ movu xm12, [srcq+r10*2]
+ movu xm13, [srcq+r11*2]
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vinserti128 m10, [srcq+r8*2], 1 ; output pixels 4-7 go into the upper lanes
+ vinserti128 m11, [srcq+r9*2], 1
+ vinserti128 m12, [srcq+r10*2], 1
+ vinserti128 m13, [srcq+r11*2], 1
+ ptest m1, m1
+ jz .filter ; no position was clipped: skip the edge emulation shuffles
+ movq r9, xm1 ; two signed 32-bit offsets per 64-bit extract
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vextracti128 xm1, m1, 1
+ movu xm14, [base+resize_shuf+8+r8*2] ; shuffle pattern selected by the (possibly negative) offset
+ movu xm15, [base+resize_shuf+8+r9*2]
+ movu xm0, [base+resize_shuf+8+r10*2]
+ movu xm2, [base+resize_shuf+8+r11*2]
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vinserti128 m14, [base+resize_shuf+8+r8*2], 1
+ vinserti128 m15, [base+resize_shuf+8+r9*2], 1
+ vinserti128 m0, [base+resize_shuf+8+r10*2], 1
+ vinserti128 m2, [base+resize_shuf+8+r11*2], 1
+ pshufb m10, m14
+ pshufb m11, m15
+ pshufb m12, m0
+ pshufb m13, m2
+.filter:
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vextracti128 xm9, m9, 1
+ movq xm14, [base+resize_filter+r8*8] ; 8 filter bytes per entry
+ movq xm15, [base+resize_filter+r9*8]
+ movq xm0, [base+resize_filter+r10*8]
+ movq xm2, [base+resize_filter+r11*8]
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ movhps xm14, [base+resize_filter+r8*8]
+ movhps xm15, [base+resize_filter+r9*8]
+ movhps xm0, [base+resize_filter+r10*8]
+ movhps xm2, [base+resize_filter+r11*8]
+ pmovsxbw m14, xm14 ; sign-extend the taps to words
+ pmovsxbw m15, xm15
+ pmovsxbw m0, xm0
+ pmovsxbw m2, xm2
+ pmaddwd m10, m14
+ pmaddwd m11, m15
+ pmaddwd m12, m0
+ pmaddwd m13, m2
+ phaddd m10, m11
+ phaddd m12, m13
+ phaddd m10, m12 ; one dword sum per output pixel
+ psubd m10, m3, m10 ; 64 - sum; NOTE(review): suggests the table stores negated taps -- confirm against resize_filter
+ psrad m10, 7
+ vextracti128 xm0, m10, 1
+ packusdw xm10, xm0 ; unsigned saturation: negative values become 0
+ pminsw xm10, xm7 ; clamp to pxmax
+ mova [dstq+xq*2], xm10
+ paddd m4, m5 ; mx += dx*8
+ add xd, 8
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64