Diffstat (limited to 'third_party/dav1d/src/x86/mc16_sse.asm')
-rw-r--r--  third_party/dav1d/src/x86/mc16_sse.asm  8731
1 file changed, 8731 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
new file mode 100644
index 0000000000..fde8e372a3
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -0,0 +1,8731 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+; dav1d_obmc_masks[] << 9
+obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0
+ dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0
+ dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120
+ dw 4096, 3072, 2048, 1536, 0, 0, 0, 0
+ dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
+ dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608
+ dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024
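+; (the << 9 scaling lets a 6-bit mask m be applied with one pmulhrsw:
+; pmulhrsw(x, m << 9) = (x*(m << 9) + 0x4000) >> 15 = (x*m + 32) >> 6)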
+
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+rescale_mul: dd 0, 1, 2, 3
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+bdct_lb_q: times 8 db 0
+ times 8 db 4
+ times 8 db 8
+ times 8 db 12
+
+pw_2: times 8 dw 2
+pw_16: times 4 dw 16
+prep_mul: times 4 dw 16
+ times 8 dw 4
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_2048: times 4 dw 2048
+bidir_mul: times 4 dw 2048
+pw_8192: times 8 dw 8192
+pw_27615: times 8 dw 27615
+pw_32766: times 8 dw 32766
+pw_m512: times 8 dw -512
+pd_63: times 4 dd 63
+pd_64: times 4 dd 64
+pd_512: times 4 dd 512
+pd_m524256: times 4 dd -524256 ; -(8192 << 6) + 32
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000: times 4 dd 0x4000
+pq_0x400000: times 2 dq 0x400000
+pq_0x40000000: times 2 dq 0x40000000
+pd_65538: times 2 dd 65538
+
+put_bilin_h_rnd: times 4 dw 8
+ times 4 dw 10
+s_8tap_h_rnd: times 2 dd 2
+ times 2 dd 8
+put_s_8tap_v_rnd: times 2 dd 512
+ times 2 dd 128
+s_8tap_h_sh: dd 2, 4
+put_s_8tap_v_sh: dd 10, 8
+bidir_rnd: times 4 dw -16400
+ times 4 dw -16388
+put_8tap_h_rnd: dd 34, 34, 40, 40
+prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
+prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)
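+; the prep rounding constants fold the PREP output bias of -8192 into the
+; rounding add: (x + 8 - (8192 << 4)) >> 4 == ((x + 8) >> 4) - 8192; the
+; 2d constant plays the same role across the two filter passes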
+
+warp8x8_shift: dd 11, 13
+warp8x8_rnd1: dd 1024, 1024, 4096, 4096
+warp8x8_rnd2: times 4 dw 4096
+ times 4 dw 16384
+warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
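+; the table base is biased by 2*(first width) so that, after tzcnt on the
+; width, the dd-sized entries can be indexed directly with [table+wq*4]
+; (4*tzcnt(w) == 2*w for the first widths used here, 2 and 4)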
+
+BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
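+; same scheme with word-sized entries: offsets of the .put_w*/.prep_w*
+; labels relative to the bare .put/.prep entry point, loaded with movzx
+; and added back to the base address before jumping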
+
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
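+; dy == 1024 (a vertical step of exactly 1) and dy == 2048 (exactly 2)
+; have specialized code paths with their own jump tables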
+
+SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern resize_filter
+
+SECTION .text
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+INIT_XMM ssse3
+cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
+%define base t0-put_ssse3
+ mov mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ movifnidn wd, wm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ tzcnt wd, wd
+ movzx wd, word [base+put_ssse3_table+wq*2]
+ add wq, t0
+ movifnidn hd, hm
+ jmp wq
+.put_w2:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, ssq
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ add srcq, 16*8
+ add dstq, 16*8
+.put_w128_loop:
+ movu m0, [srcq-16*8]
+ movu m1, [srcq-16*7]
+ movu m2, [srcq-16*6]
+ movu m3, [srcq-16*5]
+ mova [dstq-16*8], m0
+ mova [dstq-16*7], m1
+ mova [dstq-16*6], m2
+ mova [dstq-16*5], m3
+ movu m0, [srcq-16*4]
+ movu m1, [srcq-16*3]
+ movu m2, [srcq-16*2]
+ movu m3, [srcq-16*1]
+ mova [dstq-16*4], m0
+ mova [dstq-16*3], m1
+ mova [dstq-16*2], m2
+ mova [dstq-16*1], m3
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, ssq
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128_loop
+ RET
+.h:
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ mova m4, [base+pw_16]
+ pshufb m5, [base+pw_256]
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
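+ ; out = ((16-mx)*px[x] + mx*px[x+1] + rnd) >> 4; bitdepth_max (1023 or
+ ; 4095) >> 11 gives 0 or 1, selecting the 10- or 12-bit rounding constant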
+ mov r6d, r8m ; bitdepth_max
+ shr r6d, 11
+ movddup m3, [base+put_bilin_h_rnd+r6*8]
+ movifnidn hd, hm
+ sub wd, 8
+ jg .h_w16
+ je .h_w8
+ cmp wd, -4
+ je .h_w4
+.h_w2:
+ movq m1, [srcq+ssq*0]
+ movhps m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4, m1
+ psrlq m1, 16
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ movd [dstq+dsq*0], m0
+ punpckhqdq m0, m0
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq m0, [srcq+ssq*0]
+ movhps m0, [srcq+ssq*1]
+ movq m1, [srcq+ssq*0+2]
+ movhps m1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ neg wq
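+ ; index columns from -w up to 0 so the row pointers can stay at the row
+ ; end and no separate column counter is needed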
+.h_w16_loop0:
+ mov r6, wq
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ movu m1, [srcq+r6*2+16]
+ movu m2, [srcq+r6*2+18]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+r6*2+16*0], m0
+ mova [dstq+r6*2+16*1], m1
+ add r6, 16
+ jl .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ shl mxyd, 11
+ movd m5, mxyd
+ pshufb m5, [base+pw_256]
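+ ; my << 11 turns the 4-bit fraction into a pmulhrsw multiplier:
+ ; pmulhrsw(d, my << 11) = (d*my*2048 + 0x4000) >> 15 = (d*my + 8) >> 4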
+ movifnidn hd, hm
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m2, m0, m1
+ movd m0, [srcq+ssq*0]
+ punpcklqdq m1, m0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ movd [dstq+dsq*0], m1
+ punpckhqdq m1, m1
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq m0, [srcq+ssq*0]
+.v_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m2, m0, m1
+ movq m0, [srcq+ssq*0]
+ punpcklqdq m1, m0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+%if ARCH_X86_64
+%if WIN64
+ push r7
+%endif
+ shl wd, 5
+ mov r7, srcq
+ lea r6d, [wq+hq-256]
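+ ; w*32 and h are packed into one register: movzx hd, r6b restores the
+ ; row count for each 8-column tile, the upper bits count tiles in
+ ; steps of 1<<8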
+ mov r4, dstq
+%else
+ mov r6, srcq
+%endif
+.v_w8_loop0:
+ movu m0, [srcq+ssq*0]
+.v_w8_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m5
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m5
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+%if ARCH_X86_64
+ add r7, 16
+ add r4, 16
+ movzx hd, r6b
+ mov srcq, r7
+ mov dstq, r4
+ sub r6d, 1<<8
+%else
+ mov dstq, dstmp
+ add r6, 16
+ mov hd, hm
+ add dstq, 16
+ mov srcq, r6
+ mov dstmp, dstq
+ sub wd, 8
+%endif
+ jg .v_w8_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+.hv:
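+ ; the horizontal pass keeps extra fractional bits (weights << 2 for
+ ; 10-bit) and defers rounding to the final pmulhrsw: pw_2048 = (x+8) >> 4,
+ ; pw_8192 = (x+2) >> 2; pmulhw by my << 11 applies the vertical fraction
+ ; to the doubled row difference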
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11
+ mova m3, [base+pw_2]
+ movd m6, mxyd
+ mova m7, [base+pw_8192]
+ pshufb m6, [base+pw_256]
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ mova m7, [base+pw_2048]
+.hv_12bpc:
+ movifnidn hd, hm
+ cmp wd, 4
+ jg .hv_w8
+ je .hv_w4
+.hv_w2:
+ movddup m0, [srcq+ssq*0]
+ pshufhw m1, m0, q0321
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w2_loop:
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m2, [srcq+ssq*0]
+ pmullw m1, m4, m2
+ psrlq m2, 16
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 _ 2 _
+ shufpd m2, m0, m1, 0x01 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ movd [dstq+dsq*0], m1
+ punpckhqdq m1, m1
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ movddup m0, [srcq+ssq*0]
+ movddup m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ movq m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ movhps m2, [srcq+ssq*0+2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ shufpd m2, m0, m1, 0x01 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+%if ARCH_X86_64
+%if WIN64
+ push r7
+%endif
+ shl wd, 5
+ lea r6d, [wq+hq-256]
+ mov r4, srcq
+ mov r7, dstq
+%else
+ mov r6, srcq
+%endif
+.hv_w8_loop0:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w8_loop:
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2
+ psubw m2, m1, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ pmulhrsw m2, m7
+ mova [dstq+dsq*0], m2
+ movu m0, [srcq+ssq*0]
+ movu m2, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m2, m5
+ paddw m0, m3
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m1
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m1
+ pmulhrsw m2, m7
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+%else
+ mov dstq, dstmp
+ add r6, 16
+ mov hd, hm
+ add dstq, 16
+ mov srcq, r6
+ mov dstmp, dstq
+ sub wd, 8
+%endif
+ jg .hv_w8_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+
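+; PREP stores intermediates as signed 16-bit: 10-bit input is scaled by 16
+; and 12-bit input by 4 to a common precision, then 8192 is subtracted
+; (pw_8192) to center the range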
+cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
+%define base r6-prep_ssse3
+ movifnidn mxyd, r5m ; mx
+ LEA r6, prep_ssse3
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ tzcnt wd, wd
+ movzx wd, word [base+prep_ssse3_table+wq*2]
+ mov r5d, r7m ; bitdepth_max
+ mova m5, [base+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ movddup m4, [base+prep_mul+r5*8]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*2]
+ movhps m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .prep_w16
+ RET
+.prep_w32:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ dec hd
+ jg .prep_w32
+ RET
+.prep_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*4], m0
+ mova [tmpq+16*5], m1
+ mova [tmpq+16*6], m2
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ movu m0, [srcq+16* 0]
+ movu m1, [srcq+16* 1]
+ movu m2, [srcq+16* 2]
+ movu m3, [srcq+16* 3]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ movu m0, [srcq+16* 4]
+ movu m1, [srcq+16* 5]
+ movu m2, [srcq+16* 6]
+ movu m3, [srcq+16* 7]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*4], m0
+ mova [tmpq+16*5], m1
+ mova [tmpq+16*6], m2
+ mova [tmpq+16*7], m3
+ movu m0, [srcq+16* 8]
+ movu m1, [srcq+16* 9]
+ movu m2, [srcq+16*10]
+ movu m3, [srcq+16*11]
+ add tmpq, 16*16
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq-16*8], m0
+ mova [tmpq-16*7], m1
+ mova [tmpq-16*6], m2
+ mova [tmpq-16*5], m3
+ movu m0, [srcq+16*12]
+ movu m1, [srcq+16*13]
+ movu m2, [srcq+16*14]
+ movu m3, [srcq+16*15]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq-16*4], m0
+ mova [tmpq-16*3], m1
+ mova [tmpq-16*2], m2
+ mova [tmpq-16*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ movd m4, mxyd
+ mov mxyd, r6m ; my
+ mova m3, [base+pw_16]
+ pshufb m4, [base+pw_256]
+ mova m5, [base+pw_32766]
+ psubw m3, m4
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m3, 2
+ psllw m4, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ sub wd, 8
+ je .h_w8
+ jg .h_w16
+.h_w4:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*0+2]
+ movhps m1, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 16
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ lea srcq, [srcq+wq*2]
+ neg wq
+.h_w16_loop0:
+ mov r6, wq
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ movu m1, [srcq+r6*2+16]
+ movu m2, [srcq+r6*2+18]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ add r6, 16
+ jl .h_w16_loop
+ add srcq, strideq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ movd m4, mxyd
+ mova m3, [base+pw_16]
+ pshufb m4, [base+pw_256]
+ mova m5, [base+pw_32766]
+ psubw m3, m4
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m3, 2
+ psllw m4, 2
+.v_12bpc:
+ cmp wd, 8
+ je .v_w8
+ jg .v_w16
+.v_w4:
+ movq m0, [srcq+strideq*0]
+.v_w4_loop:
+ movq m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklqdq m1, m0, m2 ; 0 1
+ movq m0, [srcq+strideq*0]
+ punpcklqdq m2, m0 ; 1 2
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu m0, [srcq+strideq*0]
+.v_w8_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4, m2
+ psubw m0, m5
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m3
+ mova [tmpq+16*0], m1
+ pmullw m1, m4, m0
+ psubw m2, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+%if WIN64
+ push r7
+%endif
+ mov r5, srcq
+%if ARCH_X86_64
+ lea r6d, [wq*4-32]
+ mov wd, wd
+ lea r6d, [hq+r6*8]
+ mov r7, tmpq
+%else
+ mov r6d, wd
+%endif
+.v_w16_loop0:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4, m2
+ psubw m0, m5
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m3
+ mova [tmpq+wq*0], m1
+ pmullw m1, m4, m0
+ psubw m2, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+wq*2], m1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w16_loop
+%if ARCH_X86_64
+ add r5, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+%else
+ mov tmpq, tmpmp
+ add r5, 16
+ mov hd, hm
+ add tmpq, 16
+ mov srcq, r5
+ mov tmpmp, tmpq
+ sub r6d, 8
+%endif
+ jg .v_w16_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 7
+ shl mxyd, 11
+ movd m6, mxyd
+ pshufb m6, [base+pw_256]
+ cmp wd, 8
+ je .hv_w8
+ jg .hv_w16
+.hv_w4:
+ movddup m0, [srcq+strideq*0]
+ movddup m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w4_loop:
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ movhps m1, [srcq+strideq*0]
+ movhps m2, [srcq+strideq*0+2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ shufpd m2, m0, m1, 0x01 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w8_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+16*0], m2
+ movu m0, [srcq+strideq*0]
+ movu m2, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m2, m4
+ psubw m0, m5
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+%if WIN64
+ push r7
+%endif
+ mov r5, srcq
+%if ARCH_X86_64
+ lea r6d, [wq*4-32]
+ mov wd, wd
+ lea r6d, [hq+r6*8]
+ mov r7, tmpq
+%else
+ mov r6d, wd
+%endif
+.hv_w16_loop0:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w16_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+wq*0], m2
+ movu m0, [srcq+strideq*0]
+ movu m2, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m2, m4
+ psubw m0, m5
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+wq*2], m2
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w16_loop
+%if ARCH_X86_64
+ add r5, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+%else
+ mov tmpq, tmpmp
+ add r5, 16
+ mov hd, hm
+ add tmpq, 16
+ mov srcq, r5
+ mov tmpmp, tmpq
+ sub r6d, 8
+%endif
+ jg .hv_w16_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
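+; each constant packs two row offsets into subpel_filters: the low word
+; selects the 4-tap variant (rows 3*15 and 4*15, used for w <= 4) and the
+; high word the 8-tap variant; adding mx*0x010101 below lets mxb index the
+; 4-tap filter and mxd >> 16 the 8-tap one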
+
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2, 6
+%elif WIN64
+DECLARE_REG_TMP 4, 5, 8
+%else
+DECLARE_REG_TMP 7, 8, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my
+%define mxb r0b
+%define mxd r0
+%define mxq r0
+%define myb r1b
+%define myd r1
+%define myq r1
+%define m8 [esp+16*0]
+%define m9 [esp+16*1]
+%define m10 [esp+16*2]
+%define m11 [esp+16*3]
+%define m12 [esp+16*4]
+%define m13 [esp+16*5]
+%define m14 [esp+16*6]
+%define m15 [esp+16*7]
+%else
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%endif
+%define base t2-put_ssse3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ LEA t2, put_ssse3
+ movifnidn wd, wm
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base+put_ssse3_table+wq*2]
+ movifnidn dstq, dstmp
+ movifnidn dsq, dsmp
+ add wq, t2
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov myd, r8m
+ movd m5, r8m
+ shr myd, 11
+ movddup m4, [base+put_8tap_h_rnd+myq*8]
+ movifnidn dsq, dsmp
+ pshufb m5, [base+pw_256]
+ cmp wd, 4
+ jg .h_w8
+ movzx mxd, mxb
+ lea srcq, [srcq-2]
+ movq m3, [base+subpel_filters+mxq*8]
+ movifnidn dstq, dstmp
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
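+ ; punpcklbw x, x gives (b << 8) | b per coefficient; the arithmetic
+ ; shift by 8 then yields the sign-extended 16-bit value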
+ je .h_w4
+.h_w2:
+ mova m2, [base+spel_h_shuf2]
+ pshufd m3, m3, q2121
+.h_w2_loop:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m2
+ pshufb m1, m2
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ phaddd m0, m1
+ paddd m0, m4
+ psrad m0, 6
+ packssdw m0, m0
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ movd [dstq+dsq*0], m0
+ pshuflw m0, m0, q3232
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ WIN64_SPILL_XMM 8
+ mova m6, [base+spel_h_shufA]
+ mova m7, [base+spel_h_shufB]
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q2222
+.h_w4_loop:
+ movu m1, [srcq]
+ add srcq, ssq
+ pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ paddd m0, m4
+ paddd m0, m1
+ psrad m0, 6
+ packssdw m0, m0
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ movq [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w4_loop
+ RET
+.h_w8:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+%endif
+ shr mxd, 16
+ movq m3, [base+subpel_filters+mxq*8]
+ movifnidn dstq, dstmp
+ mova m6, [base+spel_h_shufA]
+ mova m7, [base+spel_h_shufB]
+%if UNIX64
+ mov wd, wd
+%endif
+ lea srcq, [srcq+wq*2]
+ punpcklbw m3, m3
+ lea dstq, [dstq+wq*2]
+ psraw m3, 8
+ neg wq
+%if ARCH_X86_32
+ ALLOC_STACK -16*4
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+.h_w8_loop0:
+ mov r6, wq
+.h_w8_loop:
+ movu m0, [srcq+r6*2- 6]
+ movu m1, [srcq+r6*2+ 2]
+ pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m0, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m2, m8 ; abcd0
+ pmaddwd m0, m9 ; abcd1
+ pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8
+ pshufb m1, m7 ; 6 7 7 8 8 9 9 a
+ paddd m2, m4
+ paddd m0, m2
+ pmaddwd m2, m10, m3 ; abcd2
+ pmaddwd m3, m8 ; efgh0
+ paddd m0, m2
+ pmaddwd m2, m11, m1 ; abcd3
+ pmaddwd m1, m9 ; efgh1
+ paddd m0, m2
+ movu m2, [srcq+r6*2+10]
+ paddd m3, m4
+ paddd m1, m3
+ pshufb m3, m2, m6 ; 8 9 9 a a b b c
+ pshufb m2, m7 ; a b b c c d d e
+ pmaddwd m3, m10 ; efgh2
+ pmaddwd m2, m11 ; efgh3
+ paddd m1, m3
+ paddd m1, m2
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ mova [dstq+r6*2], m0
+ add r6, 8
+ jl .h_w8_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w8_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+%if WIN64
+ WIN64_SPILL_XMM 15
+%endif
+ movd m7, r8m
+ movifnidn dstq, dstmp
+ movifnidn dsq, dsmp
+ punpcklbw m3, m3
+ pshufb m7, [base+pw_256]
+ psraw m3, 8 ; sign-extend
+%if ARCH_X86_32
+ ALLOC_STACK -16*7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+ lea r6, [ssq*3]
+ sub srcq, r6
+ cmp wd, 2
+ jne .v_w4
+.v_w2:
+ movd m1, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ movd m2, [srcq+ssq*2]
+ add srcq, r6
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m6, [srcq+ssq*2]
+ add srcq, r6
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m4 ; 0 1
+ punpckldq m4, m2 ; 1 2
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m6 ; 4 5
+ punpckldq m6, m0 ; 5 6
+ punpcklwd m1, m4 ; 01 12
+ punpcklwd m2, m5 ; 23 34
+ punpcklwd m3, m6 ; 45 56
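+ ; the interleaved row pairs form a sliding window; each iteration below
+ ; shifts it down two rows and filters two output rows at once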
+ pxor m6, m6
+.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ punpckldq m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpckldq m4, m0 ; 7 8
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
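+ ; >> 5 then pavgw with zero = (x + 32) >> 6 with rounding; negatives
+ ; are clamped first (pmaxsw) since pavgw is an unsigned average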
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pavgw m5, m6
+ pminsw m5, m7
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q3232
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+%if ARCH_X86_32
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*29], srcq
+ mov [esp+4*30], dstq
+%else
+ mov srcmp, srcq
+%endif
+ lea wd, [wq+hq-(1<<16)]
+%else
+ shl wd, 6
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [wq+hq-(1<<8)]
+%endif
+.v_w4_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, r6
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, r6
+ movq m0, [srcq+ssq*0]
+ punpcklwd m1, m2 ; 01
+ punpcklwd m2, m3 ; 12
+ punpcklwd m3, m4 ; 23
+ punpcklwd m4, m5 ; 34
+ punpcklwd m5, m6 ; 45
+ punpcklwd m6, m0 ; 56
+%if ARCH_X86_32
+ jmp .v_w4_loop_start
+.v_w4_loop:
+ mova m1, m12
+ mova m2, m13
+ mova m3, m14
+.v_w4_loop_start:
+ pmaddwd m1, m8 ; a0
+ pmaddwd m2, m8 ; b0
+ mova m12, m3
+ mova m13, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m1, m3
+ paddd m2, m4
+ mova m14, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m3, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m1, m3
+ pmaddwd m3, m11, m6 ; b3
+ paddd m2, m3
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ pxor m2, m2
+ pmaxsw m1, m2
+ pavgw m1, m2
+ pminsw m1, m7
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+%if STACK_ALIGNMENT < 16
+ mov srcq, [esp+4*29]
+ mov dstq, [esp+4*30]
+ movzx hd, ww
+ add srcq, 8
+ add dstq, 8
+ mov [esp+4*29], srcq
+ mov [esp+4*30], dstq
+%else
+ mov srcq, srcmp
+ mov dstq, dstmp
+ movzx hd, ww
+ add srcq, 8
+ add dstq, 8
+ mov srcmp, srcq
+ mov dstmp, dstq
+%endif
+ sub wd, 1<<16
+%else
+.v_w4_loop:
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ paddd m13, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m14, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 5
+ psrad m13, 5
+ packssdw m12, m13
+ pxor m13, m13
+ pmaxsw m12, m13
+ pavgw m12, m13
+ pminsw m12, m7
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ add r7, 8
+ add r8, 8
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+%endif
+ jg .v_w4_loop0
+ RET
+.hv:
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+%if ARCH_X86_32
+ movd m4, r8m
+ mova m6, [base+pd_512]
+ pshufb m4, [base+pw_256]
+%else
+%if WIN64
+ ALLOC_STACK 16*6, 16
+%endif
+ movd m15, r8m
+ pshufb m15, [base+pw_256]
+%endif
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ je .hv_w4
+ movq m0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+ mov dstq, dstmp
+ mov dsq, dsmp
+ mova m5, [base+spel_h_shuf2]
+ ALLOC_STACK -16*8
+%else
+ mova m6, [base+pd_512]
+ mova m9, [base+spel_h_shuf2]
+%endif
+ pshuflw m0, m0, q2121
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_w2_10bpc
+ psraw m7, 2
+ psllw m3, 2
+.hv_w2_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+%if ARCH_X86_32
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m9, m5
+ mova m11, m0
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+ mova m15, m4
+%else
+ pshufd m11, m3, q0000
+ pshufd m12, m3, q1111
+ pshufd m13, m3, q2222
+ pshufd m14, m3, q3333
+%endif
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ movu m1, [srcq+ssq*2]
+ add srcq, r6
+ movu m4, [srcq+ssq*0]
+%if ARCH_X86_32
+ REPX {pshufb x, m5}, m2, m3, m1, m4
+%else
+ REPX {pshufb x, m9}, m2, m3, m1, m4
+%endif
+ REPX {pmaddwd x, m7}, m2, m3, m1, m4
+ phaddd m2, m3 ; 0 1
+ phaddd m1, m4 ; 2 3
+ movu m3, [srcq+ssq*1]
+ movu m4, [srcq+ssq*2]
+ add srcq, r6
+ movu m0, [srcq+ssq*0]
+%if ARCH_X86_32
+ REPX {pshufb x, m5}, m3, m4, m0
+%else
+ REPX {pshufb x, m9}, m3, m4, m0
+%endif
+ REPX {pmaddwd x, m7}, m3, m4, m0
+ phaddd m3, m4 ; 4 5
+ phaddd m0, m0 ; 6 6
+ REPX {paddd x, m6}, m2, m1, m3, m0
+ REPX {psrad x, 10}, m2, m1, m3, m0
+ packssdw m2, m1 ; 0 1 2 3
+ packssdw m3, m0 ; 4 5 6 _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ pshufd m5, m3, q0321 ; 5 6 _ _
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ punpcklwd m3, m5 ; 45 56
+.hv_w2_loop:
+ movu m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movu m5, [srcq+ssq*0]
+ pshufb m4, m9
+ pshufb m5, m9
+ pmaddwd m4, m7
+ pmaddwd m5, m7
+ phaddd m4, m5
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ paddd m4, m6
+ psrad m4, 10 ; 7 8
+ packssdw m0, m4
+ pshufd m3, m0, q2103
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m5, m6
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ pxor m4, m4
+ pminsw m5, m15
+ pmaxsw m5, m4
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q3232
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+.hv_w4:
+ movq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ mov dstq, dstmp
+ mov dsq, dsmp
+ mova m0, [base+spel_h_shufA]
+ mova m1, [base+spel_h_shufB]
+ ALLOC_STACK -16*15
+ mova m8, m0
+ mova m9, m1
+ mova m14, m6
+%else
+ mova m8, [base+spel_h_shufA]
+ mova m9, [base+spel_h_shufB]
+%endif
+ pxor m0, m0
+ punpcklbw m0, m2
+ punpcklbw m3, m3
+ psraw m3, 8
+ test dword r8m, 0x800
+ jz .hv_w4_10bpc
+ psraw m0, 2
+ psllw m3, 2
+.hv_w4_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+%if ARCH_X86_32
+ %define tmp esp+16*8
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*61], srcq
+ mov [esp+4*62], dstq
+%else
+ mov srcmp, srcq
+%endif
+ mova [tmp+16*5], m4
+ lea wd, [wq+hq-(1<<16)]
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ pshufd m5, m0, q2222
+ pshufd m0, m0, q3333
+ mova m10, m1
+ mova m11, m2
+ mova m12, m5
+ mova m13, m0
+%else
+%if WIN64
+ %define tmp rsp
+%else
+ %define tmp rsp-104 ; red zone
+%endif
+ shl wd, 6
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [wq+hq-(1<<8)]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ mova [tmp+16*5], m15
+%endif
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [tmp+16*1], m0
+ mova [tmp+16*2], m1
+ mova [tmp+16*3], m2
+ mova [tmp+16*4], m3
+%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
+ pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
+ pshufb m%1, m9 ; 2 3 3 4 4 5 5 6
+ pmaddwd m%3, m10
+ pmaddwd m%1, m11
+ paddd m%3, %5
+ paddd m%1, m%3
+ pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ pmaddwd m%3, m12
+ pmaddwd m%2, m13
+ paddd m%1, m%3
+ paddd m%1, m%2
+ psrad m%1, %4
+%endmacro
+.hv_w4_loop0:
+%if ARCH_X86_64
+ mova m14, [pd_512]
+%endif
+ movu m4, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ movu m5, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ movu m6, [srcq+ssq*2+0]
+ movu m3, [srcq+ssq*2+8]
+ add srcq, r6
+ PUT_8TAP_HV_H 4, 1, 0, 10
+ PUT_8TAP_HV_H 5, 2, 0, 10
+ PUT_8TAP_HV_H 6, 3, 0, 10
+ movu m7, [srcq+ssq*0+0]
+ movu m2, [srcq+ssq*0+8]
+ movu m1, [srcq+ssq*1+0]
+ movu m3, [srcq+ssq*1+8]
+ PUT_8TAP_HV_H 7, 2, 0, 10
+ PUT_8TAP_HV_H 1, 3, 0, 10
+ movu m2, [srcq+ssq*2+0]
+ movu m3, [srcq+ssq*2+8]
+ add srcq, r6
+ PUT_8TAP_HV_H 2, 3, 0, 10
+ packssdw m4, m7 ; 0 3
+ packssdw m5, m1 ; 1 4
+ movu m0, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 1, 3, 10
+ packssdw m6, m2 ; 2 5
+ packssdw m7, m0 ; 3 6
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+%if ARCH_X86_32
+ jmp .hv_w4_loop_start
+.hv_w4_loop:
+ mova m1, [tmp+16*6]
+ mova m2, m15
+.hv_w4_loop_start:
+ mova m7, [tmp+16*1]
+ pmaddwd m1, m7 ; a0
+ pmaddwd m2, m7 ; b0
+ mova m7, [tmp+16*2]
+ mova [tmp+16*6], m3
+ pmaddwd m3, m7 ; a1
+ mova m15, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m1, m3
+ paddd m2, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 10
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 10
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m1, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m2, m7 ; b3
+ psrad m1, 9
+ psrad m2, 9
+ packssdw m1, m2
+ pxor m7, m7
+ pmaxsw m1, m7
+ pavgw m7, m1
+ pminsw m7, [tmp+16*5]
+ movq [dstq+dsq*0], m7
+ movhps [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+%if STACK_ALIGNMENT < 16
+ mov srcq, [esp+4*61]
+ mov dstq, [esp+4*62]
+ add srcq, 8
+ add dstq, 8
+ mov [esp+4*61], srcq
+ mov [esp+4*62], dstq
+%else
+ mov srcq, srcmp
+ mov dstq, dstmp
+ add srcq, 8
+ add dstq, 8
+ mov srcmp, srcq
+ mov dstmp, dstq
+%endif
+ movzx hd, ww
+ sub wd, 1<<16
+%else
+.hv_w4_loop:
+ mova m15, [tmp+16*1]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ mova m7, [tmp+16*2]
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m14, m3
+ paddd m15, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m14, m5
+ paddd m15, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512]
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512]
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m14, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m15, m7 ; b3
+ psrad m14, 9
+ psrad m15, 9
+ packssdw m14, m15
+ pxor m7, m7
+ pmaxsw m14, m7
+ pavgw m7, m14
+ pminsw m7, [tmp+16*5]
+ movq [dstq+dsq*0], m7
+ movhps [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ add r7, 8
+ add r8, 8
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+%endif
+ jg .hv_w4_loop0
+ RET
+%undef tmp
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 2, 1, 6, 4
+%elif WIN64
+DECLARE_REG_TMP 6, 4, 7, 4
+%else
+DECLARE_REG_TMP 6, 7, 7, 8
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my
+%define mxb r0b
+%define mxd r0
+%define mxq r0
+%define myb r2b
+%define myd r2
+%define myq r2
+%else
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
+%endif
+%define base t2-prep_ssse3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ LEA t2, prep_ssse3
+ movifnidn wd, wm
+ movifnidn srcq, srcmp
+ test mxd, 0xf00
+ jnz .h
+ movifnidn hd, hm
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov myd, r7m ; bitdepth_max
+ movzx wd, word [base+prep_ssse3_table+wq*2]
+ mova m5, [base+pw_8192]
+ shr myd, 11
+ add wq, t2
+ movddup m4, [base+prep_mul+myq*8]
+ movifnidn ssq, ssmp
+ movifnidn tmpq, tmpmp
+ lea r6, [ssq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ movifnidn ssq, r2mp
+ movifnidn hd, r4m
+ movddup m5, [base+prep_8tap_1d_rnd]
+ cmp wd, 4
+ jne .h_w8
+ movzx mxd, mxb
+ movq m0, [base+subpel_filters+mxq*8]
+ mova m3, [base+spel_h_shufA]
+ mova m4, [base+spel_h_shufB]
+ movifnidn tmpq, tmpmp
+ sub srcq, 2
+ WIN64_SPILL_XMM 8
+ punpcklbw m0, m0
+ psraw m0, 8
+ test dword r7m, 0x800
+ jnz .h_w4_12bpc
+ psllw m0, 2
+.h_w4_12bpc:
+ pshufd m6, m0, q1111
+ pshufd m7, m0, q2222
+.h_w4_loop:
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m4 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ paddd m0, m5
+ paddd m0, m1
+ pshufb m1, m2, m3
+ pshufb m2, m4
+ pmaddwd m1, m6
+ pmaddwd m2, m7
+ paddd m1, m5
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq], m0
+ add tmpq, 16
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ WIN64_SPILL_XMM 11
+ shr mxd, 16
+ movq m2, [base+subpel_filters+mxq*8]
+ mova m4, [base+spel_h_shufA]
+ mova m6, [base+spel_h_shufB]
+ movifnidn tmpq, r0mp
+ add wd, wd
+ punpcklbw m2, m2
+ add srcq, wq
+ psraw m2, 8
+ add tmpq, wq
+ neg wq
+ test dword r7m, 0x800
+ jnz .h_w8_12bpc
+ psllw m2, 2
+.h_w8_12bpc:
+ pshufd m7, m2, q0000
+%if ARCH_X86_32
+ ALLOC_STACK -16*3
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+%else
+ pshufd m8, m2, q1111
+ pshufd m9, m2, q2222
+ pshufd m10, m2, q3333
+%endif
+.h_w8_loop0:
+ mov r6, wq
+.h_w8_loop:
+ movu m0, [srcq+r6- 6]
+ movu m1, [srcq+r6+ 2]
+ pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4
+ pshufb m0, m6 ; 2 3 3 4 4 5 5 6
+ pmaddwd m2, m7 ; abcd0
+ pmaddwd m0, m8 ; abcd1
+ pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8
+ pshufb m1, m6 ; 6 7 7 8 8 9 9 a
+ paddd m2, m5
+ paddd m0, m2
+ pmaddwd m2, m9, m3 ; abcd2
+ pmaddwd m3, m7 ; efgh0
+ paddd m0, m2
+ pmaddwd m2, m10, m1 ; abcd3
+ pmaddwd m1, m8 ; efgh1
+ paddd m0, m2
+ movu m2, [srcq+r6+10]
+ paddd m3, m5
+ paddd m1, m3
+ pshufb m3, m2, m4 ; 8 9 9 a a b b c
+ pshufb m2, m6 ; a b b c c d d e
+ pmaddwd m3, m9 ; efgh2
+ pmaddwd m2, m10 ; efgh3
+ paddd m1, m3
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq+r6], m0
+ add r6, 16
+ jl .h_w8_loop
+ add srcq, ssq
+ sub tmpq, wq
+ dec hd
+ jg .h_w8_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ WIN64_SPILL_XMM 15
+ movddup m7, [base+prep_8tap_1d_rnd]
+ movifnidn ssq, r2mp
+ movifnidn tmpq, r0mp
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m3, 2
+.v_12bpc:
+%if ARCH_X86_32
+ ALLOC_STACK -16*7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+ lea r6, [ssq*3]
+ sub srcq, r6
+ mov r6d, wd
+ shl wd, 6
+ mov r5, srcq
+%if ARCH_X86_64
+ mov r7, tmpq
+%elif STACK_ALIGNMENT < 16
+ mov [esp+4*29], tmpq
+%endif
+ lea wd, [wq+hq-(1<<8)]
+.v_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m3, [srcq+ssq*0]
+ movq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m5, [srcq+ssq*0]
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m0, [srcq+ssq*0]
+ punpcklwd m1, m2 ; 01
+ punpcklwd m2, m3 ; 12
+ punpcklwd m3, m4 ; 23
+ punpcklwd m4, m5 ; 34
+ punpcklwd m5, m6 ; 45
+ punpcklwd m6, m0 ; 56
+%if ARCH_X86_32
+ jmp .v_loop_start
+.v_loop:
+ mova m1, m12
+ mova m2, m13
+ mova m3, m14
+.v_loop_start:
+ pmaddwd m1, m8 ; a0
+ pmaddwd m2, m8 ; b0
+ mova m12, m3
+ mova m13, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m1, m3
+ paddd m2, m4
+ mova m14, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m3, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m1, m7
+ paddd m1, m3
+ pmaddwd m3, m11, m6 ; b3
+ paddd m2, m7
+ paddd m2, m3
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ movq [tmpq+r6*0], m1
+ movhps [tmpq+r6*2], m1
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .v_loop
+%if STACK_ALIGNMENT < 16
+ mov tmpq, [esp+4*29]
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov [esp+4*29], tmpq
+%else
+ mov tmpq, tmpmp
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov tmpmp, tmpq
+%endif
+%else
+.v_loop:
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ paddd m13, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m14, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m12, m7
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m7
+ paddd m13, m14
+ psrad m12, 4
+ psrad m13, 4
+ packssdw m12, m13
+ movq [tmpq+r6*0], m12
+ movhps [tmpq+r6*2], m12
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 8
+ add r7, 8
+ mov srcq, r5
+ mov tmpq, r7
+%endif
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .v_loop0
+ RET
+.hv:
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ movzx t3d, mxb
+ shr mxd, 16
+ cmp wd, 4
+ cmove mxd, t3d
+ movifnidn hd, r4m
+ movq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+ mov ssq, r2mp
+ mov tmpq, r0mp
+ mova m0, [base+spel_h_shufA]
+ mova m1, [base+spel_h_shufB]
+ mova m4, [base+prep_8tap_2d_rnd]
+ ALLOC_STACK -16*14
+ mova m8, m0
+ mova m9, m1
+ mova m14, m4
+%else
+%if WIN64
+ ALLOC_STACK 16*6, 16
+%endif
+ mova m8, [base+spel_h_shufA]
+ mova m9, [base+spel_h_shufB]
+%endif
+ pxor m0, m0
+ punpcklbw m0, m2
+ punpcklbw m3, m3
+ psraw m0, 4
+ psraw m3, 8
+ test dword r7m, 0x800
+ jz .hv_10bpc
+ psraw m0, 2
+.hv_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ mov r6d, wd
+ shl wd, 6
+ mov r5, srcq
+%if ARCH_X86_32
+ %define tmp esp+16*8
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*61], tmpq
+%endif
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ pshufd m5, m0, q2222
+ pshufd m0, m0, q3333
+ mova m10, m1
+ mova m11, m2
+ mova m12, m5
+ mova m13, m0
+%else
+%if WIN64
+ %define tmp rsp
+%else
+ %define tmp rsp-88 ; red zone
+%endif
+ mov r7, tmpq
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ lea wd, [wq+hq-(1<<8)]
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [tmp+16*1], m0
+ mova [tmp+16*2], m1
+ mova [tmp+16*3], m2
+ mova [tmp+16*4], m3
+.hv_loop0:
+%if ARCH_X86_64
+ mova m14, [prep_8tap_2d_rnd]
+%endif
+ movu m4, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ movu m5, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ movu m6, [srcq+ssq*0+0]
+ movu m3, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 4, 1, 0, 6
+ PUT_8TAP_HV_H 5, 2, 0, 6
+ PUT_8TAP_HV_H 6, 3, 0, 6
+ movu m7, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ movu m1, [srcq+ssq*0+0]
+ movu m3, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 7, 2, 0, 6
+ PUT_8TAP_HV_H 1, 3, 0, 6
+ movu m2, [srcq+ssq*1+0]
+ movu m3, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 2, 3, 0, 6
+ packssdw m4, m7 ; 0 3
+ packssdw m5, m1 ; 1 4
+ movu m0, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 1, 3, 6
+ packssdw m6, m2 ; 2 5
+ packssdw m7, m0 ; 3 6
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+%if ARCH_X86_32
+ jmp .hv_loop_start
+.hv_loop:
+ mova m1, [tmp+16*5]
+ mova m2, m15
+.hv_loop_start:
+ mova m7, [tmp+16*1]
+ pmaddwd m1, m7 ; a0
+ pmaddwd m2, m7 ; b0
+ mova m7, [tmp+16*2]
+ mova [tmp+16*5], m3
+ pmaddwd m3, m7 ; a1
+ mova m15, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m1, m14
+ paddd m2, m14
+ paddd m1, m3
+ paddd m2, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 6
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 6
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m1, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m2, m7 ; b3
+ psrad m1, 6
+ psrad m2, 6
+ packssdw m1, m2
+ movq [tmpq+r6*0], m1
+ movhps [tmpq+r6*2], m1
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .hv_loop
+%if STACK_ALIGNMENT < 16
+ mov tmpq, [esp+4*61]
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov [esp+4*61], tmpq
+%else
+ mov tmpq, tmpmp
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov tmpmp, tmpq
+%endif
+%else
+.hv_loop:
+ mova m15, [tmp+16*1]
+ mova m7, [prep_8tap_2d_rnd]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ paddd m14, m7
+ paddd m15, m7
+ mova m7, [tmp+16*2]
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m14, m3
+ paddd m15, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m14, m5
+ paddd m15, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd]
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd]
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m14, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m15, m7 ; b3
+ psrad m14, 6
+ psrad m15, 6
+ packssdw m14, m15
+ movq [tmpq+r6*0], m14
+ movhps [tmpq+r6*2], m14
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 8
+ add r7, 8
+ mov srcq, r5
+ mov tmpq, r7
+%endif
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .hv_loop0
+ RET
+%undef tmp
+
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro SAVE_REG 1
+ %xdefine r%1_save r%1
+ %xdefine r%1q_save r%1q
+ %xdefine r%1d_save r%1d
+ %if ARCH_X86_32
+ %define r%1m_save [rstk+stack_offset+(%1+1)*4]
+ %endif
+%endmacro
+
+%macro LOAD_REG 1
+ %xdefine r%1 r%1_save
+ %xdefine r%1q r%1q_save
+ %xdefine r%1d r%1d_save
+ %if ARCH_X86_32
+ %define r%1m r%1m_save
+ %endif
+ %undef r%1d_save
+ %undef r%1q_save
+ %undef r%1_save
+%endmacro
+
+%macro REMAP_REG 2-3
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+ %if ARCH_X86_32
+ %if %3 == 0
+ %xdefine r%1m r%2m
+ %else
+ %define r%1m [rstk+stack_offset+(%1+1)*4]
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %if ARCH_X86_64
+ SAVE_REG 14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %else
+ SAVE_REG 5
+ %assign %%i 5
+ %rep 5
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j, 0
+ %assign %%i %%i-1
+ %endrep
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %if ARCH_X86_64
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 14
+ %else
+ %rep 4
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j, 1
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 5
+ %endif
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
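+; prep_8tap_scaled takes one pointer argument fewer than put_8tap_scaled,
+; so for prep the macros above shift every rN name down by one and both
+; functions can share the same macro body; the RET helper restores the
+; default mapping around the epilogue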
+
+%if ARCH_X86_32
+ %macro MC_4TAP_SCALED_H 1 ; dst_mem
+ movu m7, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m5, [r4 +ssq*0]
+ movu m6, [r4 +ssq*1]
+ lea srcq, [srcq+ssq*2]
+ lea r4, [r4 +ssq*2]
+ REPX {pshufb x, m12}, m7, m2
+ REPX {pmaddwd x, m13}, m7, m2
+ REPX {pshufb x, m14}, m5, m6
+ REPX {pmaddwd x, m15}, m5, m6
+ phaddd m7, m5
+ phaddd m2, m6
+ mova m5, [esp+0x00]
+ movd m6, [esp+0x10]
+ paddd m7, m5
+ paddd m2, m5
+ psrad m7, m6
+ psrad m2, m6
+ packssdw m7, m2
+ mova [stk+%1], m7
+ %endmacro
+%endif
+
+%if ARCH_X86_64
+ %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movu m%1, [srcq+ r4*2]
+ movu m%2, [srcq+ r6*2]
+ movu m%3, [srcq+ r7*2]
+ movu m%4, [srcq+ r9*2]
+ movu m%5, [srcq+r10*2]
+ movu m%6, [srcq+r11*2]
+ movu m%7, [srcq+r13*2]
+ movu m%8, [srcq+ rX*2]
+ add srcq, ssq
+ pmaddwd m%1, [stk+0x10]
+ pmaddwd m%2, [stk+0x20]
+ pmaddwd m%3, [stk+0x30]
+ pmaddwd m%4, [stk+0x40]
+ pmaddwd m%5, [stk+0x50]
+ pmaddwd m%6, [stk+0x60]
+ pmaddwd m%7, [stk+0x70]
+ pmaddwd m%8, [stk+0x80]
+ phaddd m%1, m%2
+ phaddd m%3, m%4
+ phaddd m%5, m%6
+ phaddd m%7, m%8
+ phaddd m%1, m%3
+ phaddd m%5, m%7
+ paddd m%1, hround
+ paddd m%5, hround
+ psrad m%1, m12
+ psrad m%5, m12
+ packssdw m%1, m%5
+ %endmacro
+%else
+ %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
+ %if %3 == 1
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ %endif
+ movu m0, [srcq+r0*2]
+ movu m1, [srcq+rX*2]
+ movu m2, [srcq+r4*2]
+ movu m3, [srcq+r5*2]
+ mov r0, [stk+16]
+ mov rX, [stk+20]
+ mov r4, [stk+24]
+ mov r5, [stk+28]
+ pmaddwd m0, [stk+%1+0x00]
+ pmaddwd m1, [stk+%1+0x10]
+ pmaddwd m2, [stk+%1+0x20]
+ pmaddwd m3, [stk+%1+0x30]
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m4, [srcq+r0*2]
+ movu m5, [srcq+rX*2]
+ movu m6, [srcq+r4*2]
+ movu m7, [srcq+r5*2]
+ add srcq, ssq
+ pmaddwd m4, [stk+%1+0xa0]
+ pmaddwd m5, [stk+%1+0xb0]
+ pmaddwd m6, [stk+%1+0xc0]
+ pmaddwd m7, [stk+%1+0xd0]
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m0, m2
+ phaddd m4, m6
+ paddd m0, hround
+ paddd m4, hround
+ psrad m0, m12
+ psrad m4, m12
+ packssdw m0, m4
+ %if %2 != 0
+ mova [stk+%2], m0
+ %endif
+ %endmacro
+%endif
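+; the scaled horizontal filter is a gather: per-column source offsets
+; (precomputed from dx) select one 8-tap window per output pixel, each
+; window is multiplied by its own coefficients and reduced with phaddd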
+
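+; MC_8TAP_SCALED emits both put_8tap_scaled and prep_8tap_scaled; the
+; large stack frame (stk) holds spilled filter coefficients and
+; per-column state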
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isput 1
+ %assign isprep 0
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %endif
+ %xdefine base_reg r12
+%else ; prep
+ %assign isput 0
+ %assign isprep 1
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %define tmp_stridem qword [stk+0x138]
+ %endif
+ %xdefine base_reg r11
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %define tmp_stridem dword [stk+0x138]
+ %endif
+%endif
+%if ARCH_X86_32
+ mov [esp+0x1f0], t0d
+ mov [esp+0x1f4], t1d
+ %if isput && required_stack_alignment > STACK_ALIGNMENT
+ mov dstd, dstm
+ mov dsd, dsm
+ mov srcd, srcm
+ mov ssd, ssm
+ mov hd, hm
+ mov r4, mxm
+ %define r0m [esp+0x200]
+ %define dsm [esp+0x204]
+ %define dsmp dsm
+ %define r1m dsm
+ %define r2m [esp+0x208]
+ %define ssm [esp+0x20c]
+ %define r3m ssm
+ %define hm [esp+0x210]
+ %define mxm [esp+0x214]
+ mov r0m, dstd
+ mov dsm, dsd
+ mov r2m, srcd
+ mov ssm, ssd
+ mov hm, hd
+ mov r0, mym
+ mov r1, dxm
+ mov r2, dym
+ %define mym [esp+0x218]
+ %define dxm [esp+0x21c]
+ %define dym [esp+0x220]
+ mov mxm, r4
+ mov mym, r0
+ mov dxm, r1
+ mov dym, r2
+ tzcnt wd, wm
+ %endif
+ %if isput
+ mov r3, pxmaxm
+ %define pxmaxm r3
+ %else
+ mov r2, pxmaxm
+ %endif
+ %if isprep && required_stack_alignment > STACK_ALIGNMENT
+ %xdefine base_reg r5
+ %else
+ %xdefine base_reg r6
+ %endif
+%endif
+ LEA base_reg, %1_8tap_scaled_16bpc_ssse3
+%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
+%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
+ tzcnt wd, wm
+%endif
+%if ARCH_X86_64
+ %if isput
+ mov r7d, pxmaxm
+ %endif
+%else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+%endif
+ movd m8, dxm
+ movd m14, mxm
+%if isput
+ movd m15, pxmaxm
+%endif
+ pshufd m8, m8, q0000
+ pshufd m14, m14, q0000
+%if isput
+ pshuflw m15, m15, q0000
+ punpcklqdq m15, m15
+%endif
+%if isprep
+ %if UNIX64
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+ %endif
+ %if ARCH_X86_64
+ mov r6d, pxmaxm
+ %endif
+%endif
+%if ARCH_X86_64
+ mov dyd, dym
+%endif
+%if isput
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %elif ARCH_X86_64
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %else
+ %endif
+ %if ARCH_X86_64
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+0x138]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+ %else
+ %define rX r1
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %elif ARCH_X86_64
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %xdefine hm r7m
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %if ARCH_X86_64
+ %define rX r14
+ %define rXd r14d
+ %else
+ %define rX r3
+ %endif
+%endif
+%if ARCH_X86_64
+ shr r7d, 11
+ mova m10, [base+pd_0x3ff]
+ movddup m11, [base+s_8tap_h_rnd+r7*8]
+ movd m12, [base+s_8tap_h_sh+r7*4]
+ %if isput
+ movddup m13, [base+put_s_8tap_v_rnd+r7*8]
+ movd m7, [base+put_s_8tap_v_sh+r7*4]
+ %define pxmaxm [rsp]
+ mova pxmaxm, m15
+ punpcklqdq m12, m7
+ %endif
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ shr r3, 11
+ movddup m1, [base+s_8tap_h_rnd+r3*8]
+ movd m2, [base+s_8tap_h_sh+r3*4]
+ %if isput
+ %define m13 [esp+0x20]
+ %define pxmaxm [esp+0x30]
+ %define stk esp+0x40
+ movddup m5, [base+put_s_8tap_v_rnd+r3*8]
+ movd m6, [base+put_s_8tap_v_sh+r3*4]
+ mova pxmaxm, m15
+ punpcklqdq m2, m6
+ mova m13, m5
+ %else
+ %define m13 [base+pd_m524256]
+ %endif
+ mov ssd, ssm
+ mova m11, m1
+ mova m12, m2
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ mov r1, [esp+0x1f4]
+ lea r0, [ssd*3]
+ movzx r2, r1b
+ shr r1, 16
+ cmp dword hm, 6
+ cmovs r1, r2
+ mov [esp+0x1f4], r1
+ %if isprep
+ mov r1, r1m
+ %endif
+ mov r2, r2m
+ sub srcq, r0
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define ss3q r0
+ %define myd r4
+ %define dyd dword dym
+ %define hd dword hm
+%endif
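+; dy == 1024/2048 is a vertical step of exactly 1/2 source rows per
+; output row; those get the specialized .dy1/.dy2 paths, everything else
+; goes through the generic per-width code below.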
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ movzx r4, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r4
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6, m7
+ REPX {pmaddwd x, m15}, m4, m5, m6, m7
+ phaddd m0, m1
+ phaddd m2, m3
+ phaddd m4, m5
+ phaddd m6, m7
+ REPX {paddd x, m11}, m0, m2, m4, m6
+ REPX {psrad x, m12}, m0, m2, m4, m6
+ packssdw m0, m2 ; 0 1 2 3
+ packssdw m4, m6 ; 4 5 6 7
+ SWAP m1, m4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m1, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m14}, m1, m7, m6, m3
+ REPX {pmaddwd x, m15}, m1, m7, m6, m3
+ phaddd m1, m7
+ phaddd m6, m3
+ REPX {paddd x, m11}, m0, m2, m1, m6
+ REPX {psrad x, m12}, m0, m2, m1, m6
+ packssdw m0, m2
+ packssdw m1, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ %endif
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mov myd, mym
+ mov r0, r0m
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
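+; rows are kept as interleaved pairs (01 12 / 23 34 / 45 56 / 67 __), so
+; each output row is four pmaddwd against the four coefficient pairs of
+; the per-row vertical filter phase.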
+.w2_loop:
+ and myd, 0x3ff
+ %if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m10, r6q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pmaddwd m5, m3, m7
+ pmaddwd m6, m0, m8
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddwd m7, m2, m9
+ pmaddwd m8, m4, m10
+ paddd m5, m6
+ paddd m7, m8
+ %else
+ mov r1, [esp+0x1f4]
+ xor r3, r3
+ mov r5, myd
+ shr r5, 6
+ lea r1, [r1+r5]
+ mov r5, 64 << 24
+ cmovnz r3, [base+subpel_filters+r1*8+4]
+ cmovnz r5, [base+subpel_filters+r1*8+0]
+ movd m6, r3
+ movd m7, r5
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m5, m7, q0000
+ pshufd m6, m7, q1111
+ pmaddwd m3, m5
+ pmaddwd m0, m6
+ pshufd m5, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m2, m5
+ pmaddwd m4, m7
+ paddd m3, m0
+ paddd m2, m4
+ SWAP m5, m3
+ SWAP m7, m2
+ %define m8 m3
+ %endif
+ paddd m5, m13
+ pshufd m6, m12, q1032
+ pxor m8, m8
+ paddd m5, m7
+ psrad m5, m6
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, pxmaxm
+ movd [dstq], m5
+ add dstq, dsmp
+ dec hd
+ jz .ret
+ %if ARCH_X86_64
+ add myd, dyd
+ %else
+ add myd, dym
+ %endif
+ test myd, ~0x3ff
+ %if ARCH_X86_32
+ SWAP m3, m5
+ SWAP m2, m7
+ mova m3, [stk+0x20]
+ mova m0, [stk+0x30]
+ mova m2, [stk+0x40]
+ mova m4, [stk+0x50]
+ %endif
+ jz .w2_loop
+ %if ARCH_X86_32
+ mov r3, r3m
+ %endif
+ movu m5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps m3, m0, q1032 ; 01 12
+ shufps m0, m2, q1032 ; 23 34
+ shufps m2, m4, q1032 ; 45 56
+ pshufb m5, m14
+ pmaddwd m5, m15
+ phaddd m5, m5
+ paddd m5, m11
+ psrad m5, m12
+ packssdw m5, m5
+ palignr m4, m5, m1, 12
+ punpcklqdq m1, m4, m4 ; 6 7 6 7
+ punpcklwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+ jmp .w2_loop
+.w2_skip_line:
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m3, m0 ; 01 12
+ mova m0, m2 ; 23 34
+ pshufb m5, m14
+ pshufb m6, m14
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ phaddd m5, m6
+ paddd m5, m11
+ psrad m5, m12
+ packssdw m5, m5 ; 6 7 6 7
+ punpckhqdq m1, m5 ; 4 5 6 7
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+ jmp .w2_loop
+%endif
+INIT_XMM ssse3
+.w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ movzx r4, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%else
+ %define m9 [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd rX, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ movifprep r3, r3m
+ SWAP m4, m7
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x20], m13
+ mova [stk+0x30], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ movu m1, [srcq+r4 ]
+ movu m3, [srcq+r6 ]
+ movu m2, [srcq+r11 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m13}, m7, m9, m8, m10
+ REPX {pshufb x, m14}, m1, m2, m3, m4
+ REPX {pmaddwd x, m15}, m1, m2, m3, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m4, [srcq+ss3q ]
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1
+ packssdw m8, m10 ; 2 3
+ movu m0, [srcq+r4 ]
+ movu m9, [srcq+r6 ]
+ movu m10, [srcq+r11 ]
+ movu m11, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m1, m2, m3, m4
+ REPX {pmaddwd x, m13}, m1, m2, m3, m4
+ REPX {pshufb x, m14}, m0, m9, m10, m11
+ REPX {pmaddwd x, m15}, m0, m9, m10, m11
+ phaddd m1, m0
+ phaddd m2, m9
+ phaddd m3, m10
+ phaddd m4, m11
+ REPX {paddd x, m5}, m1, m2, m3, m4
+ REPX {psrad x, xm6}, m1, m2, m3, m4
+ packssdw m1, m2 ; 4 5
+ packssdw m3, m4 ; 6 7
+ SWAP m9, m1
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ pshufd m10, m3, q1032 ; 7 _
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ punpcklwd m3, m10 ; 67
+ mova [rsp+0x40], m7
+ mova [rsp+0x50], m8
+ mova [rsp+0x60], m9
+%else
+ mova [stk+0x00], m12
+ mova [stk+0x10], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x40 ; 0 1
+ MC_4TAP_SCALED_H 0x50 ; 2 3
+ MC_4TAP_SCALED_H 0x60 ; 4 5
+ MC_4TAP_SCALED_H 0x70 ; 6 7
+ mova m4, [stk+0x40]
+ mova m5, [stk+0x50]
+ mova m6, [stk+0x60]
+ mova m7, [stk+0x70]
+ mov [stk+0xc0], r4
+ shufps m1, m4, m5, q1032 ; 1 2
+ shufps m2, m5, m6, q1032 ; 3 4
+ shufps m3, m6, m7, q1032 ; 5 6
+ pshufd m0, m7, q1032 ; 7 _
+ mova [stk+0xb0], m0
+ punpcklwd m0, m4, m1 ; 01
+ punpckhwd m4, m1 ; 12
+ punpcklwd m1, m5, m2 ; 23
+ punpckhwd m5, m2 ; 34
+ punpcklwd m2, m6, m3 ; 45
+ punpckhwd m6, m3 ; 56
+ punpcklwd m3, m7, [stk+0xb0] ; 67
+ mov myd, mym
+ mov r0, r0m
+ mova [stk+0x40], m0 ; 01
+ mova [stk+0x50], m1 ; 23
+ mova [stk+0x60], m2 ; 45
+ mova [stk+0x70], m3 ; 67
+ mova [stk+0x80], m4 ; 12
+ mova [stk+0x90], m5 ; 34
+ mova [stk+0xa0], m6 ; 56
+ %define m12 [stk+0x00]
+ %define m14 [stk+0x10]
+ %define m13 [stk+0x20]
+ %define m15 [stk+0x30]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%endif
+.w4_loop:
+ and myd, 0x3ff
+%if ARCH_X86_64
+ mov r11d, 64 << 24
+ mov r13d, myd
+ shr r13d, 6
+ lea r13d, [t1+r13]
+ cmovnz r11q, [base+subpel_filters+r13*8]
+ movq m9, r11q
+ punpcklbw m9, m9
+ psraw m9, 8
+ pshufd m7, m9, q0000
+ pshufd m8, m9, q1111
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pshufd m7, m9, q2222
+ pshufd m9, m9, q3333
+ pmaddwd m6, m2, m7
+ pmaddwd m8, m3, m9
+ %if isput
+ movd m9, [rsp+0x28]
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ paddd m4, m5
+ paddd m6, m8
+ paddd m4, m6
+ paddd m4, vrnd_mem
+%else
+ mov mym, myd
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ %if isput
+ movd m4, [esp+0x18]
+ %endif
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, vrnd_mem
+ paddd m0, m2
+ SWAP m4, m0
+ %define m9 m0
+%endif
+%if isput
+ pxor m5, m5
+ psrad m4, m9
+ packssdw m4, m4
+ pmaxsw m4, m5
+ pminsw m4, pxmaxm
+ movq [dstq], m4
+ add dstq, dsmp
+%else
+ psrad m4, 6
+ packssdw m4, m4
+ movq [tmpq], m4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ mova m8, [rsp+0x10]
+ movd m9, [rsp+0x20]
+ movu m4, [srcq]
+ movu m5, [srcq+r4]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova m0, [rsp+0x40]
+ mova [rsp+0x40], m1
+ mova m1, [rsp+0x50]
+ mova [rsp+0x50], m2
+ mova m2, [rsp+0x60]
+ mova [rsp+0x60], m3
+ pshufb m4, m12
+ pshufb m5, m14
+ pmaddwd m4, m13
+ pmaddwd m5, m15
+ phaddd m4, m5
+ paddd m4, m8
+ psrad m4, m9
+ packssdw m4, m4
+ punpcklwd m3, m10, m4
+ mova m10, m4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m6, [srcq+ssq*1]
+ movu m7, [srcq+r6]
+ mova m0, [rsp+0x50]
+ mova m11, [rsp+0x60]
+ pshufb m4, m12
+ pshufb m6, m12
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddwd m4, m13
+ pmaddwd m6, m13
+ pmaddwd m5, m15
+ pmaddwd m7, m15
+ mova [rsp+0x40], m0
+ mova [rsp+0x50], m11
+ phaddd m4, m5
+ phaddd m6, m7
+ paddd m4, m8
+ paddd m6, m8
+ psrad m4, m9
+ psrad m6, m9
+ packssdw m4, m6
+ punpcklwd m9, m10, m4
+ mova [rsp+0x60], m9
+ pshufd m10, m4, q1032
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ punpcklwd m3, m4, m10
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+%else
+ SWAP m0, m4
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ jnz .w4_next_line
+ mova m0, [stk+0x40]
+ mova m1, [stk+0x50]
+ mova m2, [stk+0x60]
+ mova m3, [stk+0x70]
+ jmp .w4_loop
+.w4_next_line:
+ mov r5, [stk+0xc0]
+ movu m4, [srcq]
+ movu m5, [r5]
+ test myd, 0x400
+ jz .w4_skip_line
+ add [stk+0xc0], ssq
+ mova m0, [stk+0x80]
+ mova m3, [stk+0x50]
+ mova [stk+0x40], m0
+ mova [stk+0x80], m3
+ mova m1, [stk+0x90]
+ mova m6, [stk+0x60]
+ mova [stk+0x50], m1
+ mova [stk+0x90], m6
+ mova m2, [stk+0xa0]
+ mova m7, [stk+0x70]
+ mova [stk+0x60], m2
+ mova [stk+0xa0], m7
+ pshufb m4, m12
+ pshufb m5, m14
+ pmaddwd m4, m13
+ pmaddwd m5, m15
+ phaddd m4, m5
+ paddd m4, hrnd_mem
+ psrad m4, hsh_mem
+ packssdw m4, m4
+ punpcklwd m3, [stk+0xb0], m4
+ mova [stk+0xb0], m4
+ mova [stk+0x70], m3
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m6, [srcq+ssq*1]
+ movu m7, [r5 +ssq*1]
+ lea r5, [r5 +ssq*2]
+ mov [stk+0xc0], r5
+ mova m0, [stk+0x50]
+ mova m1, [stk+0x60]
+ mova m2, [stk+0x70]
+ mova m3, [stk+0x90]
+ pshufb m4, m12
+ pshufb m6, m12
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddwd m4, m13
+ pmaddwd m6, m13
+ pmaddwd m5, m15
+ pmaddwd m7, m15
+ mova [stk+0x40], m0
+ mova [stk+0x50], m1
+ mova [stk+0x60], m2
+ mova [stk+0x80], m3
+ phaddd m4, m5
+ phaddd m6, m7
+ mova m5, [stk+0xa0]
+ mova m7, [stk+0xb0]
+ paddd m4, hrnd_mem
+ paddd m6, hrnd_mem
+ psrad m4, hsh_mem
+ psrad m6, hsh_mem
+ packssdw m4, m6
+ punpcklwd m7, m4
+ pshufd m6, m4, q1032
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m7
+ mova [stk+0xb0], m6
+ punpcklwd m3, m4, m6
+ mova [stk+0x70], m3
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+%endif
+INIT_XMM ssse3
+%if ARCH_X86_64
+ %define stk rsp+0x20
+%endif
+.w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
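+; w >= 8 shares one code path: [stk+0xf0] holds the number of 8-column
+; tiles (w/8) and, for prep, tmp_stridem the output row stride in bytes;
+; .hloop processes one tile at a time.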
+.w_start:
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ %define hround m11
+ shr t0d, 16
+ movd m15, t0d
+ %if isprep
+ mova m13, [base+pd_m524256]
+ %endif
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r4, [esp+0x1f0]
+ shr r4, 16
+ movd m15, r4
+ mov r0, r0m
+ mov myd, mym
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ mov r5, hm
+ mov [stk+0x0f4], myd
+ mov [stk+0x134], r5
+%endif
+ jmp .hloop
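+; advance to the next 8-column tile: step the dst/tmp pointer by 16
+; bytes, reload the per-tile state saved above and add dx*4 to the stored
+; mx+dx*[4-7] to get the new tile's mx+dx*[0-3].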
+.hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov myd, [stk+0x0f4]
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov mym, myd
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
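+; the per-column filter coefficients, sign-extended from bytes to words,
+; are now cached at [stk+0x10..0x80] for the horizontal passes below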
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ mov myd, mym
+ mov dyd, dym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m5, m11, q0000
+ pshufd m7, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m4, m5, m0
+ pmaddwd m5, m5, m1
+ pmaddwd m6, m7, m2
+ pmaddwd m7, m7, m3
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m10
+ pmaddwd m7, [stk+0xa0], m10
+ pmaddwd m8, [stk+0xb0], m11
+ pmaddwd m9, [stk+0xc0], m11
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov myd, mym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.vloop:
+ mov r0, r0m
+ mov r5, [esp+0x1f4]
+ and myd, 0x3ff
+ mov mym, myd
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [stk+0x140], myd
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ jz .skip_line
+ mova m14, [base+unpckw]
+ movu m8, [srcq+r10*2]
+ movu m9, [srcq+r11*2]
+ movu m10, [srcq+r13*2]
+ movu m11, [srcq+ rX*2]
+ movu m4, [srcq+ r4*2]
+ movu m5, [srcq+ r6*2]
+ movu m6, [srcq+ r7*2]
+ movu m7, [srcq+ r9*2]
+ add srcq, ssq
+ mov myd, [stk+0x140]
+ mov dyd, dym
+ pshufd m15, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m15 ; 3a 2a
+ pshufb m3, m15 ; 3b 2b
+ pmaddwd m8, [stk+0x50]
+ pmaddwd m9, [stk+0x60]
+ pmaddwd m10, [stk+0x70]
+ pmaddwd m11, [stk+0x80]
+ pmaddwd m4, [stk+0x10]
+ pmaddwd m5, [stk+0x20]
+ pmaddwd m6, [stk+0x30]
+ pmaddwd m7, [stk+0x40]
+ phaddd m8, m9
+ phaddd m10, m11
+ mova m11, hround
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m8, m10
+ phaddd m4, m6
+ paddd m4, m11
+ paddd m8, m11
+ psrad m4, m12
+ psrad m8, m12
+ packssdw m4, m8
+ pshufb m5, [stk+0x90], m14 ; 4a 5a
+ pshufb m6, [stk+0xa0], m14 ; 4b 5b
+ pshufb m7, [stk+0xb0], m15 ; 7a 6a
+ pshufb m8, [stk+0xc0], m15 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ punpckhwd m5, m7 ; 56a
+ punpckhwd m6, m8 ; 56b
+ punpcklwd m7, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m8, m4 ; 78b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m8
+ jmp .vloop
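+; my advanced by two source rows: filter both new rows and rebuild the
+; interleaved row pairs with the window shifted by two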
+.skip_line:
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
+ MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
+ mov myd, [stk+0x140]
+ mov dyd, dym
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [stk+0x90] ; 23a
+ mova m3, [stk+0xa0] ; 23b
+ mova m5, [stk+0xb0] ; 45a
+ mova m6, [stk+0xc0] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m4
+%else
+ mov r0m, r0
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ mov mym, myd
+ jnz .next_line
+ mova m0, [stk+0x20]
+ mova m1, [stk+0x30]
+ mova m2, [stk+0x40]
+ mova m3, [stk+0x50]
+ jmp .vloop
+.next_line:
+ test myd, 0x400
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ jz .skip_line
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ mova m7, [base+unpckw]
+ pshufd m4, m7, q1032
+ pshufb m0, [stk+0x20], m7 ; 0a 1a
+ pshufb m1, [stk+0x30], m7 ; 0b 1b
+ pshufb m2, [stk+0x40], m4 ; 3a 2a
+ pshufb m3, [stk+0x50], m4 ; 3b 2b
+ pshufb m5, [stk+0x60], m7 ; 4a 5a
+ pshufb m6, [stk+0x70], m7 ; 4b 5b
+ pshufb m7, [stk+0x80], m4 ; 7a 6a
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ punpckhwd m5, m7 ; 56a
+ mova [stk+0x60], m5
+ pshufb m5, [stk+0x90], m4 ; 7b 6b
+ punpcklwd m7, [stk+0xe0] ; 78a
+ punpckhwd m6, m5 ; 56b
+ mova [stk+0x70], m6
+ movq m6, [stk+0xe8]
+ mova [stk+0x80], m7
+ punpcklwd m5, m6
+ mov myd, mym
+ mova [stk+0x90], m5
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ MC_8TAP_SCALED_H 0xa0, 0 ; 9
+ mova m7, [stk+0xe0]
+ mova m2, [stk+0x60] ; 23a
+ mova m3, [stk+0x70] ; 23b
+ mova m4, [stk+0x80] ; 45a
+ mova m5, [stk+0x90] ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova m0, [stk+0x40] ; 01a
+ mova m1, [stk+0x50] ; 01b
+ mov myd, mym
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+%endif
+ jmp .vloop
+INIT_XMM ssse3
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
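+; dy1: the vertical step is exactly one row, so the my filter phase is
+; constant; its coefficients are loaded once outside the row loops and
+; each iteration only filters the new source rows entering the window.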
+%if isput
+.dy1_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m13 [esp+0x20]
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+ mov r1, r1m
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ mov [stk+0x20], r3
+ mov r3, r3m
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6
+ REPX {pmaddwd x, m15}, m4, m5, m6
+ phaddd m0, m1
+ phaddd m2, m3
+ phaddd m4, m5
+ phaddd m6, m6
+ REPX {paddd x, m11}, m0, m2, m4, m6
+ REPX {psrad x, m12}, m0, m2, m4, m6
+ packssdw m0, m2 ; 0 1 2 3
+ packssdw m4, m6 ; 4 5 6
+ SWAP m1, m4
+ movq m10, r4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m1, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ REPX {pshufb x, m14}, m1, m7, m6
+ REPX {pmaddwd x, m15}, m1, m7, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ phaddd m1, m7
+ phaddd m6, m6
+ REPX {paddd x, m11}, m0, m2, m1, m6
+ REPX {psrad x, m12}, m0, m2, m1, m6
+ packssdw m0, m2
+ packssdw m1, m6
+ %define m8 m6
+ %define m9 m4
+ %define m10 m5
+ movd m10, r4
+ movd m9, [stk+0x20]
+ punpckldq m10, m9
+ %endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ %if ARCH_X86_32
+ mova [stk+0x50], m7
+ mova [stk+0x60], m8
+ mova [stk+0x70], m9
+ mova [stk+0x80], m10
+ %define m7 [stk+0x50]
+ %define m8 [stk+0x60]
+ %define m9 [stk+0x70]
+ %define m10 [stk+0x80]
+ %endif
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m4, m1, q2121 ; 5 6 5 6
+ punpcklwd m2, m1, m4 ; 45 56
+ %if ARCH_X86_32
+ mov r0, r0m
+ %endif
+.dy1_w2_loop:
+ movu m1, [srcq+ssq*0]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m3, m7
+ mova m3, m0
+ pmaddwd m0, m8
+ pshufb m1, m14
+ pshufb m6, m14
+ pmaddwd m1, m15
+ pmaddwd m6, m15
+ phaddd m1, m6
+ paddd m1, m11
+ psrad m1, m12
+ packssdw m1, m1
+ paddd m5, m0
+ mova m0, m2
+ pmaddwd m2, m9
+ paddd m5, m2
+ palignr m2, m1, m4, 12
+ punpcklwd m2, m1 ; 67 78
+ pmaddwd m4, m2, m10
+ paddd m5, m13
+ paddd m5, m4
+ pxor m6, m6
+ mova m4, m1
+ pshufd m1, m12, q1032
+ psrad m5, m1
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pminsw m5, pxmaxm
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q1032
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy1_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %define vrnd_mem [rsp+0x30]
+ %define stk rsp+0x40
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %define stk rsp+0x30
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m9 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd rX, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ SWAP m4, m7
+ %if isprep
+ mov r3, r3m
+ %endif
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x40], m13
+ mova [stk+0x50], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ movu m1, [srcq+r4 ]
+ movu m3, [srcq+r6 ]
+ movu m2, [srcq+r11 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m13}, m7, m9, m8, m10
+ REPX {pshufb x, m14}, m1, m3, m2, m4
+ REPX {pmaddwd x, m15}, m1, m3, m2, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1
+ packssdw m8, m10 ; 2 3
+ movu m0, [srcq+r4 ]
+ movu m9, [srcq+r6 ]
+ movu m10, [srcq+r11 ]
+ add srcq, ss3q
+ REPX {pshufb x, m12}, m1, m2, m3
+ REPX {pmaddwd x, m13}, m1, m2, m3
+ REPX {pshufb x, m14}, m0, m9, m10
+ REPX {pmaddwd x, m15}, m0, m9, m10
+ phaddd m1, m0
+ phaddd m2, m9
+ phaddd m3, m10
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ REPX {paddd x, m5}, m1, m2, m3
+ REPX {psrad x, xm6}, m1, m2, m3
+ packssdw m1, m2 ; 4 5
+ packssdw m3, m3 ; 6 6
+ SWAP m9, m1
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ movq m10, r13
+ mova [stk+0x00], m1
+ mova [stk+0x10], m8
+ mova [stk+0x20], m2
+ mova [stk+0x30], m9
+ mova [stk+0x40], m3
+ %define hrnd_mem [rsp+0x10]
+ %define hsh_mem [rsp+0x20]
+ %define vsh_mem [rsp+0x28]
+ %if isput
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%else
+ mova [stk+0x20], m12
+ mova [stk+0x30], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x60 ; 0 1
+ MC_4TAP_SCALED_H 0x70 ; 2 3
+ MC_4TAP_SCALED_H 0x80 ; 4 5
+ movu m7, [srcq]
+ movu m2, [r4]
+ add srcq, ssq
+ add r4, ssq
+ mov [stk+0xb0], r4
+ pshufb m7, m12
+ pshufb m2, m14
+ pmaddwd m7, m13
+ pmaddwd m2, m15
+ phaddd m7, m2
+ paddd m7, [esp+0x00]
+ psrad m7, [esp+0x10]
+ packssdw m7, m7 ; 6 6
+ mova m4, [stk+0x60]
+ mova m5, [stk+0x70]
+ mova m6, [stk+0x80]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ shufps m1, m4, m5, q1032 ; 1 2
+ shufps m2, m5, m6, q1032 ; 3 4
+ shufps m3, m6, m7, q1032 ; 5 6
+ mova [stk+0xa0], m7
+ punpcklwd m0, m4, m1 ; 01
+ punpckhwd m4, m1 ; 12
+ punpcklwd m1, m5, m2 ; 23
+ punpckhwd m5, m2 ; 34
+ punpcklwd m2, m6, m3 ; 45
+ punpckhwd m6, m3 ; 56
+ movd m7, r4
+ movd m3, r5
+ mov r0, r0m
+ %if isput
+ mov r1, r1m
+ %endif
+ mov r4, [stk+0xb0]
+ mova [stk+0xc0], m4 ; 12
+ mova [stk+0x60], m1 ; 23
+ mova [stk+0x70], m2 ; 45
+ mova [stk+0x80], m5 ; 34
+ mova [stk+0x90], m6 ; 56
+ %define m12 [stk+0x20]
+ %define m14 [stk+0x30]
+ %define m13 [stk+0x40]
+ %define m15 [stk+0x50]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %define vsh_mem [esp+0x18]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ %define m10 m7
+ punpckldq m10, m3
+%endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m3, m10, q0000
+ pshufd m4, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_32
+ %xdefine m8 m3
+ %xdefine m9 m6
+ %xdefine m11 m5
+ %xdefine m6 m4
+ mova [stk+0x100], m3
+ mova [stk+0x110], m4
+ mova [stk+0x120], m5
+ mova [stk+0x130], m10
+ %define m3 [stk+0x100]
+ %define m4 [stk+0x110]
+ %define m5 [stk+0x120]
+ %define m10 [stk+0x130]
+ mova m7, [stk+0xc0]
+ mova m8, [stk+0x80]
+%endif
+.dy1_w4_loop:
+ movu m11, [srcq+ssq*0]
+ movu m6, [srcq+ssq*1]
+ pmaddwd m0, m3
+ pmaddwd m7, m3
+ pmaddwd m1, m4
+ pmaddwd m8, m4
+ pmaddwd m2, m5
+ pmaddwd m9, m5
+ paddd m1, m0
+ paddd m8, m7
+%if ARCH_X86_64
+ movu m0, [srcq+r4]
+ movu m7, [srcq+r6]
+%else
+ movu m0, [r4+ssq*0]
+ movu m7, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+%endif
+ lea srcq, [srcq+ssq*2]
+ paddd m1, m2
+ paddd m8, m9
+ pshufb m11, m12
+ pshufb m6, m12
+ pmaddwd m11, m13
+ pmaddwd m6, m13
+ pshufb m0, m14
+ pshufb m7, m14
+ pmaddwd m0, m15
+ pmaddwd m7, m15
+ phaddd m11, m0
+ phaddd m6, m7
+ paddd m11, hrnd_mem
+ paddd m6, hrnd_mem
+ psrad m11, hsh_mem
+ psrad m6, hsh_mem
+ packssdw m11, m6 ; 7 8
+%if ARCH_X86_64
+ shufps m9, [stk+0x40], m11, q1032 ; 6 7
+ mova m0, [stk+0x00]
+ mova [stk+0x40], m11
+%else
+ shufps m9, [stk+0xa0], m11, q1032 ; 6 7
+ mova m0, [stk+0x60]
+ mova [stk+0xa0], m11
+%endif
+ punpcklwd m2, m9, m11 ; 67
+ punpckhwd m9, m11 ; 78
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m9, m10
+%if isput
+ movd m11, vsh_mem
+%endif
+ paddd m1, vrnd_mem
+ paddd m8, vrnd_mem
+ paddd m1, m6
+ paddd m8, m7
+%if ARCH_X86_64
+ mova m7, [stk+0x10]
+%else
+ mova m7, [stk+0x80]
+%endif
+%if isput
+ psrad m1, m11
+ psrad m8, m11
+%else
+ psrad m1, 6
+ psrad m8, 6
+%endif
+ packssdw m1, m8
+%if ARCH_X86_64
+ mova m8, [stk+0x30]
+%else
+ mova m8, [stk+0x90]
+%endif
+%if isput
+ pxor m6, m6
+ pmaxsw m1, m6
+ pminsw m1, pxmaxm
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m1
+ add tmpq, 16
+%endif
+%if ARCH_X86_64
+ mova m1, [stk+0x20]
+ mova [stk+0x10], m8
+ mova [stk+0x00], m1
+ mova [stk+0x20], m2
+ mova [stk+0x30], m9
+%else
+ mova m1, [stk+0x70]
+ mova [stk+0x80], m8
+ mova [stk+0x60], m1
+ mova [stk+0x70], m2
+ mova [stk+0x90], m9
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET ; TODO: why not just jz .ret?
+INIT_XMM ssse3
+.dy1_w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
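+; same 8-column tiling as .w_start; the vertical filter coefficients are
+; computed once here since the my phase never changes on the dy1 path.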
+.dy1_w_start:
+ mov myd, mym
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %define hround m11
+ %if isput
+ mova [rsp+0x30], m13
+ %else
+ mova m13, [base+pd_m524256]
+ %endif
+ shr t0d, 16
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ SWAP m3, m5
+%endif
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+%if ARCH_X86_64
+ mova [stk+0x140], m0
+ mova [stk+0x150], m1
+ mova [stk+0x160], m2
+ mova [stk+0x170], m3
+ %if UNIX64
+ mov hm, hd
+ %endif
+%else
+ mova [stk+0x180], m0
+ mova [stk+0x190], m1
+ mova [stk+0x1a0], m2
+ mova [stk+0x1b0], m3
+ SWAP m5, m3
+ mov r5, hm
+ mov [stk+0x134], r5
+%endif
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy1_hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova m10, [stk+0x140]
+ mova m11, [stk+0x150]
+ mova m14, [stk+0x160]
+ mova m15, [stk+0x170]
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+ %define shift [rsp+0x20]
+ %if isput
+ %define vround [rsp+0x30]
+ %else
+ %define vround [base+pd_m524256]
+ %endif
+.dy1_vloop:
+ pmaddwd m4, m0, m10
+ pmaddwd m5, m1, m10
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m11
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m14
+ pmaddwd m7, [stk+0xa0], m14
+ pmaddwd m8, [stk+0xb0], m15
+ pmaddwd m9, [stk+0xc0], m15
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov r0, r0m
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.dy1_vloop:
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+%if ARCH_X86_64
+ movu m8, [srcq+r10*2]
+ movu m9, [srcq+r11*2]
+ movu m12, [srcq+r13*2]
+ movu m13, [srcq+ rX*2]
+ movu m4, [srcq+ r4*2]
+ movu m5, [srcq+ r6*2]
+ movu m6, [srcq+ r7*2]
+ movu m7, [srcq+ r9*2]
+ add srcq, ssq
+ pmaddwd m8, [stk+0x50]
+ pmaddwd m9, [stk+0x60]
+ pmaddwd m12, [stk+0x70]
+ pmaddwd m13, [stk+0x80]
+ pmaddwd m4, [stk+0x10]
+ pmaddwd m5, [stk+0x20]
+ pmaddwd m6, [stk+0x30]
+ pmaddwd m7, [stk+0x40]
+ phaddd m8, m9
+ phaddd m12, m13
+ mova m9, [base+unpckw]
+ mova m13, hround
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m8, m12
+ phaddd m4, m6
+ pshufd m5, m9, q1032
+ pshufb m0, m9 ; 0a 1a
+ pshufb m1, m9 ; 0b 1b
+ pshufb m2, m5 ; 3a 2a
+ pshufb m3, m5 ; 3b 2b
+ mova m12, shift
+ paddd m4, m13
+ paddd m8, m13
+ psrad m4, m12
+ psrad m8, m12
+ packssdw m4, m8
+ pshufb m6, [stk+0x90], m9 ; 4a 5a
+ pshufb m7, [stk+0xa0], m9 ; 4b 5b
+ pshufb m8, [stk+0xb0], m5 ; 7a 6a
+ pshufb m13, [stk+0xc0], m5 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m6 ; 34a
+ punpcklwd m3, m7 ; 34b
+ punpckhwd m6, m8 ; 56a
+ punpckhwd m7, m13 ; 56b
+ punpcklwd m8, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m13, m4 ; 78b
+ mova [stk+0x90], m6
+ mova [stk+0xa0], m7
+ mova [stk+0xb0], m8
+ mova [stk+0xc0], m13
+ mova m13, vround
+%else
+ mov r0m, r0
+ mov r3, r3m
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ mova m7, [base+unpckw]
+ pshufd m4, m7, q1032
+ pshufb m0, [stk+0x20], m7 ; 0a 1a
+ pshufb m1, [stk+0x30], m7 ; 0b 1b
+ pshufb m2, [stk+0x40], m4 ; 3a 2a
+ pshufb m3, [stk+0x50], m4 ; 3b 2b
+ pshufb m5, [stk+0x60], m7 ; 4a 5a
+ pshufb m6, [stk+0x70], m7 ; 4b 5b
+ pshufb m7, [stk+0x80], m4 ; 7a 6a
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ punpckhwd m5, m7 ; 56a
+ mova [stk+0x60], m5
+ pshufb m5, [stk+0x90], m4 ; 7b 6b
+ punpcklwd m7, [stk+0xe0] ; 78a
+ mova m4, [stk+0x180]
+ punpckhwd m6, m5 ; 56b
+ mova [stk+0x70], m6
+ movq m6, [stk+0xe8]
+ mova [stk+0x80], m7
+ mova m7, [stk+0x1b0]
+ punpcklwd m5, m6
+ mova m6, [stk+0x1a0]
+ mova [stk+0x90], m5
+ mova m5, [stk+0x190]
+ mov r0, r0m
+%endif
+ jmp .dy1_vloop
+INIT_XMM ssse3
+%if ARCH_X86_64
+ %define stk rsp+0x20
+%endif
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
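+; dy2: the vertical step is exactly two rows, so the my filter phase is
+; again constant and the 8-row window slides by two source rows per
+; output row.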
+%if isput
+.dy2_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m13
+ %define vrnd_mem [rsp+0x10]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ %define vrnd_mem [esp+0x20]
+ mov r1, r1m
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*2]
+ movu m2, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*1]
+ movu m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ mov [stk+0x20], r3
+ mov r3, r3m
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2
+ REPX {pmaddwd x, m15}, m0, m1, m2
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6
+ REPX {pmaddwd x, m15}, m4, m5, m6
+ phaddd m0, m1
+ phaddd m1, m2
+ phaddd m4, m5
+ phaddd m5, m6
+ REPX {paddd x, m11}, m0, m1, m4, m5
+ REPX {psrad x, m12}, m0, m1, m4, m5
+ packssdw m0, m1 ; 0 2 2 4
+ packssdw m4, m5 ; 1 3 3 5
+ SWAP m2, m4
+ movq m10, r4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m1, m2
+ movu m2, [srcq+ssq*1]
+ movu m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ REPX {pshufb x, m14}, m2, m7, m6
+ REPX {pmaddwd x, m15}, m2, m7, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ phaddd m2, m7
+ phaddd m7, m6
+ REPX {paddd x, m11}, m0, m1, m2, m7
+ REPX {psrad x, m12}, m0, m1, m2, m7
+ packssdw m0, m1
+ packssdw m2, m7
+ %define m8 m6
+ %define m9 m4
+ %define m10 m5
+ movd m10, r4
+ movd m9, [stk+0x20]
+ punpckldq m10, m9
+ %endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ %if ARCH_X86_32
+ mova [stk+0x50], m7
+ mova [stk+0x60], m8
+ mova [stk+0x70], m9
+ mova [stk+0x80], m10
+ %xdefine m13 m7
+ %define m7 [stk+0x50]
+ %define m8 [stk+0x60]
+ %define m9 [stk+0x70]
+ %define m10 [stk+0x80]
+ %endif
+ punpcklwd m1, m0, m2 ; 01 23
+ punpckhwd m3, m0, m2 ; 23 45
+ %if ARCH_X86_32
+ mov r4, r0m
+ %define dstq r4
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ %endif
+.dy2_w2_loop:
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m13, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m3, m8
+ REPX {pshufb x, m14}, m4, m5, m6, m13
+ REPX {pmaddwd x, m15}, m4, m5, m6, m13
+ phaddd m4, m5
+ phaddd m6, m13
+ pmaddwd m5, m1, m7
+ paddd m4, m11
+ paddd m6, m11
+ psrad m4, m12
+ psrad m6, m12
+ packssdw m4, m6 ; 6 7 8 9
+ paddd m5, m3
+ pshufd m3, m4, q2200
+ pshufd m4, m4, q3311
+ palignr m3, m0, 12 ; 4 6 6 8
+ palignr m4, m2, 12 ; 5 7 7 9
+ mova m0, m3
+ mova m2, m4
+ punpcklwd m1, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m6, m1, m9
+ pmaddwd m4, m3, m10
+ paddd m5, vrnd_mem
+ paddd m6, m4
+ paddd m5, m6
+ pshufd m4, m12, q1032
+ pxor m6, m6
+ psrad m5, m4
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pminsw m5, pxmaxm
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q1032
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy2_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %define vrnd_mem [rsp+0x30]
+ %define stk rsp+0x40
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %define stk rsp+0x30
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m9 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r1, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r3, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ SWAP m4, m7
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x40], m13
+ mova [stk+0x50], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m1, [srcq+ssq*0]
+ movu m8, [srcq+ssq*2]
+ movu m9, [srcq+ssq*1]
+ movu m10, [srcq+ss3q ]
+ movu m7, [srcq+r4 ]
+ movu m2, [srcq+r11 ]
+ movu m3, [srcq+r6 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m1, m9, m8, m10
+ REPX {pmaddwd x, m13}, m1, m9, m8, m10
+ REPX {pshufb x, m14}, m7, m3, m2, m4
+ REPX {pmaddwd x, m15}, m7, m3, m2, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m1, m7
+ phaddd m8, m2
+ phaddd m9, m3
+ phaddd m10, m4
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ REPX {paddd x, m5}, m1, m9, m8, m10
+ REPX {psrad x, xm6}, m1, m9, m8, m10
+ packssdw m1, m8 ; 0 2
+ packssdw m9, m10 ; 1 3
+ movu m0, [srcq+r4 ]
+ movu m8, [srcq+r6 ]
+ lea srcq, [srcq+ssq*2]
+ REPX {pshufb x, m12}, m2, m3
+ REPX {pmaddwd x, m13}, m2, m3
+ REPX {pshufb x, m14}, m0, m8
+ REPX {pmaddwd x, m15}, m0, m8
+ phaddd m2, m0
+ phaddd m3, m8
+ shr myd, 6
+ mov r9d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r9q, [base+subpel_filters+myq*8]
+ REPX {paddd x, m5}, m2, m3
+ REPX {psrad x, xm6}, m2, m3
+ packssdw m2, m3 ; 4 5
+ pshufd m3, m2, q1032 ; 5 _
+ punpcklwd m0, m1, m9 ; 01
+ punpckhwd m1, m9 ; 23
+ punpcklwd m2, m3 ; 45
+ movq m10, r9
+ %define hrnd_mem [rsp+0x10]
+ %define hsh_mem [rsp+0x20]
+ %define vsh_mem [rsp+0x28]
+ %if isput
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%else
+ mova [stk+0x20], m12
+ mova [stk+0x30], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x60 ; 0 1
+ MC_4TAP_SCALED_H 0x70 ; 2 3
+ MC_4TAP_SCALED_H 0x80 ; 4 5
+ mov [stk+0xe0], r4
+ mova m3, [base+spel_s_shuf8]
+ mova m0, [stk+0x60]
+ mova m1, [stk+0x70]
+ mova m2, [stk+0x80]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ pshufb m0, m3 ; 01
+ pshufb m1, m3 ; 23
+ pshufb m2, m3 ; 45
+ movd m7, r4
+ movd m4, r5
+ mov r5, r0m
+ %if isput
+ mov r1, r1m
+ %endif
+ mov r4, [stk+0xe0]
+ %define dstq r5
+ %define tmpq r5
+ %define m12 [stk+0x20]
+ %define m14 [stk+0x30]
+ %define m13 [stk+0x40]
+ %define m15 [stk+0x50]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %define vsh_mem [esp+0x18]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ %define m10 m7
+ punpckldq m10, m4
+%endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m3, m10, q0000
+ pshufd m4, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_32
+ %xdefine m8 m3
+ %xdefine m9 m6
+ %xdefine m11 m5
+ %xdefine m6 m4
+ mova [stk+0x100], m3
+ mova [stk+0x110], m4
+ mova [stk+0x120], m5
+ mova [stk+0x130], m10
+ %define m3 [stk+0x100]
+ %define m4 [stk+0x110]
+ %define m5 [stk+0x120]
+ %define m10 [stk+0x130]
+%endif
+.dy2_w4_loop:
+ pmaddwd m8, m0, m3
+ pmaddwd m9, m1, m3
+ mova m0, m2
+ pmaddwd m1, m4
+ pmaddwd m11, m2, m4
+ paddd m8, vrnd_mem
+ paddd m9, vrnd_mem
+ pmaddwd m2, m5
+ paddd m8, m1
+ paddd m9, m11
+ paddd m8, m2
+ movu m6, [srcq+ssq*0]
+ movu m1, [srcq+ssq*2]
+%if ARCH_X86_64
+ movu m11, [srcq+r4 ]
+ movu m2, [srcq+r11]
+%else
+ movu m11, [r4+ssq*0]
+ movu m2, [r4+ssq*2]
+%endif
+ pshufb m6, m12
+ pshufb m1, m12
+ pmaddwd m6, m13
+ pmaddwd m1, m13
+ pshufb m11, m14
+ pshufb m2, m14
+ pmaddwd m11, m15
+ pmaddwd m2, m15
+ phaddd m6, m11
+ phaddd m1, m2
+ paddd m6, hrnd_mem
+ paddd m1, hrnd_mem
+ psrad m6, hsh_mem
+ psrad m1, hsh_mem
+ movu m7, [srcq+ssq*1]
+ movu m11, [srcq+ss3q ]
+ packssdw m6, m1 ; 6 8
+%if ARCH_X86_64
+ movu m2, [srcq+r6 ]
+ movu m1, [srcq+r13]
+%else
+ movu m2, [r4+ssq*1]
+ movu m1, [r4+ss3q ]
+%endif
+ pshufb m7, m12
+ pshufb m11, m12
+ pmaddwd m7, m13
+ pmaddwd m11, m13
+ pshufb m2, m14
+ pshufb m1, m14
+ pmaddwd m2, m15
+ pmaddwd m1, m15
+ phaddd m7, m2
+ phaddd m11, m1
+ paddd m7, hrnd_mem
+ paddd m11, hrnd_mem
+ psrad m7, hsh_mem
+ psrad m11, hsh_mem
+ packssdw m7, m11 ; 7 9
+%if ARCH_X86_32
+ lea r4, [r4+ssq*4]
+%endif
+ lea srcq, [srcq+ssq*4]
+ punpcklwd m1, m6, m7 ; 67
+ punpckhwd m6, m7 ; 89
+ mova m2, m6
+ pmaddwd m11, m1, m5
+ pmaddwd m7, m1, m10
+ pmaddwd m6, m10
+ paddd m9, m11
+%if isput
+ movd m11, vsh_mem
+%endif
+ paddd m8, m7
+ paddd m9, m6
+%if isput
+ psrad m8, m11
+ psrad m9, m11
+ packssdw m8, m9
+ pxor m7, m7
+ pmaxsw m8, m7
+ pminsw m8, pxmaxm
+ movq [dstq+dsq*0], m8
+ movhps [dstq+dsq*1], m8
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m8, 6
+ psrad m9, 6
+ packssdw m8, m9
+ mova [tmpq], m8
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET ; why not jz .ret?
+INIT_XMM ssse3
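+; Widths 8..128 share .dy2_w_start: the dword at [stk+0xf0] counts 8-column
+; tiles and is decremented in .dy2_hloop_prep after each vertical strip;
+; tmp_stridem is the output row stride for the prep path.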
+.dy2_w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %define hround m11
+ %if isput
+ mova [rsp+0x30], m13
+ %else
+ mova m13, [base+pd_m524256]
+ %endif
+ shr t0d, 16
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isput
+ %define dstq r0
+ %else
+ %define tmpq r0
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ SWAP m3, m5
+%endif
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+%if ARCH_X86_64
+ mova [stk+0x140], m0
+ mova [stk+0x150], m1
+ mova [stk+0x160], m2
+ mova [stk+0x170], m3
+ %if UNIX64
+ mov hm, hd
+ %endif
+%else
+ mova [stk+0x180], m0
+ mova [stk+0x190], m1
+ mova [stk+0x1a0], m2
+ mova [stk+0x1b0], m3
+ SWAP m5, m3
+ mov r5, hm
+ mov [stk+0x134], r5
+%endif
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
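+    ; Each of the 8 columns gets its own horizontal 8-tap coefficient set;
+    ; columns with a zero subpel phase are switched to the unit filter from
+    ; pq_0x40000000 (coefficient 64) via the pcmpeqd/pand/pandn selects.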
+.dy2_hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova m10, [stk+0x140]
+ mova m11, [stk+0x150]
+ mova m14, [stk+0x160]
+ mova m15, [stk+0x170]
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+ %define shift [rsp+0x20]
+ %if isput
+ %define vround [rsp+0x30]
+ %else
+ %define vround [base+pd_m524256]
+ %endif
+.dy2_vloop:
+ pmaddwd m4, m0, m10
+ pmaddwd m5, m1, m10
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m11
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m14
+ pmaddwd m7, [stk+0xa0], m14
+ pmaddwd m8, [stk+0xb0], m15
+ pmaddwd m9, [stk+0xc0], m15
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov r0, r0m
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.dy2_vloop:
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+%if ARCH_X86_64
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
+ mova [stk+0xd0], m4
+ MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
+ mova m4, [stk+0xd0]
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [stk+0x90] ; 23a
+ mova m3, [stk+0xa0] ; 23b
+ mova m5, [stk+0xb0] ; 45a
+ mova m6, [stk+0xc0] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m4
+%else
+ mov r0m, r0
+ mov r3, r3m
+ MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
+ MC_8TAP_SCALED_H 0xa0, 0 ; 9
+ mova m7, [stk+0xe0]
+ mova m2, [stk+0x60] ; 23a
+ mova m3, [stk+0x70] ; 23b
+ mova m4, [stk+0x80] ; 45a
+ mova m5, [stk+0x90] ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova m0, [stk+0x40] ; 01a
+ mova m1, [stk+0x50] ; 01b
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mov r0, r0m
+%endif
+ jmp .dy2_vloop
+INIT_XMM ssse3
+.ret:
+ MC_8TAP_SCALED_RET 0
+%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
+ %define r0m [rstk+stack_offset+ 4]
+ %define r1m [rstk+stack_offset+ 8]
+ %define r2m [rstk+stack_offset+12]
+ %define r3m [rstk+stack_offset+16]
+%endif
+%undef isput
+%undef isprep
+%endmacro
+
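+; Scaled bilinear reuses the 8-tap scaled code: both filter-type words are
+; set to 5*15 so that (assuming the table layout) every subpel position
+; resolves to the bilinear coefficient row of the subpel filter table.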
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_16bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 8
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 7
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 6
+%else
+DECLARE_REG_TMP 2
+%endif
+
+%if ARCH_X86_64
+; warp8x8t spills one fewer xmm register than warp8x8 on WIN64; compensate
+; for that by allocating 16 bytes more stack space so that the stack
+; offsets match up.
+%if WIN64 && STACK_ALIGNMENT == 16
+%assign stksz 16*14
+%else
+%assign stksz 16*13
+%endif
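+; warp8x8t is the compound (prep) variant: it shares warp_affine_8x8's
+; .main/.main2/.main3 for the actual filtering and differs only in the
+; final rounding and the 16-bit intermediate stores.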
+cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
+ mx, tmp, alpha, beta, \
+ filter, my, gamma, cnt
+%assign stack_size_padded_8x8t stack_size_padded
+%else
+cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
+ filter, mx, my
+%define m8 [esp+16*13]
+%define m9 [esp+16*14]
+%define cntd dword [esp+4*63]
+%define dstq tmpq
+%define dsq 0
+%if STACK_ALIGNMENT < 16
+%define dstm [esp+4*65]
+%define dsm [esp+4*66]
+%else
+%define dstm r0m
+%define dsm r1m
+%endif
+%endif
+%define base filterq-$$
+ mov t0d, r7m
+ LEA filterq, $$
+ shr t0d, 11
+%if ARCH_X86_64
+ movddup m8, [base+warp8x8t_rnd]
+%else
+ movddup m1, [base+warp8x8t_rnd]
+ mov r1, r1m
+ add r1, r1
+ mova m8, m1
+ mov r1m, r1 ; ds *= 2
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
+ jmp .start
+.loop:
+%if ARCH_X86_64
+ lea dstq, [dstq+dsq*4]
+%else
+ add dstq, dsm
+ mov dstm, dstq
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
+.start:
+%if ARCH_X86_32
+ mov dstq, dstm
+%endif
+ paddd m1, m8
+ paddd m2, m8
+ psrad m1, 15
+ psrad m2, 15
+ packssdw m1, m2
+ mova [dstq+dsq*0], m1
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
+%if ARCH_X86_32
+ mov dstq, dstm
+ add dstq, dsm
+%endif
+ paddd m1, m8
+ paddd m2, m8
+ psrad m1, 15
+ psrad m2, 15
+ packssdw m1, m2
+ mova [dstq+dsq*2], m1
+ dec cntd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
+ mx, tmp, alpha, beta, \
+ filter, my, gamma, cnt
+ASSERT stack_size_padded == stack_size_padded_8x8t
+%else
+cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
+ filter, mx, my
+%endif
+ mov t0d, r7m
+ LEA filterq, $$
+ shr t0d, 11
+%if ARCH_X86_64
+ movddup m8, [base+warp8x8_rnd2+t0*8]
+ movd m9, r7m ; pixel_max
+ pshufb m9, [base+pw_256]
+%else
+ movddup m1, [base+warp8x8_rnd2+t0*8]
+ movd m2, r7m ; pixel_max
+ pshufb m2, [base+pw_256]
+ mova m8, m1
+ mova m9, m2
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_64
+ lea dstq, [dstq+dsq*2]
+%else
+ add dstq, dsm
+ mov dstm, dstq
+%endif
+ call .main2
+.start:
+%if ARCH_X86_32
+ mov dstq, dstm
+%endif
+ psrad m1, 16
+ psrad m2, 16
+ packssdw m1, m2
+ pmaxsw m1, m6
+ pmulhrsw m1, m8
+ pminsw m1, m9
+ mova [dstq+dsq*0], m1
+ call .main3
+%if ARCH_X86_32
+ mov dstq, dstm
+ add dstq, dsm
+%endif
+ psrad m1, 16
+ psrad m2, 16
+ packssdw m1, m2
+ pmaxsw m1, m6
+ pmulhrsw m1, m8
+ pminsw m1, m9
+ mova [dstq+dsq*1], m1
+ dec cntd
+ jg .loop
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov deltaq, r5m
+ mov mxd, r6m
+%endif
+ movd m0, [base+warp8x8_shift+t0*4]
+ movddup m7, [base+warp8x8_rnd1+t0*8]
+ add filterq, mc_warp_filter-$$
+%if ARCH_X86_64
+ movsx alphad, word [deltaq+2*0]
+ movsx betad, word [deltaq+2*1]
+ movsx gammad, word [deltaq+2*2]
+ movsx deltad, word [deltaq+2*3]
+ lea tmpq, [ssq*3]
+ add mxd, 512+(64<<10)
+ sub srcq, tmpq ; src -= ss*3
+ imul tmpd, alphad, -7
+ mov myd, r7m
+ add betad, tmpd ; beta -= alpha*7
+ imul tmpd, gammad, -7
+ add myd, 512+(64<<10)
+ mov cntd, 4
+ add deltad, tmpd ; delta -= gamma*7
+%else
+%if STACK_ALIGNMENT < 16
+ %assign stack_offset stack_offset - gprsize
+%endif
+ mov r3d, r5m ; abcd
+%if STACK_ALIGNMENT < 16
+ mov r0, r1m ; dst
+ mov r1, r2m ; ds
+ mov [esp+gprsize+4*65], r0
+ mov [esp+gprsize+4*66], r1
+%endif
+ movsx alphad, word [r3+2*0]
+ movsx r2d, word [r3+2*1]
+ movsx gammad, word [r3+2*2]
+ movsx r3d, word [r3+2*3]
+ imul r5d, alphad, -7
+ add r2d, r5d ; beta -= alpha*7
+ imul r5d, gammad, -7
+ mov [esp+gprsize+4*60], r2d
+ add r3d, r5d ; delta -= gamma*7
+ mov [esp+gprsize+4*61], r3d
+ mov r3d, r4m ; ss
+ mov srcq, r3m
+ mov mxd, r6m
+ mov myd, r7m
+ mov dword [esp+gprsize+4*63], 4 ; cnt
+ mov [esp+gprsize+4*62], r3
+ lea r3, [r3*3]
+ add mxd, 512+(64<<10)
+ add myd, 512+(64<<10)
+ sub srcq, r3 ; src -= ss*3
+%if STACK_ALIGNMENT < 16
+ %assign stack_offset stack_offset + gprsize
+%endif
+%endif
+ mova [rsp+gprsize], m0
+ pxor m6, m6
+ call .h
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 01
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 1], m1
+ mova [rsp+gprsize+16* 4], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 12
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 7], m1
+ mova [rsp+gprsize+16*10], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 23
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 2], m1
+ mova [rsp+gprsize+16* 5], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 34
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 8], m1
+ mova [rsp+gprsize+16*11], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 45
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 3], m1
+ mova [rsp+gprsize+16* 6], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 56
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 9], m1
+ mova [rsp+gprsize+16*12], m5
+ mova m5, m0
+.main2:
+ call .h
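+; WARP_V filters 8 columns vertically: my advances by gamma per column and
+; by delta per output row; my >> 10 indexes the 8-tap filter table.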
+%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m4, [filterq+myq*8] ; a
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m2, [filterq+tmpq*8] ; b
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m3, [filterq+myq*8] ; c
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m1, [filterq+tmpq*8] ; d
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ punpcklwd m4, m2
+ punpcklwd m3, m1
+ punpckldq m2, m4, m3
+ punpckhdq m4, m3
+ punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
+ pmaddwd m1, [rsp+gprsize+16*%1]
+ punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
+ mova m2, [rsp+gprsize+16*%2]
+ pmaddwd m3, m2
+ mova [rsp+gprsize+16*%1], m2
+ paddd m1, m3
+ punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
+ mova m2, [rsp+gprsize+16*%3]
+ pmaddwd m3, m2
+ mova [rsp+gprsize+16*%2], m2
+ paddd m1, m3
+ punpcklwd m3, m5, m0 ; 67
+ punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
+ pmaddwd m2, m3
+ mova [rsp+gprsize+16*%3], m3
+ paddd m1, m2
+ movq m4, [filterq+myq*8] ; e
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8] ; f
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m2, [filterq+myq*8] ; g
+%if ARCH_X86_64
+ lea myd, [tmpq+deltaq] ; my += delta
+%else
+ mov myd, [esp+gprsize+4*61]
+ add myd, tmpd
+%endif
+ shr tmpd, 10
+ punpcklwd m4, m3
+ movq m3, [filterq+tmpq*8] ; h
+ punpcklwd m2, m3
+ punpckldq m3, m4, m2
+ punpckhdq m4, m2
+ punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
+ pmaddwd m2, [rsp+gprsize+16*%4]
+ punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
+ mova m3, [rsp+gprsize+16*%5]
+ pmaddwd m6, m3
+ mova [rsp+gprsize+16*%4], m3
+ pxor m3, m3
+ paddd m2, m6
+ punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
+ mova m6, [rsp+gprsize+16*%6]
+ pmaddwd m3, m6
+ mova [rsp+gprsize+16*%5], m6
+ punpckhwd m5, m0
+ pxor m6, m6
+ paddd m2, m3
+ punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
+ pmaddwd m3, m5
+ mova [rsp+gprsize+16*%6], m5
+ mova m5, m0
+ paddd m2, m3
+%endmacro
+ WARP_V 1, 2, 3, 4, 5, 6
+ ret
+.main3:
+ call .h
+ WARP_V 7, 8, 9, 10, 11, 12
+ ret
+ALIGN function_align
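+; .h produces one horizontally filtered source row (8 outputs): mx advances
+; by alpha per column and by beta per row; mx >> 10 selects the filter.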
+.h:
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ punpcklbw m0, m6, m3
+ movu m3, [srcq-6]
+ pmaddwd m0, m3 ; 0
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m2, m6, m3
+ movu m3, [srcq-4]
+ pmaddwd m2, m3 ; 1
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m0, m2 ; 0 1
+ punpcklbw m2, m6, m3
+ movu m3, [srcq-2]
+ pmaddwd m2, m3 ; 2
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m1, m6, m3
+ movu m3, [srcq+0]
+ pmaddwd m1, m3 ; 3
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m2, m1 ; 2 3
+ punpcklbw m1, m6, m3
+ movu m3, [srcq+2]
+ pmaddwd m1, m3 ; 4
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ phaddd m0, m2 ; 0 1 2 3
+ punpcklbw m2, m6, m3
+ movu m3, [srcq+4]
+ pmaddwd m2, m3 ; 5
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m1, m2 ; 4 5
+ punpcklbw m2, m6, m3
+ movu m3, [srcq+6]
+ pmaddwd m2, m3 ; 6
+%if ARCH_X86_64
+ lea mxd, [tmpq+betaq] ; mx += beta
+%else
+ mov mxd, [esp+gprsize*2+4*60]
+ add mxd, tmpd
+%endif
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m4, m6, m3
+ movu m3, [srcq+8]
+%if ARCH_X86_64
+ add srcq, ssq
+%else
+ add srcq, [esp+gprsize*2+4*62]
+%endif
+ pmaddwd m3, m4 ; 7
+ phaddd m2, m3 ; 6 7
+ phaddd m1, m2 ; 4 5 6 7
+ paddd m0, m7
+ paddd m1, m7
+ psrad m0, [rsp+gprsize*2]
+ psrad m1, [rsp+gprsize*2]
+ packssdw m0, m1
+ ret
+
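+; BIDIR_FN provides the width-dispatched store loops shared by the
+; bidirectional compositors below; each .main call yields two registers
+; (16 pixels) of output.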
+%macro BIDIR_FN 0
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.ret:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jne .w8_loop
+ RET
+.w16_loop:
+ call .main
+ add dstq, strideq
+.w16:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
+%define base r6-avg_ssse3_table
+ LEA r6, avg_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ movddup m2, [base+bidir_rnd+t0*8]
+ movddup m3, [base+bidir_mul+t0*8]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
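+; .main: saturating add of the two intermediates; pmaxsw/psubsw against the
+; bitdepth-dependent bidir_rnd bound clips at zero while removing the
+; intermediate offset, and pmulhw by bidir_mul rescales to pixel range.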
+.main:
+ mova m0, [tmp1q+16*0]
+ paddsw m0, [tmp2q+16*0]
+ mova m1, [tmp1q+16*1]
+ paddsw m1, [tmp2q+16*1]
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ pmaxsw m0, m2
+ pmaxsw m1, m2
+ psubsw m0, m2
+ psubsw m1, m2
+ pmulhw m0, m3
+ pmulhw m1, m3
+ ret
+
+cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
+%define base r6-w_avg_ssse3_table
+ LEA r6, w_avg_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r6m ; weight
+ movd m6, r7m ; pixel_max
+ movddup m5, [base+pd_65538]
+ movsxd wq, [r6+wq*4]
+ pshufb m6, [base+pw_256]
+ add wq, r6
+ lea r6d, [t0-16]
+ shl t0d, 16
+ sub t0d, r6d ; 16-weight, weight
+ paddw m5, m6
+ mov r6d, t0d
+ shl t0d, 2
+ test dword r7m, 0x800
+ cmovnz r6d, t0d
+ movifnidn hd, hm
+ movd m4, r6d
+ pslld m5, 7
+ pxor m7, m7
+ pshufd m4, m4, q0000
+ BIDIR_FN
+ALIGN function_align
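+; .main: pmaddwd of interleaved (tmp2, tmp1) word pairs against the packed
+; (16-weight, weight) vector, rounded and shifted down, then clipped to
+; [0, pixel_max].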
+.main:
+ mova m2, [tmp1q+16*0]
+ mova m0, [tmp2q+16*0]
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ mova m2, [tmp1q+16*1]
+ mova m1, [tmp2q+16*1]
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ pmaddwd m3, m4
+ pmaddwd m0, m4
+ paddd m3, m5
+ paddd m0, m5
+ psrad m3, 8
+ psrad m0, 8
+ packssdw m0, m3
+ punpckhwd m3, m1, m2
+ punpcklwd m1, m2
+ pmaddwd m3, m4
+ pmaddwd m1, m4
+ paddd m3, m5
+ paddd m1, m5
+ psrad m3, 8
+ psrad m1, 8
+ packssdw m1, m3
+ pminsw m0, m6
+ pminsw m1, m6
+ pmaxsw m0, m7
+ pmaxsw m1, m7
+ ret
+
+%if ARCH_X86_64
+cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%else
+cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+%define hd dword r5m
+%define m8 [base+pw_64]
+%endif
+%define base r6-mask_ssse3_table
+ LEA r6, mask_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r7m ; pixel_max
+ shr t0d, 11
+ movsxd wq, [r6+wq*4]
+ movddup m6, [base+bidir_rnd+t0*8]
+ movddup m7, [base+bidir_mul+t0*8]
+%if ARCH_X86_64
+ mova m8, [base+pw_64]
+ movifnidn hd, hm
+%endif
+ add wq, r6
+ mov maskq, r6mp
+ BIDIR_FN
+ALIGN function_align
+.main:
+ movq m3, [maskq+8*0]
+ mova m0, [tmp1q+16*0]
+ mova m4, [tmp2q+16*0]
+ pxor m5, m5
+ punpcklbw m3, m5
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ psubw m1, m8, m3
+ punpckhwd m4, m3, m1 ; m, 64-m
+ punpcklwd m3, m1
+ pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m0, m3
+ movq m3, [maskq+8*1]
+ mova m1, [tmp1q+16*1]
+ mova m4, [tmp2q+16*1]
+ add maskq, 8*2
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ psrad m2, 5
+ psrad m0, 5
+ packssdw m0, m2
+ punpcklbw m3, m5
+ punpckhwd m2, m1, m4
+ punpcklwd m1, m4
+ psubw m5, m8, m3
+ punpckhwd m4, m3, m5 ; m, 64-m
+ punpcklwd m3, m5
+ pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m1, m3
+ psrad m2, 5
+ psrad m1, 5
+ packssdw m1, m2
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ psubsw m0, m6
+ psubsw m1, m6
+ pmulhw m0, m7
+ pmulhw m1, m7
+ ret
+
+cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_420_ssse3_table
+ LEA t0, w_mask_420_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd m0, r7m ; sign
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m4, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*4
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ mova [rsp+16*3], m4
+ %define m8 [rsp+gprsize+16*0]
+ %define m9 [rsp+gprsize+16*1]
+ %define m10 [rsp+gprsize+16*2]
+ %define m11 [rsp+gprsize+16*3]
+%endif
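+    ; 420: four mask values (a 2x2 block) are summed per chroma position;
+    ; m7 = 2 - sign is the rounding term for the final (sum + 2 - sign) >> 2.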
+ movd m7, [base+pw_2]
+ psubw m7, m0
+ pshufb m7, [base+pw_256]
+ add wq, t0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w4:
+ movq [dstq+strideq*0], m0
+ phaddw m2, m3
+ movhps [dstq+strideq*1], m0
+ phaddd m2, m2
+ lea dstq, [dstq+strideq*2]
+ paddw m2, m7
+ movq [dstq+strideq*0], m1
+ psrlw m2, 2
+ movhps [dstq+strideq*1], m1
+ packuswb m2, m2
+ movd [maskq], m2
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w8:
+ mova [dstq+strideq*0], m0
+ paddw m2, m3
+ phaddw m2, m2
+ mova [dstq+strideq*1], m1
+ paddw m2, m7
+ psrlw m2, 2
+ packuswb m2, m2
+ movd [maskq], m2
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 8
+.w16:
+ mova [dstq+strideq*1+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*1], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*0]
+ paddw m3, [dstq+strideq*1+16*1]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*1], m1
+ paddw m2, m7
+ psrlw m2, 2
+ packuswb m2, m2
+ movq [maskq], m2
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ mova [dstq+strideq*1+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*1], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*0+16*2], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*0]
+ paddw m3, [dstq+strideq*1+16*1]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*1], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*2]
+ paddw m2, [dstq+strideq*1+16*3]
+ mova [dstq+strideq*1+16*2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*3], m1
+ packuswb m3, m2
+ mova [maskq], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16*2
+.w64:
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*2], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*2], m0
+ mova [dstq+strideq*1+16*4], m3
+ mova [dstq+strideq*0+16*3], m1
+ call .main
+ mova [dstq+strideq*1+16*5], m2
+ mova [dstq+strideq*0+16*4], m0
+ mova [dstq+strideq*1+16*6], m3
+ mova [dstq+strideq*0+16*5], m1
+ call .main
+ mova [dstq+strideq*0+16*6], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*7], m2
+ mova [dstq+strideq*0+16*7], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*1]
+ paddw m3, [dstq+strideq*1+16*2]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*3]
+ paddw m3, [dstq+strideq*1+16*4]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*2]
+ mova [dstq+strideq*1+16*2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*3], m1
+ packuswb m3, m2
+ mova [maskq+16*0], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16*5]
+ paddw m3, [dstq+strideq*1+16*6]
+ mova [dstq+strideq*1+16*4], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*6], m2
+ mova [dstq+strideq*1+16*5], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*6]
+ paddw m2, [dstq+strideq*1+16*7]
+ mova [dstq+strideq*1+16*6], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*7], m1
+ packuswb m3, m2
+ mova [maskq+16*1], m3
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16*4
+.w128:
+ mova [dstq+strideq*1+16* 1], m2
+ mova [dstq+strideq*0+16* 0], m0
+ mova [dstq+strideq*1+16* 2], m3
+ mova [dstq+strideq*0+16* 1], m1
+ call .main
+ mova [dstq+strideq*1+16* 3], m2
+ mova [dstq+strideq*0+16* 2], m0
+ mova [dstq+strideq*1+16* 4], m3
+ mova [dstq+strideq*0+16* 3], m1
+ call .main
+ mova [dstq+strideq*1+16* 5], m2
+ mova [dstq+strideq*0+16* 4], m0
+ mova [dstq+strideq*1+16* 6], m3
+ mova [dstq+strideq*0+16* 5], m1
+ call .main
+ mova [dstq+strideq*1+16* 7], m2
+ mova [dstq+strideq*0+16* 6], m0
+ mova [dstq+strideq*1+16* 8], m3
+ mova [dstq+strideq*0+16* 7], m1
+ call .main
+ mova [dstq+strideq*1+16* 9], m2
+ mova [dstq+strideq*0+16* 8], m0
+ mova [dstq+strideq*1+16*10], m3
+ mova [dstq+strideq*0+16* 9], m1
+ call .main
+ mova [dstq+strideq*1+16*11], m2
+ mova [dstq+strideq*0+16*10], m0
+ mova [dstq+strideq*1+16*12], m3
+ mova [dstq+strideq*0+16*11], m1
+ call .main
+ mova [dstq+strideq*1+16*13], m2
+ mova [dstq+strideq*0+16*12], m0
+ mova [dstq+strideq*1+16*14], m3
+ mova [dstq+strideq*0+16*13], m1
+ call .main
+ mova [dstq+strideq*0+16*14], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*15], m2
+ mova [dstq+strideq*0+16*15], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 1]
+ paddw m3, [dstq+strideq*1+16* 2]
+ mova [dstq+strideq*1+16* 0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16* 2], m2
+ mova [dstq+strideq*1+16* 1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 3]
+ paddw m3, [dstq+strideq*1+16* 4]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16* 2]
+ mova [dstq+strideq*1+16* 2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16* 3], m1
+ packuswb m3, m2
+ mova [maskq+16*0], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16* 5]
+ paddw m3, [dstq+strideq*1+16* 6]
+ mova [dstq+strideq*1+16* 4], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16* 6], m2
+ mova [dstq+strideq*1+16* 5], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 7]
+ paddw m3, [dstq+strideq*1+16* 8]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16* 6]
+ mova [dstq+strideq*1+16* 6], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16* 7], m1
+ packuswb m3, m2
+ mova [maskq+16*1], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16* 9]
+ paddw m3, [dstq+strideq*1+16*10]
+ mova [dstq+strideq*1+16* 8], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*10], m2
+ mova [dstq+strideq*1+16* 9], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*11]
+ paddw m3, [dstq+strideq*1+16*12]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*10]
+ mova [dstq+strideq*1+16*10], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*11], m1
+ packuswb m3, m2
+ mova [maskq+16*2], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16*13]
+ paddw m3, [dstq+strideq*1+16*14]
+ mova [dstq+strideq*1+16*12], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*14], m2
+ mova [dstq+strideq*1+16*13], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*14]
+ paddw m2, [dstq+strideq*1+16*15]
+ mova [dstq+strideq*1+16*14], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*15], m1
+ packuswb m3, m2
+ mova [maskq+16*3], m3
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
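+; W_MASK derives the blend mask from the intermediates themselves:
+; 64-m = (27615 - |tmp1 - tmp2|) >> 10 with unsigned saturation, so m lies
+; in [38, 64]; pixels are then blended as (tmp1*m + tmp2*(64-m)) >> 5 and
+; put through the usual bidir_rnd/bidir_mul clip and rescale.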
+%macro W_MASK 2 ; dst/tmp_offset, mask
+ mova m%1, [tmp1q+16*%1]
+ mova m%2, [tmp2q+16*%1]
+ punpcklwd m4, m%2, m%1
+ punpckhwd m5, m%2, m%1
+ psubsw m%1, m%2
+ pabsw m%1, m%1
+ psubusw m6, m8, m%1
+ psrlw m6, 10 ; 64-m
+ psubw m%2, m9, m6 ; m
+ punpcklwd m%1, m6, m%2
+ punpckhwd m6, m%2
+ pmaddwd m%1, m4
+ pmaddwd m6, m5
+ psrad m%1, 5
+ psrad m6, 5
+ packssdw m%1, m6
+ pmaxsw m%1, m10
+ psubsw m%1, m10
+ pmulhw m%1, m11
+%endmacro
+ W_MASK 0, 2
+ W_MASK 1, 3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ ret
+
+cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_422_ssse3_table
+ LEA t0, w_mask_422_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd m7, r7m ; sign
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615]
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615]
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m4, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*4
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ mova [rsp+16*3], m4
+%endif
+ pxor m0, m0
+ add wq, t0
+ pshufb m7, m0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
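+; 422: horizontal pairs of mask values are summed, then sign-corrected and
+; averaged against zero: mask = (m0 + m1 + 1 - sign) >> 1.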
+.main:
+ W_MASK 0, 2
+ W_MASK 1, 3
+ phaddw m2, m3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ packuswb m2, m2
+ pxor m3, m3
+ psubb m2, m7
+ pavgb m2, m3
+ movq [maskq], m2
+ add maskq, 8
+ ret
+
+cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_444_ssse3_table
+ LEA t0, w_mask_444_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615]
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615]
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m7, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*3
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ %define m11 m7
+%endif
+ add wq, t0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2
+ W_MASK 1, 3
+ packuswb m2, m3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ mova [maskq], m2
+ add maskq, 16
+ ret
+
+; (a * (64 - m) + b * m + 32) >> 6
+; = (((b - a) * m + 32) >> 6) + a
+; = (((b - a) * (m << 9) + 16384) >> 15) + a
+; except m << 9 overflows int16_t when m == 64 (which is possible),
+; but if we negate m it works out (-64 << 9 == -32768).
+; = (((a - b) * (m * -512) + 16384) >> 15) + a
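+;
+; A scalar C sketch of that final form, matching the pmulhrsw blocks below
+; (the helper name is hypothetical):
+;   static inline int16_t blend_px(int16_t a, int16_t b, int m) {
+;       /* pmulhrsw: (x * y + (1 << 14)) >> 15, then add back a */
+;       return a + (int16_t)(((a - b) * (m * -512) + (1 << 14)) >> 15);
+;   }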
+cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
+%define base r6-blend_ssse3_table
+ LEA r6, blend_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ mova m7, [base+pw_m512]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ pxor m6, m6
+ jmp wq
+.w4:
+ mova m5, [maskq]
+ movq m0, [dstq+strideq*0]
+ movhps m0, [dstq+strideq*1]
+ movq m1, [dstq+strideq*2]
+ movhps m1, [dstq+stride3q ]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ mova m5, [maskq]
+ mova m0, [dstq+strideq*0]
+ mova m1, [dstq+strideq*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m5, [maskq]
+ mova m0, [dstq+16*0]
+ mova m1, [dstq+16*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16
+ RET
+.w32:
+ mova m5, [maskq+16*0]
+ mova m0, [dstq+16*0]
+ mova m1, [dstq+16*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova m5, [maskq+16*1]
+ mova m0, [dstq+16*2]
+ mova m1, [dstq+16*3]
+ psubw m2, m0, [tmpq+16*2]
+ psubw m3, m1, [tmpq+16*3]
+ add maskq, 32
+ add tmpq, 64
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
+%define base r5-blend_v_ssse3_table
+ LEA r5, blend_v_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ movd m4, [base+obmc_masks+2*2]
+.w2_loop:
+ movd m0, [dstq+strideq*0]
+ movd m2, [tmpq+4*0]
+ movd m1, [dstq+strideq*1]
+ movd m3, [tmpq+4*1]
+ add tmpq, 4*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ movddup m2, [base+obmc_masks+4*2]
+.w4_loop:
+ movq m0, [dstq+strideq*0]
+ movhps m0, [dstq+strideq*1]
+ mova m1, [tmpq]
+ add tmpq, 8*2
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova m4, [base+obmc_masks+8*2]
+.w8_loop:
+ mova m0, [dstq+strideq*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+strideq*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ mova m4, [base+obmc_masks+16*2]
+ movq m5, [base+obmc_masks+16*3]
+.w16_loop:
+ mova m0, [dstq+16*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+16*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ movaps [rsp+8], m6
+%endif
+ mova m4, [base+obmc_masks+16*4]
+ mova m5, [base+obmc_masks+16*5]
+ mova m6, [base+obmc_masks+16*6]
+.w32_loop:
+ mova m0, [dstq+16*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+16*1]
+ mova m3, [tmpq+16*1]
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ mova m2, [dstq+16*2]
+ paddw m1, m3
+ mova m3, [tmpq+16*2]
+ add tmpq, 16*4
+ psubw m3, m2
+ pmulhrsw m3, m6
+ paddw m2, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ movaps m6, [rsp+8]
+%endif
+ RET
+
+%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
+ mova m0, [dstq+16*(%1+0)]
+ mova m2, [tmpq+16*(%2+0)]
+ mova m1, [dstq+16*(%1+1)]
+ mova m3, [tmpq+16*(%2+1)]
+%if %3
+ add tmpq, 16*%3
+%endif
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*(%1+0)], m0
+ mova [dstq+16*(%1+1)], m1
+%endmacro
+
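+; blend_h (OBMC) blends only the top 3/4 of the block's rows, walking a
+; negative row counter from -h*3/4 up to 0 while indexing obmc_masks by
+; block height.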
+cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base r6-blend_h_ssse3_table
+ LEA r6, blend_h_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m4, [base+blend_shuf]
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ add wq, r6
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ movd m2, [dstq+dsq*1]
+ movd m3, [maskq+hq*2]
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ punpckldq m0, m2
+ punpcklwd m3, m3
+ psubw m1, m0
+ pmulhrsw m1, m3
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova m3, [base+blend_shuf]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ movd m2, [maskq+hq*2]
+ mova m1, [tmpq]
+ add tmpq, 8*2
+ psubw m1, m0
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ movddup m5, [base+blend_shuf+8]
+%if WIN64
+ movaps [rsp+ 8], m6
+ movaps [rsp+24], m7
+%endif
+.w8_loop:
+ movd m7, [maskq+hq*2]
+ mova m0, [dstq+dsq*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+dsq*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ pshufb m6, m7, m4
+ psubw m2, m0
+ pshufb m7, m5
+ psubw m3, m1
+ pmulhrsw m2, m6
+ pmulhrsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+%if WIN64
+ movaps m6, [rsp+ 8]
+ movaps m7, [rsp+24]
+%endif
+ RET
+.w16:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0, 2
+ add dstq, dsq
+ inc hq
+ jl .w16
+ RET
+.w32:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 4
+ add dstq, dsq
+ inc hq
+ jl .w32
+ RET
+.w64:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2
+ BLEND_H_ROW 4, 4
+ BLEND_H_ROW 6, 6, 8
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2
+ BLEND_H_ROW 4, 4
+ BLEND_H_ROW 6, 6, 16
+ BLEND_H_ROW 8, -8
+ BLEND_H_ROW 10, -6
+ BLEND_H_ROW 12, -4
+ BLEND_H_ROW 14, -2
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh: total filled size
+; iw, ih: copied block -> fill bottom, right
+; x, y:   offset in bw/bh -> fill top, left
+cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+%if ARCH_X86_64
+ %define reg_zero r12q
+ %define reg_tmp r10
+ %define reg_src srcq
+ %define reg_bottomext bottomextq
+ %define reg_rightext rightextq
+ %define reg_blkm r9m
+%else
+ %define reg_zero r6
+ %define reg_tmp r0
+ %define reg_src r1
+ %define reg_bottomext r0
+ %define reg_rightext r1
+ %define reg_blkm r2m
+%endif
+ ;
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor reg_zero, reg_zero
+ lea reg_tmp, [ihq-1]
+ cmp yq, ihq
+ cmovs reg_tmp, yq
+ test yq, yq
+ cmovs reg_tmp, reg_zero
+%if ARCH_X86_64
+ imul reg_tmp, sstrideq
+ add srcq, reg_tmp
+%else
+ imul reg_tmp, sstridem
+ mov reg_src, srcm
+ add reg_src, reg_tmp
+%endif
+ ;
+ ; ref += iclip(x, 0, iw - 1)
+ lea reg_tmp, [iwq-1]
+ cmp xq, iwq
+ cmovs reg_tmp, xq
+ test xq, xq
+ cmovs reg_tmp, reg_zero
+ lea reg_src, [reg_src+reg_tmp*2]
+%if ARCH_X86_32
+ mov srcm, reg_src
+%endif
+ ;
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+ mov r1, r1m ; restore bh
+%endif
+ lea reg_bottomext, [yq+bhq]
+ sub reg_bottomext, ihq
+ lea r3, [bhq-1]
+ cmovs reg_bottomext, reg_zero
+ ;
+
+ DEFINE_ARGS bw, bh, iw, ih, x, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, reg_zero
+ cmp reg_bottomext, bhq
+ cmovns reg_bottomext, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+ %if ARCH_X86_32
+ mov r4m, reg_bottomext
+ ;
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ mov r0, r0m ; restore bw
+ %endif
+ lea reg_rightext, [xq+bwq]
+ sub reg_rightext, iwq
+ lea r2, [bwq-1]
+ cmovs reg_rightext, reg_zero
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, reg_zero
+ cmp reg_rightext, bwq
+ cmovns reg_rightext, r2
+ %if ARCH_X86_32
+ mov r3m, r1
+ %endif
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+%else
+ mov r1, centerhm ; restore r1
+ sub centerhq, topextq
+ sub centerhq, r4m
+ mov r1m, centerhq
+%endif
+ ;
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+%if ARCH_X86_64
+ imul r2, dstrideq
+%else
+ mov r6, r6m ; restore dstq
+ imul r2, dstridem
+%endif
+ add dstq, r2
+ mov reg_blkm, dstq ; save pointer for ext
+ ;
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+%if ARCH_X86_64
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+%else
+ sub centerwq, r3m
+ sub centerwq, leftextq
+%endif
+
+; v_loop macro: copies/extends one horizontal strip, row by row
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+ %if ARCH_X86_64
+ %define reg_tmp r12
+ %else
+ %define reg_tmp r0
+ %endif
+.v_loop_%3:
+ %if ARCH_X86_32
+ mov r0, r0m
+ mov r1, r1m
+ %endif
+%if %1
+ ; left extension
+ %if ARCH_X86_64
+ movd m0, [srcq]
+ %else
+ mov r3, srcm
+ movd m0, [r3]
+ %endif
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ xor r3, r3
+.left_loop_%3:
+ mova [dstq+r3*2], m0
+ add r3, mmsize/2
+ cmp r3, leftextq
+ jl .left_loop_%3
+ ; body
+ lea reg_tmp, [dstq+leftextq*2]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ %if ARCH_X86_64
+ movu m0, [srcq+r3*2]
+ %else
+ mov r1, srcm
+ movu m0, [r1+r3*2]
+ %endif
+%if %1
+ movu [reg_tmp+r3*2], m0
+%else
+ movu [dstq+r3*2], m0
+%endif
+ add r3, mmsize/2
+ cmp r3, centerwq
+ jl .body_loop_%3
+%if %2
+ ; right extension
+%if %1
+ lea reg_tmp, [reg_tmp+centerwq*2]
+%else
+ lea reg_tmp, [dstq+centerwq*2]
+%endif
+ %if ARCH_X86_64
+ movd m0, [srcq+centerwq*2-2]
+ %else
+ mov r3, srcm
+ movd m0, [r3+centerwq*2-2]
+ %endif
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ xor r3, r3
+.right_loop_%3:
+ movu [reg_tmp+r3*2], m0
+ add r3, mmsize/2
+ %if ARCH_X86_64
+ cmp r3, rightextq
+ %else
+ cmp r3, r3m
+ %endif
+ jl .right_loop_%3
+%endif
+ %if ARCH_X86_64
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+ %else
+ add dstq, dstridem
+ mov r0, sstridem
+ add srcm, r0
+ sub dword centerhm, 1
+ jg .v_loop_%3
+ mov r0, r0m ; restore r0
+ %endif
+%endmacro ; v_loop
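+
+; each expansion of v_loop corresponds roughly to this per-row C sketch
+; (for reference only):
+;
+;   for (y = 0; y < center_h; y++) {
+;       for (x = 0; x < left_ext; x++)   dst[x] = ref[0];
+;       for (x = 0; x < center_w; x++)   dst[left_ext + x] = ref[x];
+;       for (x = 0; x < right_ext; x++)  dst[left_ext + center_w + x] =
+;                                            ref[center_w - 1];
+;       dst += PXSTRIDE(dst_stride);
+;       ref += PXSTRIDE(ref_stride);
+;   }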
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ jnz .need_right_ext
+ %else
+    cmp leftextq, r3m ; leftextq == 0 here, so this tests rightext (r3m)
+ jne .need_right_ext
+ %endif
+ v_loop 0, 0, 0
+ jmp .body_done
+
+    ; left/right extension variants
+.need_left_ext:
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ %else
+ mov r3, r3m
+ test r3, r3
+ %endif
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+; register usage:
+; r0 ; bw
+; r1 ; x loop counter
+; r4 ; y loop counter
+; r5 ; topextq
+; r6 ; dstq
+; r7 ; dstrideq
+; r8 ; srcq
+%if ARCH_X86_64
+ %define reg_dstride dstrideq
+%else
+ %define reg_dstride r2
+%endif
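+; the bottom and top edges replicate whole rows: the last row written by
+; v_loop is copied downwards bottom_ext times, and the first written row
+; (its pointer was saved in blk earlier) is copied upwards top_ext times;
+; roughly, in C:
+;
+;   for (y = 1; y <= bottom_ext; y++)
+;       memcpy(last_row + y * PXSTRIDE(dst_stride), last_row, bw * 2);
+;   for (y = 1; y <= top_ext; y++)
+;       memcpy(first_row - y * PXSTRIDE(dst_stride), first_row, bw * 2);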
+ ;
+ ; bottom edge extension
+ %if ARCH_X86_64
+ test bottomextq, bottomextq
+ jz .top
+ %else
+ xor r1, r1
+ cmp r1, r4m
+ je .top
+ %endif
+ ;
+ %if ARCH_X86_64
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+ %else
+ mov r3, dstq
+ mov reg_dstride, dstridem
+ sub r3, reg_dstride
+ mov srcm, r3
+ %endif
+ ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, bottomextq
+ %else
+ mov r3, srcm
+ mova m0, [r3+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, r4m
+ %endif
+ ;
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .bottom_y_loop
+ add r1, mmsize/2
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+%if ARCH_X86_64
+ mov srcq, reg_blkm
+%else
+ mov r3, reg_blkm
+ mov reg_dstride, dstridem
+%endif
+ mov dstq, dstm
+ xor r1, r1
+ ;
+.top_x_loop:
+%if ARCH_X86_64
+ mova m0, [srcq+r1*2]
+%else
+ mov r3, reg_blkm
+ mova m0, [r3+r1*2]
+%endif
+ lea r3, [dstq+r1*2]
+ mov r4, topextq
+ ;
+.top_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .top_y_loop
+ add r1, mmsize/2
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
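+
+; e.g. "SCRATCH 7, 15, 0": on x86-64 this simply renames m7 to m15; on
+; x86-32, which has only 8 xmm registers, it spills m7 to the stack and
+; redefines m15 as that stack slot, freeing m7 for reuse.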
+
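+; horizontal resize. per output pixel this computes, roughly (a C sketch
+; for reference, not dav1d's C code; src_clip() is a stand-in for an
+; edge-clamped source fetch, and resize_filter is the 64-phase 8-tap
+; filter table whose sum is subtracted from the rounding constant):
+;
+;   for (int x = 0; x < dst_w; x++, mx += dx) {
+;       const int sx = mx >> 14;                 // 18.14 fixed-point position
+;       const int8_t *F = resize_filter[(mx >> 8) & 63];
+;       int sum = 0;
+;       for (int k = 0; k < 8; k++)              // 8 taps around sx,
+;           sum += F[k] * src_clip(sx - 3 + k);  // clamped to the row edges
+;       dst[x] = iclip((64 - sum) >> 7, 0, pxmax);
+;   }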
+%if ARCH_X86_64
+cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%elif STACK_ALIGNMENT >= 16
+cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%else
+cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ movd m4, pxmaxm
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ punpcklwd m4, m4
+ pshufd m4, m4, q0000
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+ mova [rsp+16*3*ARCH_X86_32], m4
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+ %define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+ %define hd dword r5m
+ %if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+ %define base r6-$$
+ %else
+ LEA r4, $$
+ %define base r4-$$
+ %endif
+%endif
+%if ARCH_X86_64
+ mova m12, [base+pd_64]
+ mova m11, [base+pd_63]
+%else
+ %define m12 [base+pd_64]
+ %define m11 [base+pd_63]
+%endif
+ pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 15, 0
+ SCRATCH 6, 14, 1
+ SCRATCH 5, 13, 2
+ pxor m1, m1
+.loop_y:
+ xor xd, xd
+ mova m0, m14 ; per-line working version of mx
+.loop_x:
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m13, m1
+ pand m1, m3
+ pandn m3, m13
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m11 ; filter offset (masked)
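+    ; mx is 18.14 fixed point: src_x = mx >> 14, filter phase = (mx >> 8) & 63;
+    ; the edge_emu offset records how far the position had to be clamped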
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, m1
+ pshuflw m1, m1, q3232
+ movd r9d, m1
+ punpckhqdq m1, m1
+ movd r10d, m1
+ psrlq m1, 32
+ movd r11d, m1
+ movu m4, [srcq+r8*2]
+ movu m5, [srcq+r9*2]
+ movu m6, [srcq+r10*2]
+ movu m7, [srcq+r11*2]
+    ; if all edge_emu offsets are zero, the pshufb-based edge emulation can be skipped
+ packssdw m3, m3
+ movq r11, m3
+ test r11, r11
+ jz .filter
+ movsx r8, r11w
+ sar r11, 16
+ movsx r9, r11w
+ sar r11, 16
+ movsx r10, r11w
+ sar r11, 16
+ movu m1, [base+resize_shuf+8+r8*2]
+ movu m3, [base+resize_shuf+8+r9*2]
+ movu m8, [base+resize_shuf+8+r10*2]
+ movu m9, [base+resize_shuf+8+r11*2]
+ pshufb m4, m1
+ pshufb m5, m3
+ pshufb m6, m8
+ pshufb m7, m9
+.filter:
+ movd r8d, m2
+ pshuflw m2, m2, q3232
+ movd r9d, m2
+ punpckhqdq m2, m2
+ movd r10d, m2
+ psrlq m2, 32
+ movd r11d, m2
+ movq m8, [base+resize_filter+r8*8]
+ movq m2, [base+resize_filter+r9*8]
+ pxor m9, m9
+ punpcklbw m1, m9, m8
+ punpcklbw m3, m9, m2
+ psraw m1, 8
+ psraw m3, 8
+ movq m10, [base+resize_filter+r10*8]
+ movq m2, [base+resize_filter+r11*8]
+ punpcklbw m8, m9, m10
+ punpcklbw m9, m2
+ psraw m8, 8
+ psraw m9, 8
+ pmaddwd m4, m1
+ pmaddwd m5, m3
+ pmaddwd m6, m8
+ pmaddwd m7, m9
+ phaddd m4, m5
+%else
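+    ; x86-32 variant of the above: same math with only 8 xmm registers, so
+    ; the four row loads are interleaved two at a time and m6/m7 are
+    ; spilled to the stack around the filter setup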
+ movd r3, m1
+ pshuflw m1, m1, q3232
+ movd r1, m1
+ punpckhqdq m1, m1
+ movu m4, [srcq+r3*2]
+ movu m5, [srcq+r1*2]
+ movd r3, m1
+ psrlq m1, 32
+ movd r1, m1
+ movu m6, [srcq+r3*2]
+ movu m7, [srcq+r1*2]
+    ; if all edge_emu offsets are zero, the pshufb-based edge emulation can be skipped
+ pxor m1, m1
+ pcmpeqb m1, m3
+ pmovmskb r3d, m1
+ cmp r3d, 0xffff
+ je .filter
+ movd r3, m3
+ movu m1, [base+resize_shuf+8+r3*2]
+ pshuflw m3, m3, q3232
+ movd r1, m3
+ pshufb m4, m1
+ movu m1, [base+resize_shuf+8+r1*2]
+ punpckhqdq m3, m3
+ movd r3, m3
+ pshufb m5, m1
+ movu m1, [base+resize_shuf+8+r3*2]
+ psrlq m3, 32
+ movd r1, m3
+ pshufb m6, m1
+ movu m1, [base+resize_shuf+8+r1*2]
+ pshufb m7, m1
+.filter:
+ mova [esp+4*16], m6
+ mova [esp+5*16], m7
+ movd r3, m2
+ pshuflw m2, m2, q3232
+ movd r1, m2
+ movq m6, [base+resize_filter+r3*8]
+ movq m7, [base+resize_filter+r1*8]
+ pxor m3, m3
+ punpcklbw m1, m3, m6
+ punpcklbw m3, m7
+ psraw m1, 8
+ psraw m3, 8
+ pmaddwd m4, m1
+ pmaddwd m5, m3
+ punpckhqdq m2, m2
+ movd r3, m2
+ psrlq m2, 32
+ movd r1, m2
+ phaddd m4, m5
+ movq m2, [base+resize_filter+r3*8]
+ movq m5, [base+resize_filter+r1*8]
+ mova m6, [esp+4*16]
+ mova m7, [esp+5*16]
+ pxor m3, m3
+ punpcklbw m1, m3, m2
+ punpcklbw m3, m5
+ psraw m1, 8
+ psraw m3, 8
+ pmaddwd m6, m1
+ pmaddwd m7, m3
+%endif
+ phaddd m6, m7
+ phaddd m4, m6
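+    ; dst[x] = iclip((64 - sum) >> 7, 0, pxmax)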
+ pxor m1, m1
+ psubd m2, m12, m4
+ psrad m2, 7
+ packssdw m2, m2
+ pmaxsw m2, m1
+ pminsw m2, [rsp+16*3*ARCH_X86_32]
+ movq [dstq+xq*2], m2
+ paddd m0, m15
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+ add dstq, dst_stridemp
+ add srcq, src_stridemp
+ dec hd
+ jg .loop_y
+ RET