Diffstat (limited to 'third_party/dav1d/src/x86/mc16_sse.asm')
-rw-r--r--  third_party/dav1d/src/x86/mc16_sse.asm | 8731
1 file changed, 8731 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
new file mode 100644
index 0000000000..fde8e372a3
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -0,0 +1,8731 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+; dav1d_obmc_masks[] << 9
+obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0
+            dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0
+            dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120
+            dw 4096, 3072, 2048, 1536, 0, 0, 0, 0
+            dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
+            dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608
+            dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024
+
+blend_shuf:   db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+unpckw:       db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+rescale_mul:  dd 0, 1, 2, 3
+resize_shuf:  db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+              db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+bdct_lb_q: times 8 db 0
+           times 8 db 4
+           times 8 db 8
+           times 8 db 12
+
+pw_2:          times 8 dw 2
+pw_16:         times 4 dw 16
+prep_mul:      times 4 dw 16
+               times 8 dw 4
+pw_64:         times 8 dw 64
+pw_256:        times 8 dw 256
+pw_2048:       times 4 dw 2048
+bidir_mul:     times 4 dw 2048
+pw_8192:       times 8 dw 8192
+pw_27615:      times 8 dw 27615
+pw_32766:      times 8 dw 32766
+pw_m512:       times 8 dw -512
+pd_63:         times 4 dd 63
+pd_64:         times 4 dd 64
+pd_512:        times 4 dd 512
+pd_m524256:    times 4 dd -524256 ; -8192 << 6 + 32
+pd_0x3ff:      times 4 dd 0x3ff
+pd_0x4000:     times 4 dd 0x4000
+pq_0x400000:   times 2 dq 0x400000
+pq_0x40000000: times 2 dq 0x40000000
+pd_65538:      times 2 dd 65538
+
+put_bilin_h_rnd:  times 4 dw 8
+                  times 4 dw 10
+s_8tap_h_rnd:     times 2 dd 2
+                  times 2 dd 8
+put_s_8tap_v_rnd: times 2 dd 512
+                  times 2 dd 128
+s_8tap_h_sh:      dd 2, 4
+put_s_8tap_v_sh:  dd 10, 8
+bidir_rnd: times 4 dw -16400 + times 4 dw -16388 +put_8tap_h_rnd: dd 34, 34, 40, 40 +prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) +prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) + +warp8x8_shift: dd 11, 13 +warp8x8_rnd1: dd 1024, 1024, 4096, 4096 +warp8x8_rnd2: times 4 dw 4096 + times 4 dw 16384 +warp8x8t_rnd: times 2 dd 16384 - (8192 << 15) + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put) +%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep) + +BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 + +%macro SCALED_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) +%%table: + %rep %0 - 2 + dw %%base %+ .w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_1024: + %xdefine %1_%2_dy1_table (%%dy_1024 - %3) + %rep %0 - 2 + dw %%base %+ .dy1_w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_2048: + %xdefine %1_%2_dy2_table (%%dy_2048 - %3) + %rep %0 - 2 + dw %%base %+ .dy2_w%3 - %%base + %rotate 1 + %endrep +%endmacro + +SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter +cextern resize_filter + +SECTION .text + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +INIT_XMM ssse3 +cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy +%define base t0-put_ssse3 + mov mxyd, r6m ; mx + LEA t0, put_ssse3 + movifnidn wd, wm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + add wq, t0 + movifnidn hd, hm + jmp wq +.put_w2: + mov r4d, [srcq+ssq*0] + mov r6d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r4d + mov [dstq+dsq*1], r6d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq [dstq+dsq*0], m0 + movq [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0+16*0] + movu m1, [srcq+ssq*0+16*1] + movu m2, [srcq+ssq*1+16*0] + movu m3, [srcq+ssq*1+16*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+16*0], m0 + mova [dstq+dsq*0+16*1], m1 + mova [dstq+dsq*1+16*0], m2 + mova [dstq+dsq*1+16*1], m3 + lea 
dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, ssq + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + add dstq, dsq + dec hd + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + add srcq, 16*8 + add dstq, 16*8 +.put_w128_loop: + movu m0, [srcq-16*8] + movu m1, [srcq-16*7] + movu m2, [srcq-16*6] + movu m3, [srcq-16*5] + mova [dstq-16*8], m0 + mova [dstq-16*7], m1 + mova [dstq-16*6], m2 + mova [dstq-16*5], m3 + movu m0, [srcq-16*4] + movu m1, [srcq-16*3] + movu m2, [srcq-16*2] + movu m3, [srcq-16*1] + mova [dstq-16*4], m0 + mova [dstq-16*3], m1 + mova [dstq-16*2], m2 + mova [dstq-16*1], m3 + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w128_loop + RET +.h: + movd m5, mxyd + mov mxyd, r7m ; my + mova m4, [base+pw_16] + pshufb m5, [base+pw_256] + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + mov r6d, r8m ; bitdepth_max + shr r6d, 11 + movddup m3, [base+put_bilin_h_rnd+r6*8] + movifnidn hd, hm + sub wd, 8 + jg .h_w16 + je .h_w8 + cmp wd, -4 + je .h_w4 +.h_w2: + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw m0, m4, m1 + psrlq m1, 16 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movd [dstq+dsq*0], m0 + punpckhqdq m0, m0 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq m0, [srcq+ssq*0] + movhps m0, [srcq+ssq*1] + movq m1, [srcq+ssq*0+2] + movhps m1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+r6*2+16*0], m0 + mova [dstq+r6*2+16*1], m1 + add r6, 16 + jl .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16_loop0 + RET +.v: + shl mxyd, 11 + movd m5, mxyd + pshufb 
m5, [base+pw_256] + movifnidn hd, hm + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd m0, [srcq+ssq*0] +.v_w2_loop: + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movd m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq m0, [srcq+ssq*0] +.v_w4_loop: + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movq m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: +%if ARCH_X86_64 +%if WIN64 + push r7 +%endif + shl wd, 5 + mov r7, srcq + lea r6d, [wq+hq-256] + mov r4, dstq +%else + mov r6, srcq +%endif +.v_w8_loop0: + movu m0, [srcq+ssq*0] +.v_w8_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m5 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m5 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop +%if ARCH_X86_64 + add r7, 16 + add r4, 16 + movzx hd, r6b + mov srcq, r7 + mov dstq, r4 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .v_w8_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 8 + shl mxyd, 11 + mova m3, [base+pw_2] + movd m6, mxyd + mova m7, [base+pw_8192] + pshufb m6, [base+pw_256] + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + mova m7, [base+pw_2048] +.hv_12bpc: + movifnidn hd, hm + cmp wd, 4 + jg .hv_w8 + je .hv_w4 +.hv_w2: + movddup m0, [srcq+ssq*0] + pshufhw m1, m0, q0321 + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w2_loop: + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps m2, [srcq+ssq*0] + pmullw m1, m4, m2 + psrlq m2, 16 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 _ 2 _ + shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + movddup m0, [srcq+ssq*0] + movddup m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w4_loop: + movq m1, [srcq+ssq*1] + movq m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps m1, [srcq+ssq*0] + movhps m2, [srcq+ssq*0+2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: +%if ARCH_X86_64 +%if WIN64 + push r7 +%endif + shl wd, 5 + lea r6d, [wq+hq-256] + mov r4, srcq + mov r7, dstq +%else + mov r6, srcq +%endif +.hv_w8_loop0: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w8_loop: + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 + psubw m2, m1, m0 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m0 + pmulhrsw m2, m7 + mova [dstq+dsq*0], m2 + movu m0, [srcq+ssq*0] + 
movu m2, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m2, m5 + paddw m0, m3 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m1 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m1 + pmulhrsw m2, m7 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .hv_w8_loop0 +%if WIN64 + pop r7 +%endif + RET + +cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 +%define base r6-prep_ssse3 + movifnidn mxyd, r5m ; mx + LEA r6, prep_ssse3 + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [base+prep_ssse3_table+wq*2] + mov r5d, r7m ; bitdepth_max + mova m5, [base+pw_8192] + add wq, r6 + shr r5d, 11 + movddup m4, [base+prep_mul+r5*8] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*2] + movhps m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .prep_w16 + RET +.prep_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + dec hd + jg .prep_w32 + RET +.prep_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + add tmpq, 16*8 + dec hd + jg .prep_w64 + RET +.prep_w128: + movu m0, [srcq+16* 0] + movu m1, [srcq+16* 1] + movu m2, [srcq+16* 2] + movu m3, [srcq+16* 3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16* 4] + movu m1, [srcq+16* 5] + movu m2, [srcq+16* 6] + movu m3, [srcq+16* 7] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + 
movu m0, [srcq+16* 8] + movu m1, [srcq+16* 9] + movu m2, [srcq+16*10] + movu m3, [srcq+16*11] + add tmpq, 16*16 + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*8], m0 + mova [tmpq-16*7], m1 + mova [tmpq-16*6], m2 + mova [tmpq-16*5], m3 + movu m0, [srcq+16*12] + movu m1, [srcq+16*13] + movu m2, [srcq+16*14] + movu m3, [srcq+16*15] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*4], m0 + mova [tmpq-16*3], m1 + mova [tmpq-16*2], m2 + mova [tmpq-16*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd m4, mxyd + mov mxyd, r6m ; my + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m3, 2 + psllw m4, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + sub wd, 8 + je .h_w8 + jg .h_w16 +.h_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*0+2] + movhps m1, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + lea srcq, [srcq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + add r6, 16 + jl .h_w16_loop + add srcq, strideq + dec hd + jg .h_w16_loop0 + RET +.v: + movd m4, mxyd + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 + psllw m4, 2 +.v_12bpc: + cmp wd, 8 + je .v_w8 + jg .v_w16 +.v_w4: + movq m0, [srcq+strideq*0] +.v_w4_loop: + movq m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklqdq m1, m0, m2 ; 0 1 + movq m0, [srcq+strideq*0] + punpcklqdq m2, m0 ; 1 2 + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu m0, [srcq+strideq*0] +.v_w8_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+16*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.v_w16_loop0: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+wq*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+wq*2], m1 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + 
jg .v_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .v_w16_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + shl mxyd, 11 + movd m6, mxyd + pshufb m6, [base+pw_256] + cmp wd, 8 + je .hv_w8 + jg .hv_w16 +.hv_w4: + movddup m0, [srcq+strideq*0] + movddup m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w4_loop: + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + movhps m1, [srcq+strideq*0] + movhps m2, [srcq+strideq*0+2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w8_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+16*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.hv_w16_loop0: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+wq*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+wq*2], m2 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .hv_w16_loop0 +%if WIN64 + pop r7 +%endif + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; prefix, type, type_h, type_v +cglobal %1_%2_16bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) +%endif +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 1, 2, 6 +%elif WIN64 +DECLARE_REG_TMP 4, 5, 8 +%else +DECLARE_REG_TMP 7, 8, 8 +%endif + +%define PUT_8TAP_FN FN put_8tap, +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN 
regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r1b +%define myd r1 +%define myq r1 +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%else +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%endif +%define base t2-put_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, put_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + movifnidn ssq, ssmp + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + movifnidn dstq, dstmp + movifnidn dsq, dsmp + add wq, t2 +%if WIN64 + pop r8 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + mov myd, r8m + movd m5, r8m + shr myd, 11 + movddup m4, [base+put_8tap_h_rnd+myq*8] + movifnidn dsq, dsmp + pshufb m5, [base+pw_256] + cmp wd, 4 + jg .h_w8 + movzx mxd, mxb + lea srcq, [srcq-2] + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + je .h_w4 +.h_w2: + mova m2, [base+spel_h_shuf2] + pshufd m3, m3, q2121 +.h_w2_loop: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m2 + pshufb m1, m2 + pmaddwd m0, m3 + pmaddwd m1, m3 + phaddd m0, m1 + paddd m0, m4 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movd [dstq+dsq*0], m0 + pshuflw m0, m0, q3232 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + WIN64_SPILL_XMM 8 + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] + pshufd m2, m3, q1111 + pshufd m3, m3, q2222 +.h_w4_loop: + movu m1, [srcq] + add srcq, ssq + pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 + pshufb m1, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m2 + pmaddwd m1, m3 + paddd m0, m4 + paddd m0, m1 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movq [dstq], m0 + add dstq, dsq + dec hd + jg .h_w4_loop + RET +.h_w8: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 +%endif + shr mxd, 16 + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] +%if UNIX64 + mov wd, wd +%endif + lea srcq, [srcq+wq*2] + punpcklbw m3, m3 + lea dstq, [dstq+wq*2] + psraw m3, 8 + neg wq +%if ARCH_X86_32 + ALLOC_STACK -16*4 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6*2- 6] + movu m1, [srcq+r6*2+ 2] + pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 + pshufb m0, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m8 ; abcd0 + pmaddwd m0, m9 ; abcd1 + pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 + pshufb m1, m7 ; 6 7 7 8 8 9 9 a + paddd m2, m4 + paddd m0, m2 + pmaddwd m2, m10, m3 ; abcd2 + pmaddwd m3, m8 ; efgh0 + paddd m0, m2 + pmaddwd m2, m11, m1 ; abcd3 + pmaddwd m1, m9 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6*2+10] + paddd m3, m4 + paddd m1, m3 + pshufb m3, m2, m6 ; 8 9 9 a 
a b b c + pshufb m2, m7 ; a b b c c d d e + pmaddwd m3, m10 ; efgh2 + pmaddwd m2, m11 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + mova [dstq+r6*2], m0 + add r6, 8 + jl .h_w8_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if WIN64 + WIN64_SPILL_XMM 15 +%endif + movd m7, r8m + movifnidn dstq, dstmp + movifnidn dsq, dsmp + punpcklbw m3, m3 + pshufb m7, [base+pw_256] + psraw m3, 8 ; sign-extend +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + cmp wd, 2 + jne .v_w4 +.v_w2: + movd m1, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + movd m2, [srcq+ssq*2] + add srcq, r6 + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m6, [srcq+ssq*2] + add srcq, r6 + movd m0, [srcq+ssq*0] + punpckldq m1, m4 ; 0 1 + punpckldq m4, m2 ; 1 2 + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m6 ; 4 5 + punpckldq m6, m0 ; 5 6 + punpcklwd m1, m4 ; 01 12 + punpcklwd m2, m5 ; 23 34 + punpcklwd m3, m6 ; 45 56 + pxor m6, m6 +.v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + punpckldq m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpckldq m4, m0 ; 7 8 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m6 + pavgw m5, m6 + pminsw m5, m7 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcmp, srcq +%endif + lea wd, [wq+hq-(1<<16)] +%else + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] +%endif +.v_w4_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, r6 + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, r6 + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_w4_loop_start +.v_w4_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_w4_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m3 + psrad m1, 5 + psrad m2, 5 + packssdw m1, m2 + pxor m2, m2 + pmaxsw m1, m2 + pavgw m1, m2 + pminsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*29] + mov dstq, 
[esp+4*30] + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + sub wd, 1<<16 +%else +.v_w4_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packssdw m12, m13 + pxor m13, m13 + pmaxsw m12, m13 + pavgw m12, m13 + pminsw m12, m7 + movq [dstq+dsq*0], m12 + movhps [dstq+dsq*1], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .v_w4_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if ARCH_X86_32 + movd m4, r8m + mova m6, [base+pd_512] + pshufb m4, [base+pw_256] +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + movd m15, r8m + pshufb m15, [base+pw_256] +%endif + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + je .hv_w4 + movq m0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov dstq, dstmp + mov dsq, dsmp + mova m5, [base+spel_h_shuf2] + ALLOC_STACK -16*8 +%else + mova m6, [base+pd_512] + mova m9, [base+spel_h_shuf2] +%endif + pshuflw m0, m0, q2121 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_w2_10bpc + psraw m7, 2 + psllw m3, 2 +.hv_w2_10bpc: + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 +%if ARCH_X86_32 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m9, m5 + mova m11, m0 + mova m12, m1 + mova m13, m2 + mova m14, m3 + mova m15, m4 +%else + pshufd m11, m3, q0000 + pshufd m12, m3, q1111 + pshufd m13, m3, q2222 + pshufd m14, m3, q3333 +%endif + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + movu m1, [srcq+ssq*2] + add srcq, r6 + movu m4, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m2, m3, m1, m4 +%else + REPX {pshufb x, m9}, m2, m3, m1, m4 +%endif + REPX {pmaddwd x, m7}, m2, m3, m1, m4 + phaddd m2, m3 ; 0 1 + phaddd m1, m4 ; 2 3 + movu m3, [srcq+ssq*1] + movu m4, [srcq+ssq*2] + add srcq, r6 + movu m0, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m3, m4, m0 +%else + REPX {pshufb x, m9}, m3, m4, m0 +%endif + REPX {pmaddwd x, m7}, m3, m4, m0 + phaddd m3, m4 ; 4 5 + phaddd m0, m0 ; 6 6 + REPX {paddd x, m6}, m2, m1, m3, m0 + REPX {psrad x, 10}, m2, m1, m3, m0 + packssdw m2, m1 ; 0 1 2 3 + packssdw m3, m0 ; 4 5 6 _ + palignr m4, m3, m2, 4 ; 1 2 3 4 + pshufd m5, m3, q0321 ; 5 6 _ _ + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + punpcklwd m3, m5 ; 45 56 +.hv_w2_loop: + movu m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu m5, [srcq+ssq*0] + pshufb m4, m9 + pshufb m5, m9 + pmaddwd m4, m7 + pmaddwd m5, m7 + phaddd m4, m5 + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + paddd m4, m6 + psrad m4, 10 ; 7 8 + packssdw m0, m4 + pshufd m3, m0, q2103 + punpckhwd m3, m0 
; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m5, m6 + paddd m5, m4 + psrad m5, 10 + packssdw m5, m5 + pxor m4, m4 + pminsw m5, m15 + pmaxsw m5, m4 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w8: + shr mxd, 16 +.hv_w4: + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + mov dstq, dstmp + mov dsq, dsmp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + ALLOC_STACK -16*15 + mova m8, m0 + mova m9, m1 + mova m14, m6 +%else + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m3, 8 + test dword r8m, 0x800 + jz .hv_w4_10bpc + psraw m0, 2 + psllw m3, 2 +.hv_w4_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 +%if ARCH_X86_32 + %define tmp esp+16*8 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcmp, srcq +%endif + mova [tmp+16*5], m4 + lea wd, [wq+hq-(1<<16)] + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-104 ; red zone +%endif + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 + mova [tmp+16*5], m15 +%endif + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] + pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 + pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 + pmaddwd m%3, m10 + pmaddwd m%1, m11 + paddd m%3, %5 + paddd m%1, m%3 + pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + pmaddwd m%3, m12 + pmaddwd m%2, m13 + paddd m%1, m%3 + paddd m%1, m%2 + psrad m%1, %4 +%endmacro +.hv_w4_loop0: +%if ARCH_X86_64 + mova m14, [pd_512] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + movu m6, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 4, 1, 0, 10 + PUT_8TAP_HV_H 5, 2, 0, 10 + PUT_8TAP_HV_H 6, 3, 0, 10 + movu m7, [srcq+ssq*0+0] + movu m2, [srcq+ssq*0+8] + movu m1, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + PUT_8TAP_HV_H 7, 2, 0, 10 + PUT_8TAP_HV_H 1, 3, 0, 10 + movu m2, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 2, 3, 0, 10 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 10 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_w4_loop_start +.hv_w4_loop: + mova m1, [tmp+16*6] + mova m2, m15 +.hv_w4_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*6], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + 
paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 9 + psrad m2, 9 + packssdw m1, m2 + pxor m7, m7 + pmaxsw m1, m7 + pavgw m7, m1 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*61] + mov dstq, [esp+4*62] + add srcq, 8 + add dstq, 8 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + movzx hd, ww + sub wd, 1<<16 +%else +.hv_w4_loop: + mova m15, [tmp+16*1] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 9 + psrad m15, 9 + packssdw m14, m15 + pxor m7, m7 + pmaxsw m14, m7 + pavgw m7, m14 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .hv_w4_loop0 + RET +%undef tmp + +%if ARCH_X86_32 +DECLARE_REG_TMP 2, 1, 6, 4 +%elif WIN64 +DECLARE_REG_TMP 6, 4, 7, 4 +%else +DECLARE_REG_TMP 6, 7, 7, 8 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r2b +%define myd r2 +%define myq r2 +%else +cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my +%endif +%define base t2-prep_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, prep_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + test mxd, 0xf00 + jnz .h + movifnidn hd, hm + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov myd, r7m ; bitdepth_max + movzx wd, word [base+prep_ssse3_table+wq*2] + mova m5, [base+pw_8192] + shr myd, 11 + add wq, t2 + movddup m4, [base+prep_mul+myq*8] + movifnidn ssq, ssmp + movifnidn tmpq, tmpmp + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + movifnidn ssq, r2mp + movifnidn 
hd, r4m + movddup m5, [base+prep_8tap_1d_rnd] + cmp wd, 4 + jne .h_w8 + movzx mxd, mxb + movq m0, [base+subpel_filters+mxq*8] + mova m3, [base+spel_h_shufA] + mova m4, [base+spel_h_shufB] + movifnidn tmpq, tmpmp + sub srcq, 2 + WIN64_SPILL_XMM 8 + punpcklbw m0, m0 + psraw m0, 8 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw m0, 2 +.h_w4_12bpc: + pshufd m6, m0, q1111 + pshufd m7, m0, q2222 +.h_w4_loop: + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + WIN64_SPILL_XMM 11 + shr mxd, 16 + movq m2, [base+subpel_filters+mxq*8] + mova m4, [base+spel_h_shufA] + mova m6, [base+spel_h_shufB] + movifnidn tmpq, r0mp + add wd, wd + punpcklbw m2, m2 + add srcq, wq + psraw m2, 8 + add tmpq, wq + neg wq + test dword r7m, 0x800 + jnz .h_w8_12bpc + psllw m2, 2 +.h_w8_12bpc: + pshufd m7, m2, q0000 +%if ARCH_X86_32 + ALLOC_STACK -16*3 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 +%else + pshufd m8, m2, q1111 + pshufd m9, m2, q2222 + pshufd m10, m2, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6- 6] + movu m1, [srcq+r6+ 2] + pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 + pshufb m0, m6 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m7 ; abcd0 + pmaddwd m0, m8 ; abcd1 + pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 + pshufb m1, m6 ; 6 7 7 8 8 9 9 a + paddd m2, m5 + paddd m0, m2 + pmaddwd m2, m9, m3 ; abcd2 + pmaddwd m3, m7 ; efgh0 + paddd m0, m2 + pmaddwd m2, m10, m1 ; abcd3 + pmaddwd m1, m8 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6+10] + paddd m3, m5 + paddd m1, m3 + pshufb m3, m2, m4 ; a b b c c d d e + pshufb m2, m6 ; 8 9 9 a a b b c + pmaddwd m3, m9 ; efgh2 + pmaddwd m2, m10 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq+r6], m0 + add r6, 16 + jl .h_w8_loop + add srcq, ssq + sub tmpq, wq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + WIN64_SPILL_XMM 15 + movddup m7, [base+prep_8tap_1d_rnd] + movifnidn ssq, r2mp + movifnidn tmpq, r0mp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 +.v_12bpc: +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_64 + mov r7, tmpq +%elif STACK_ALIGNMENT < 16 + mov [esp+4*29], tmpq +%endif + lea wd, [wq+hq-(1<<8)] +.v_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m5, [srcq+ssq*0] + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if 
ARCH_X86_32 + jmp .v_loop_start +.v_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m7 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m7 + paddd m2, m3 + psrad m1, 4 + psrad m2, 4 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*29] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*29], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.v_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m7 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m7 + paddd m13, m14 + psrad m12, 4 + psrad m13, 4 + packssdw m12, m13 + movq [tmpq+r6*0], m12 + movhps [tmpq+r6*2], m12 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .v_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + movzx t3d, mxb + shr mxd, 16 + cmp wd, 4 + cmove mxd, t3d + movifnidn hd, r4m + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov ssq, r2mp + mov tmpq, r0mp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + mova m4, [base+prep_8tap_2d_rnd] + ALLOC_STACK -16*14 + mova m8, m0 + mova m9, m1 + mova m14, m4 +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m0, 4 + psraw m3, 8 + test dword r7m, 0x800 + jz .hv_10bpc + psraw m0, 2 +.hv_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_32 + %define tmp esp+16*8 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], tmpq +%endif + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-88 ; red zone +%endif + mov r7, tmpq + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 +%endif + lea wd, [wq+hq-(1<<8)] + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +.hv_loop0: +%if ARCH_X86_64 + mova m14, [prep_8tap_2d_rnd] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m6, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + 
PUT_8TAP_HV_H 4, 1, 0, 6 + PUT_8TAP_HV_H 5, 2, 0, 6 + PUT_8TAP_HV_H 6, 3, 0, 6 + movu m7, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m1, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 7, 2, 0, 6 + PUT_8TAP_HV_H 1, 3, 0, 6 + movu m2, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 2, 3, 0, 6 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 6 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_loop_start +.hv_loop: + mova m1, [tmp+16*5] + mova m2, m15 +.hv_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*5], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m14 + paddd m2, m14 + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 6 + psrad m2, 6 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*61] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*61], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.hv_loop: + mova m15, [tmp+16*1] + mova m7, [prep_8tap_2d_rnd] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + paddd m14, m7 + paddd m15, m7 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 6 + psrad m15, 6 + packssdw m14, m15 + movq [tmpq+r6*0], m14 + movhps [tmpq+r6*2], m14 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .hv_loop0 + RET +%undef tmp + +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro SAVE_REG 1 + %xdefine r%1_save r%1 + %xdefine r%1q_save r%1q + %xdefine r%1d_save r%1d + %if ARCH_X86_32 + %define r%1m_save [rstk+stack_offset+(%1+1)*4] + %endif +%endmacro + +%macro LOAD_REG 1 + %xdefine r%1 r%1_save + %xdefine r%1q r%1q_save + %xdefine r%1d r%1d_save + %if ARCH_X86_32 + %define r%1m r%1m_save + %endif + %undef r%1d_save + %undef r%1q_save + %undef r%1_save +%endmacro + +%macro 
REMAP_REG 2-3 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d + %if ARCH_X86_32 + %if %3 == 0 + %xdefine r%1m r%2m + %else + %define r%1m [rstk+stack_offset+(%1+1)*4] + %endif + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %if ARCH_X86_64 + SAVE_REG 14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %else + SAVE_REG 5 + %assign %%i 5 + %rep 5 + %assign %%j %%i-1 + REMAP_REG %%i, %%j, 0 + %assign %%i %%i-1 + %endrep + %endif + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %if ARCH_X86_64 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + LOAD_REG 14 + %else + %rep 4 + %assign %%j %%i+1 + REMAP_REG %%i, %%j, 1 + %assign %%i %%i+1 + %endrep + LOAD_REG 5 + %endif + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%if ARCH_X86_32 + %macro MC_4TAP_SCALED_H 1 ; dst_mem + movu m7, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m5, [r4 +ssq*0] + movu m6, [r4 +ssq*1] + lea srcq, [srcq+ssq*2] + lea r4, [r4 +ssq*2] + REPX {pshufb x, m12}, m7, m2 + REPX {pmaddwd x, m13}, m7, m2 + REPX {pshufb x, m14}, m5, m6 + REPX {pmaddwd x, m15}, m5, m6 + phaddd m7, m5 + phaddd m2, m6 + mova m5, [esp+0x00] + movd m6, [esp+0x10] + paddd m7, m5 + paddd m2, m5 + psrad m7, m6 + psrad m2, m6 + packssdw m7, m2 + mova [stk+%1], m7 + %endmacro +%endif + +%if ARCH_X86_64 + %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] + movu m%1, [srcq+ r4*2] + movu m%2, [srcq+ r6*2] + movu m%3, [srcq+ r7*2] + movu m%4, [srcq+ r9*2] + movu m%5, [srcq+r10*2] + movu m%6, [srcq+r11*2] + movu m%7, [srcq+r13*2] + movu m%8, [srcq+ rX*2] + add srcq, ssq + pmaddwd m%1, [stk+0x10] + pmaddwd m%2, [stk+0x20] + pmaddwd m%3, [stk+0x30] + pmaddwd m%4, [stk+0x40] + pmaddwd m%5, [stk+0x50] + pmaddwd m%6, [stk+0x60] + pmaddwd m%7, [stk+0x70] + pmaddwd m%8, [stk+0x80] + phaddd m%1, m%2 + phaddd m%3, m%4 + phaddd m%5, m%6 + phaddd m%7, m%8 + phaddd m%1, m%3 + phaddd m%5, m%7 + paddd m%1, hround + paddd m%5, hround + psrad m%1, m12 + psrad m%5, m12 + packssdw m%1, m%5 + %endmacro +%else + %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets + %if %3 == 1 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + %endif + movu m0, [srcq+r0*2] + movu m1, [srcq+rX*2] + movu m2, [srcq+r4*2] + movu m3, [srcq+r5*2] + mov r0, [stk+16] + mov rX, [stk+20] + mov r4, [stk+24] + mov r5, [stk+28] + pmaddwd m0, [stk+%1+0x00] + pmaddwd m1, [stk+%1+0x10] + pmaddwd m2, [stk+%1+0x20] + pmaddwd m3, [stk+%1+0x30] + phaddd m0, m1 + phaddd m2, m3 + movu m4, [srcq+r0*2] + movu m5, [srcq+rX*2] + movu m6, [srcq+r4*2] + movu m7, [srcq+r5*2] + add srcq, ssq + pmaddwd m4, [stk+%1+0xa0] + pmaddwd m5, [stk+%1+0xb0] + pmaddwd m6, [stk+%1+0xc0] + pmaddwd m7, [stk+%1+0xd0] + phaddd m4, m5 + phaddd m6, m7 + phaddd m0, m2 + phaddd m4, m6 + paddd m0, hround + paddd m4, hround + psrad m0, m12 + psrad m4, m12 + packssdw m0, m4 + %if %2 != 0 + mova [stk+%2], m0 + %endif + %endmacro +%endif + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isput 1 + %assign isprep 0 + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + 
%else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %endif + %xdefine base_reg r12 +%else ; prep + %assign isput 0 + %assign isprep 1 + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %define tmp_stridem qword [stk+0x138] + %endif + %xdefine base_reg r11 + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %define tmp_stridem dword [stk+0x138] + %endif +%endif +%if ARCH_X86_32 + mov [esp+0x1f0], t0d + mov [esp+0x1f4], t1d + %if isput && required_stack_alignment > STACK_ALIGNMENT + mov dstd, dstm + mov dsd, dsm + mov srcd, srcm + mov ssd, ssm + mov hd, hm + mov r4, mxm + %define r0m [esp+0x200] + %define dsm [esp+0x204] + %define dsmp dsm + %define r1m dsm + %define r2m [esp+0x208] + %define ssm [esp+0x20c] + %define r3m ssm + %define hm [esp+0x210] + %define mxm [esp+0x214] + mov r0m, dstd + mov dsm, dsd + mov r2m, srcd + mov ssm, ssd + mov hm, hd + mov r0, mym + mov r1, dxm + mov r2, dym + %define mym [esp+0x218] + %define dxm [esp+0x21c] + %define dym [esp+0x220] + mov mxm, r4 + mov mym, r0 + mov dxm, r1 + mov dym, r2 + tzcnt wd, wm + %endif + %if isput + mov r3, pxmaxm + %define pxmaxm r3 + %else + mov r2, pxmaxm + %endif + %if isprep && required_stack_alignment > STACK_ALIGNMENT + %xdefine base_reg r5 + %else + %xdefine base_reg r6 + %endif +%endif + LEA base_reg, %1_8tap_scaled_16bpc_ssse3 +%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3 +%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT + tzcnt wd, wm +%endif +%if ARCH_X86_64 + %if isput + mov r7d, pxmaxm + %endif +%else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 +%endif + movd m8, dxm + movd m14, mxm +%if isput + movd m15, pxmaxm +%endif + pshufd m8, m8, q0000 + pshufd m14, m14, q0000 +%if isput + pshuflw m15, m15, q0000 + punpcklqdq m15, m15 +%endif +%if isprep + %if UNIX64 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 + %endif + %if ARCH_X86_64 + mov r6d, pxmaxm + %endif +%endif +%if ARCH_X86_64 + mov dyd, dym +%endif +%if isput + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %elif ARCH_X86_64 + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %else + %endif + %if ARCH_X86_64 + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0x138] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif + %else + %define rX r1 + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %elif ARCH_X86_64 + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %xdefine hm r7m + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %if ARCH_X86_64 + %define rX r14 + %define rXd r14d + %else + %define rX r3 + %endif +%endif +%if ARCH_X86_64 + shr r7d, 11 + mova m10, [base+pd_0x3ff] + movddup m11, 
[base+s_8tap_h_rnd+r7*8] + movd m12, [base+s_8tap_h_sh+r7*4] + %if isput + movddup m13, [base+put_s_8tap_v_rnd+r7*8] + movd m7, [base+put_s_8tap_v_sh+r7*4] + %define pxmaxm [rsp] + mova pxmaxm, m15 + punpcklqdq m12, m7 + %endif + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q +%else + %define m10 [base+pd_0x3ff] + %define m11 [esp+0x00] + %define m12 [esp+0x10] + shr r3, 11 + movddup m1, [base+s_8tap_h_rnd+r3*8] + movd m2, [base+s_8tap_h_sh+r3*4] + %if isput + %define m13 [esp+0x20] + %define pxmaxm [esp+0x30] + %define stk esp+0x40 + movddup m5, [base+put_s_8tap_v_rnd+r3*8] + movd m6, [base+put_s_8tap_v_sh+r3*4] + mova pxmaxm, m15 + punpcklqdq m2, m6 + mova m13, m5 + %else + %define m13 [base+pd_m524256] + %endif + mov ssd, ssm + mova m11, m1 + mova m12, m2 + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + mov r1, [esp+0x1f4] + lea r0, [ssd*3] + movzx r2, r1b + shr r1, 16 + cmp dword hm, 6 + cmovs r1, r2 + mov [esp+0x1f4], r1 + %if isprep + mov r1, r1m + %endif + mov r2, r2m + sub srcq, r0 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define ss3q r0 + %define myd r4 + %define dyd dword dym + %define hd dword hm +%endif + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + movzx r4, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r4 + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 + %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2, m3 + REPX {pmaddwd x, m15}, m0, m1, m2, m3 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6, m7 + REPX {pmaddwd x, m15}, m4, m5, m6, m7 + phaddd m0, m1 + phaddd m2, m3 + phaddd m4, m5 + phaddd m6, m7 + REPX {paddd x, m11}, m0, m2, m4, m6 + REPX {psrad x, m12}, m0, m2, m4, m6 + packssdw m0, m2 ; 0 1 2 3 + packssdw m4, m6 ; 4 5 6 7 + SWAP m1, m4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m2, m3 + movu m1, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m14}, m1, m7, m6, m3 + REPX {pmaddwd x, m15}, m1, m7, m6, m3 + phaddd m1, m7 + phaddd m6, m3 + REPX {paddd x, m11}, m0, m2, m1, m6 + REPX {psrad x, m12}, m0, m2, m1, m6 + packssdw m0, m2 + packssdw m1, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + %endif + 
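+ ; m0 = rows 0-3, m1 = rows 4-7 after the horizontal pass; the shuffles below
+ ; interleave them into 01/12, 23/34, 45/56, 67 word pairs for the vertical filter in .w2_loop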
palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mov myd, mym + mov r0, r0m + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + %endif +.w2_loop: + and myd, 0x3ff + %if ARCH_X86_64 + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m10, r6q + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pmaddwd m5, m3, m7 + pmaddwd m6, m0, m8 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddwd m7, m2, m9 + pmaddwd m8, m4, m10 + paddd m5, m6 + paddd m7, m8 + %else + mov r1, [esp+0x1f4] + xor r3, r3 + mov r5, myd + shr r5, 6 + lea r1, [r1+r5] + mov r5, 64 << 24 + cmovnz r3, [base+subpel_filters+r1*8+4] + cmovnz r5, [base+subpel_filters+r1*8+0] + movd m6, r3 + movd m7, r5 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m5, m7, q0000 + pshufd m6, m7, q1111 + pmaddwd m3, m5 + pmaddwd m0, m6 + pshufd m5, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m2, m5 + pmaddwd m4, m7 + paddd m3, m0 + paddd m2, m4 + SWAP m5, m3 + SWAP m7, m2 + %define m8 m3 + %endif + paddd m5, m13 + pshufd m6, m12, q1032 + pxor m8, m8 + paddd m5, m7 + psrad m5, m6 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, pxmaxm + movd [dstq], m5 + add dstq, dsmp + dec hd + jz .ret + %if ARCH_X86_64 + add myd, dyd + %else + add myd, dym + %endif + test myd, ~0x3ff + %if ARCH_X86_32 + SWAP m3, m5 + SWAP m2, m7 + mova m3, [stk+0x20] + mova m0, [stk+0x30] + mova m2, [stk+0x40] + mova m4, [stk+0x50] + %endif + jz .w2_loop + %if ARCH_X86_32 + mov r3, r3m + %endif + movu m5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps m3, m0, q1032 ; 01 12 + shufps m0, m2, q1032 ; 23 34 + shufps m2, m4, q1032 ; 45 56 + pshufb m5, m14 + pmaddwd m5, m15 + phaddd m5, m5 + paddd m5, m11 + psrad m5, m12 + packssdw m5, m5 + palignr m4, m5, m1, 12 + punpcklqdq m1, m4, m4 ; 6 7 6 7 + punpcklwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + %endif + jmp .w2_loop +.w2_skip_line: + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m3, m0 ; 01 12 + mova m0, m2 ; 23 34 + pshufb m5, m14 + pshufb m6, m14 + pmaddwd m5, m15 + pmaddwd m6, m15 + phaddd m5, m6 + paddd m5, m11 + psrad m5, m12 + packssdw m5, m5 ; 6 7 6 7 + punpckhqdq m1, m5 ; 4 5 6 7 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + %endif + jmp .w2_loop +%endif +INIT_XMM ssse3 +.w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + movzx r4, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r4 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m9, [base+pd_0x4000] +%else + %define m9 [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, 
[base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r0, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd rX, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, [base+subpel_filters+r0*8+2] + movd m2, [base+subpel_filters+rX*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + movifprep r3, r3m + SWAP m4, m7 + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw m15, m13, m13 + punpcklbw m13, m13 + psraw m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x20], m13 + mova [stk+0x30], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + movu m1, [srcq+r4 ] + movu m3, [srcq+r6 ] + movu m2, [srcq+r11 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m7, m9, m8, m10 + REPX {pmaddwd x, m13}, m7, m9, m8, m10 + REPX {pshufb x, m14}, m1, m2, m3, m4 + REPX {pmaddwd x, m15}, m1, m2, m3, m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m7, m1 + phaddd m9, m3 + phaddd m8, m2 + phaddd m10, m4 + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m4, [srcq+ss3q ] + REPX {paddd x, m5}, m7, m9, m8, m10 + REPX {psrad x, xm6}, m7, m9, m8, m10 + packssdw m7, m9 ; 0 1 + packssdw m8, m10 ; 2 3 + movu m0, [srcq+r4 ] + movu m9, [srcq+r6 ] + movu m10, [srcq+r11 ] + movu m11, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m1, m2, m3, m4 + REPX {pmaddwd x, m13}, m1, m2, m3, m4 + REPX {pshufb x, m14}, m0, m9, m10, m11 + REPX {pmaddwd x, m15}, m0, m9, m10, m11 + phaddd m1, m0 + phaddd m2, m9 + phaddd m3, m10 + phaddd m4, m11 + REPX {paddd x, m5}, m1, m2, m3, m4 + REPX {psrad x, xm6}, m1, m2, m3, m4 + packssdw m1, m2 ; 4 5 + packssdw m3, m4 ; 6 7 + SWAP m9, m1 + shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + pshufd m10, m3, q1032 ; 7 _ + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + punpcklwd m3, m10 ; 67 + mova [rsp+0x40], m7 + mova [rsp+0x50], m8 + mova [rsp+0x60], m9 +%else + mova [stk+0x00], m12 + mova [stk+0x10], m14 + add r4, srcq + MC_4TAP_SCALED_H 0x40 ; 0 1 + MC_4TAP_SCALED_H 0x50 ; 2 3 + MC_4TAP_SCALED_H 0x60 ; 4 5 + MC_4TAP_SCALED_H 0x70 ; 6 7 + mova m4, [stk+0x40] + mova m5, [stk+0x50] + mova m6, [stk+0x60] + mova m7, [stk+0x70] + mov [stk+0xc0], r4 + shufps m1, m4, m5, q1032 ; 1 2 + shufps m2, m5, m6, q1032 ; 3 4 + shufps m3, m6, m7, q1032 ; 5 6 + pshufd m0, m7, q1032 ; 7 _ + mova [stk+0xb0], m0 + punpcklwd m0, m4, m1 ; 01 + punpckhwd m4, m1 ; 12 + punpcklwd m1, m5, m2 ; 23 + punpckhwd m5, m2 ; 34 + punpcklwd m2, m6, m3 ; 45 + punpckhwd m6, m3 ; 56 + punpcklwd m3, m7, [stk+0xb0] ; 67 + mov myd, mym + mov r0, r0m + mova [stk+0x40], m0 ; 01 + mova 
[stk+0x50], m1 ; 23 + mova [stk+0x60], m2 ; 45 + mova [stk+0x70], m3 ; 67 + mova [stk+0x80], m4 ; 12 + mova [stk+0x90], m5 ; 34 + mova [stk+0xa0], m6 ; 56 + %define m12 [stk+0x00] + %define m14 [stk+0x10] + %define m13 [stk+0x20] + %define m15 [stk+0x30] + %define hrnd_mem [esp+0x00] + %define hsh_mem [esp+0x10] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%endif +.w4_loop: + and myd, 0x3ff +%if ARCH_X86_64 + mov r11d, 64 << 24 + mov r13d, myd + shr r13d, 6 + lea r13d, [t1+r13] + cmovnz r11q, [base+subpel_filters+r13*8] + movq m9, r11q + punpcklbw m9, m9 + psraw m9, 8 + pshufd m7, m9, q0000 + pshufd m8, m9, q1111 + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pshufd m7, m9, q2222 + pshufd m9, m9, q3333 + pmaddwd m6, m2, m7 + pmaddwd m8, m3, m9 + %if isput + movd m9, [rsp+0x28] + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif + paddd m4, m5 + paddd m6, m8 + paddd m4, m6 + paddd m4, vrnd_mem +%else + mov mym, myd + mov r5, [esp+0x1f4] + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m0, m4 + pmaddwd m1, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + %if isput + movd m4, [esp+0x18] + %endif + paddd m0, m1 + paddd m2, m3 + paddd m0, vrnd_mem + paddd m0, m2 + SWAP m4, m0 + %define m9 m0 +%endif +%if isput + pxor m5, m5 + psrad m4, m9 + packssdw m4, m4 + pmaxsw m4, m5 + pminsw m4, pxmaxm + movq [dstq], m4 + add dstq, dsmp +%else + psrad m4, 6 + packssdw m4, m4 + movq [tmpq], m4 + add tmpq, 8 +%endif + dec hd + jz .ret +%if ARCH_X86_64 + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + mova m8, [rsp+0x10] + movd m9, [rsp+0x20] + movu m4, [srcq] + movu m5, [srcq+r4] + test myd, 0x400 + jz .w4_skip_line + mova m0, [rsp+0x40] + mova [rsp+0x40], m1 + mova m1, [rsp+0x50] + mova [rsp+0x50], m2 + mova m2, [rsp+0x60] + mova [rsp+0x60], m3 + pshufb m4, m12 + pshufb m5, m14 + pmaddwd m4, m13 + pmaddwd m5, m15 + phaddd m4, m5 + paddd m4, m8 + psrad m4, m9 + packssdw m4, m4 + punpcklwd m3, m10, m4 + mova m10, m4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu m6, [srcq+ssq*1] + movu m7, [srcq+r6] + mova m0, [rsp+0x50] + mova m11, [rsp+0x60] + pshufb m4, m12 + pshufb m6, m12 + pshufb m5, m14 + pshufb m7, m14 + pmaddwd m4, m13 + pmaddwd m6, m13 + pmaddwd m5, m15 + pmaddwd m7, m15 + mova [rsp+0x40], m0 + mova [rsp+0x50], m11 + phaddd m4, m5 + phaddd m6, m7 + paddd m4, m8 + paddd m6, m8 + psrad m4, m9 + psrad m6, m9 + packssdw m4, m6 + punpcklwd m9, m10, m4 + mova [rsp+0x60], m9 + pshufd m10, m4, q1032 + mova m0, m1 + mova m1, m2 + mova m2, m3 + punpcklwd m3, m4, m10 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +%else + SWAP m0, m4 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + jnz .w4_next_line + mova m0, [stk+0x40] + mova m1, [stk+0x50] + mova m2, [stk+0x60] + mova m3, [stk+0x70] + jmp .w4_loop +.w4_next_line: + mov r5, [stk+0xc0] + movu m4, [srcq] + movu m5, [r5] + test myd, 0x400 + jz .w4_skip_line + add [stk+0xc0], ssq + mova m0, [stk+0x80] + mova m3, [stk+0x50] + mova [stk+0x40], m0 + mova [stk+0x80], m3 + mova m1, [stk+0x90] + mova m6, [stk+0x60] + mova [stk+0x50], m1 + mova [stk+0x90], m6 + mova m2, [stk+0xa0] + mova m7, [stk+0x70] + mova [stk+0x60], m2 + mova [stk+0xa0], m7 + pshufb m4, m12 + pshufb m5, m14 + pmaddwd m4, m13 + pmaddwd 
m5, m15 + phaddd m4, m5 + paddd m4, hrnd_mem + psrad m4, hsh_mem + packssdw m4, m4 + punpcklwd m3, [stk+0xb0], m4 + mova [stk+0xb0], m4 + mova [stk+0x70], m3 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu m6, [srcq+ssq*1] + movu m7, [r5 +ssq*1] + lea r5, [r5 +ssq*2] + mov [stk+0xc0], r5 + mova m0, [stk+0x50] + mova m1, [stk+0x60] + mova m2, [stk+0x70] + mova m3, [stk+0x90] + pshufb m4, m12 + pshufb m6, m12 + pshufb m5, m14 + pshufb m7, m14 + pmaddwd m4, m13 + pmaddwd m6, m13 + pmaddwd m5, m15 + pmaddwd m7, m15 + mova [stk+0x40], m0 + mova [stk+0x50], m1 + mova [stk+0x60], m2 + mova [stk+0x80], m3 + phaddd m4, m5 + phaddd m6, m7 + mova m5, [stk+0xa0] + mova m7, [stk+0xb0] + paddd m4, hrnd_mem + paddd m6, hrnd_mem + psrad m4, hsh_mem + psrad m6, hsh_mem + packssdw m4, m6 + punpcklwd m7, m4 + pshufd m6, m4, q1032 + mova [stk+0x90], m5 + mova [stk+0xa0], m7 + mova [stk+0xb0], m6 + punpcklwd m3, m4, m6 + mova [stk+0x70], m3 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +%endif +INIT_XMM ssse3 +%if ARCH_X86_64 + %define stk rsp+0x20 +%endif +.w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.w_start: +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + %define hround m11 + shr t0d, 16 + movd m15, t0d + %if isprep + mova m13, [base+pd_m524256] + %endif +%else + %define hround [esp+0x00] + %define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq ssm + %endif + mov r4, [esp+0x1f0] + shr r4, 16 + movd m15, r4 + mov r0, r0m + mov myd, mym +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq +%if ARCH_X86_64 && UNIX64 + mov hm, hd +%elif ARCH_X86_32 + mov r5, hm + mov [stk+0x0f4], myd + mov [stk+0x134], r5 +%endif + jmp .hloop +.hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + mov hd, hm +%else + add dword [stk+0x130], 16 + mov myd, [stk+0x0f4] + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov mym, myd + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, 
[base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + mov myd, mym + mov dyd, dym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova [stk+0x90], m4 + mova [stk+0xa0], m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m5, m11, q0000 + pshufd m7, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m4, m5, m0 + pmaddwd m5, m5, m1 + pmaddwd m6, m7, m2 + pmaddwd m7, m7, m3 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m10 + pmaddwd m7, [stk+0xa0], m10 + pmaddwd m8, [stk+0xb0], m11 + pmaddwd m9, [stk+0xc0], m11 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova 
[stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, [stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + punpcklbw m4, m0, m0 + punpckhbw m0, m0 + punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov myd, mym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.vloop: + mov r0, r0m + mov r5, [esp+0x1f4] + and myd, 0x3ff + mov mym, myd + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep +%if ARCH_X86_64 + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [stk+0x140], myd + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + jz .skip_line + mova m14, [base+unpckw] + movu m8, [srcq+r10*2] + movu m9, [srcq+r11*2] + movu m10, [srcq+r13*2] + movu m11, [srcq+ rX*2] + movu m4, [srcq+ r4*2] + movu m5, [srcq+ r6*2] + movu m6, [srcq+ r7*2] + movu m7, [srcq+ r9*2] + add srcq, ssq + mov myd, [stk+0x140] + mov dyd, dym + pshufd m15, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 ; 0b 1b + pshufb m2, m15 ; 3a 2a + pshufb m3, m15 ; 3b 2b + pmaddwd m8, [stk+0x50] + pmaddwd m9, [stk+0x60] + pmaddwd m10, [stk+0x70] + pmaddwd m11, [stk+0x80] + pmaddwd m4, [stk+0x10] + pmaddwd m5, [stk+0x20] + pmaddwd m6, [stk+0x30] + pmaddwd m7, [stk+0x40] + phaddd m8, m9 + phaddd m10, m11 + mova m11, 
hround + phaddd m4, m5 + phaddd m6, m7 + phaddd m8, m10 + phaddd m4, m6 + paddd m4, m11 + paddd m8, m11 + psrad m4, m12 + psrad m8, m12 + packssdw m4, m8 + pshufb m5, [stk+0x90], m14 ; 4a 5a + pshufb m6, [stk+0xa0], m14 ; 4b 5b + pshufb m7, [stk+0xb0], m15 ; 7a 6a + pshufb m8, [stk+0xc0], m15 ; 7b 6b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + punpckhwd m5, m7 ; 56a + punpckhwd m6, m8 ; 56b + punpcklwd m7, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m8, m4 ; 78b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m8 + jmp .vloop +.skip_line: + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11 + MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11 + mov myd, [stk+0x140] + mov dyd, dym + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [stk+0x90] ; 23a + mova m3, [stk+0xa0] ; 23b + mova m5, [stk+0xb0] ; 45a + mova m6, [stk+0xc0] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m4 +%else + mov r0m, r0 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + mov mym, myd + jnz .next_line + mova m0, [stk+0x20] + mova m1, [stk+0x30] + mova m2, [stk+0x40] + mova m3, [stk+0x50] + jmp .vloop +.next_line: + test myd, 0x400 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + jz .skip_line + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + mova m7, [base+unpckw] + pshufd m4, m7, q1032 + pshufb m0, [stk+0x20], m7 ; 0a 1a + pshufb m1, [stk+0x30], m7 ; 0b 1b + pshufb m2, [stk+0x40], m4 ; 3a 2a + pshufb m3, [stk+0x50], m4 ; 3b 2b + pshufb m5, [stk+0x60], m7 ; 4a 5a + pshufb m6, [stk+0x70], m7 ; 4b 5b + pshufb m7, [stk+0x80], m4 ; 7a 6a + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + punpckhwd m5, m7 ; 56a + mova [stk+0x60], m5 + pshufb m5, [stk+0x90], m4 ; 7b 6b + punpcklwd m7, [stk+0xe0] ; 78a + punpckhwd m6, m5 ; 56b + mova [stk+0x70], m6 + movq m6, [stk+0xe8] + mova [stk+0x80], m7 + punpcklwd m5, m6 + mov myd, mym + mova [stk+0x90], m5 + jmp .vloop +.skip_line: + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + MC_8TAP_SCALED_H 0xa0, 0 ; 9 + mova m7, [stk+0xe0] + mova m2, [stk+0x60] ; 23a + mova m3, [stk+0x70] ; 23b + mova m4, [stk+0x80] ; 45a + mova m5, [stk+0x90] ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova m0, [stk+0x40] ; 01a + mova m1, [stk+0x50] ; 01b + mov myd, mym + mova [stk+0x40], m2 + mova [stk+0x50], m3 + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova [stk+0x20], m0 + mova [stk+0x30], m1 +%endif + jmp .vloop +INIT_XMM ssse3 +.dy1: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy1_w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + %define m11 [esp+0x00] + %define m12 [esp+0x10] + %define m13 [esp+0x20] + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 + mov r1, r1m + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, 
[base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 + %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + mov myd, mym + mov r5, [esp+0x1f4] + xor r3, r3 + shr myd, 6 + lea r5, [r5+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + mov [stk+0x20], r3 + mov r3, r3m + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2, m3 + REPX {pmaddwd x, m15}, m0, m1, m2, m3 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6 + REPX {pmaddwd x, m15}, m4, m5, m6 + phaddd m0, m1 + phaddd m2, m3 + phaddd m4, m5 + phaddd m6, m6 + REPX {paddd x, m11}, m0, m2, m4, m6 + REPX {psrad x, m12}, m0, m2, m4, m6 + packssdw m0, m2 ; 0 1 2 3 + packssdw m4, m6 ; 4 5 6 + SWAP m1, m4 + movq m10, r4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m2, m3 + movu m1, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + REPX {pshufb x, m14}, m1, m7, m6 + REPX {pmaddwd x, m15}, m1, m7, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + phaddd m1, m7 + phaddd m6, m6 + REPX {paddd x, m11}, m0, m2, m1, m6 + REPX {psrad x, m12}, m0, m2, m1, m6 + packssdw m0, m2 + packssdw m1, m6 + %define m8 m6 + %define m9 m4 + %define m10 m5 + movd m10, r4 + movd m9, [stk+0x20] + punpckldq m10, m9 + %endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + %if ARCH_X86_32 + mova [stk+0x50], m7 + mova [stk+0x60], m8 + mova [stk+0x70], m9 + mova [stk+0x80], m10 + %define m7 [stk+0x50] + %define m8 [stk+0x60] + %define m9 [stk+0x70] + %define m10 [stk+0x80] + %endif + palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m4, m1, q2121 ; 5 6 5 6 + punpcklwd m2, m1, m4 ; 45 56 + %if ARCH_X86_32 + mov r0, r0m + %endif +.dy1_w2_loop: + movu m1, [srcq+ssq*0] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m3, m7 + mova m3, m0 + pmaddwd m0, m8 + pshufb m1, m14 + pshufb m6, m14 + pmaddwd m1, m15 + pmaddwd m6, m15 + phaddd m1, m6 + paddd m1, m11 + psrad m1, m12 + packssdw m1, m1 + paddd m5, m0 + mova m0, m2 + pmaddwd m2, m9 + paddd m5, m2 + palignr m2, m1, m4, 12 + punpcklwd m2, m1 ; 67 78 + pmaddwd m4, m2, m10 + paddd m5, m13 + paddd m5, m4 + pxor m6, m6 + mova m4, m1 + pshufd m1, m12, q1032 + psrad m5, m1 + packssdw m5, m5 + pmaxsw m5, m6 + pminsw m5, pxmaxm + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q1032 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +INIT_XMM ssse3 +.dy1_w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %define vrnd_mem [rsp+0x30] + %define stk rsp+0x40 + %else + %define vrnd_mem [base+pd_m524256] + %define stk 
rsp+0x30 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m9 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq r3 + %endif + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m9, [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r0, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd rX, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, [base+subpel_filters+r0*8+2] + movd m2, [base+subpel_filters+rX*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + SWAP m4, m7 + %if isprep + mov r3, r3m + %endif + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw m15, m13, m13 + punpcklbw m13, m13 + psraw m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x40], m13 + mova [stk+0x50], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + movu m1, [srcq+r4 ] + movu m3, [srcq+r6 ] + movu m2, [srcq+r11 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m7, m9, m8, m10 + REPX {pmaddwd x, m13}, m7, m9, m8, m10 + REPX {pshufb x, m14}, m1, m3, m2, m4 + REPX {pmaddwd x, m15}, m1, m3, m2, m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m7, m1 + phaddd m9, m3 + phaddd m8, m2 + phaddd m10, m4 + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + REPX {paddd x, m5}, m7, m9, m8, m10 + REPX {psrad x, xm6}, m7, m9, m8, m10 + packssdw m7, m9 ; 0 1 + packssdw m8, m10 ; 2 3 + movu m0, [srcq+r4 ] + movu m9, [srcq+r6 ] + movu m10, [srcq+r11 ] + add srcq, ss3q + REPX {pshufb x, m12}, m1, m2, m3 + REPX {pmaddwd x, m13}, m1, m2, m3 + REPX {pshufb x, m14}, m0, m9, m10 + REPX {pmaddwd x, m15}, m0, m9, m10 + phaddd m1, m0 + phaddd m2, m9 + phaddd m3, m10 + shr myd, 6 + mov r13d, 64 << 24 + lea myd, [t1+myq] + cmovnz r13q, [base+subpel_filters+myq*8] + REPX {paddd x, m5}, m1, m2, m3 + REPX {psrad x, xm6}, m1, m2, m3 + packssdw m1, m2 ; 4 5 + packssdw m3, m3 ; 6 6 + SWAP m9, m1 + shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + movq m10, r13 + mova [stk+0x00], m1 + mova [stk+0x10], m8 
+ mova [stk+0x20], m2 + mova [stk+0x30], m9 + mova [stk+0x40], m3 + %define hrnd_mem [rsp+0x10] + %define hsh_mem [rsp+0x20] + %define vsh_mem [rsp+0x28] + %if isput + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%else + mova [stk+0x20], m12 + mova [stk+0x30], m14 + add r4, srcq + MC_4TAP_SCALED_H 0x60 ; 0 1 + MC_4TAP_SCALED_H 0x70 ; 2 3 + MC_4TAP_SCALED_H 0x80 ; 4 5 + movu m7, [srcq] + movu m2, [r4] + add srcq, ssq + add r4, ssq + mov [stk+0xb0], r4 + pshufb m7, m12 + pshufb m2, m14 + pmaddwd m7, m13 + pmaddwd m2, m15 + phaddd m7, m2 + paddd m7, [esp+0x00] + psrad m7, [esp+0x10] + packssdw m7, m7 ; 6 6 + mova m4, [stk+0x60] + mova m5, [stk+0x70] + mova m6, [stk+0x80] + mov myd, mym + mov rX, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea rX, [rX+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+rX*8+0] + cmovnz r5, [base+subpel_filters+rX*8+4] + mov r3, r3m + shufps m1, m4, m5, q1032 ; 1 2 + shufps m2, m5, m6, q1032 ; 3 4 + shufps m3, m6, m7, q1032 ; 5 6 + mova [stk+0xa0], m7 + punpcklwd m0, m4, m1 ; 01 + punpckhwd m4, m1 ; 12 + punpcklwd m1, m5, m2 ; 23 + punpckhwd m5, m2 ; 34 + punpcklwd m2, m6, m3 ; 45 + punpckhwd m6, m3 ; 56 + movd m7, r4 + movd m3, r5 + mov r0, r0m + %if isput + mov r1, r1m + %endif + mov r4, [stk+0xb0] + mova [stk+0xc0], m4 ; 12 + mova [stk+0x60], m1 ; 23 + mova [stk+0x70], m2 ; 45 + mova [stk+0x80], m5 ; 34 + mova [stk+0x90], m6 ; 56 + %define m12 [stk+0x20] + %define m14 [stk+0x30] + %define m13 [stk+0x40] + %define m15 [stk+0x50] + %define hrnd_mem [esp+0x00] + %define hsh_mem [esp+0x10] + %define vsh_mem [esp+0x18] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem [base+pd_m524256] + %endif + %define m10 m7 + punpckldq m10, m3 +%endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m3, m10, q0000 + pshufd m4, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 +%if ARCH_X86_32 + %xdefine m8 m3 + %xdefine m9 m6 + %xdefine m11 m5 + %xdefine m6 m4 + mova [stk+0x100], m3 + mova [stk+0x110], m4 + mova [stk+0x120], m5 + mova [stk+0x130], m10 + %define m3 [stk+0x100] + %define m4 [stk+0x110] + %define m5 [stk+0x120] + %define m10 [stk+0x130] + mova m7, [stk+0xc0] + mova m8, [stk+0x80] +%endif +.dy1_w4_loop: + movu m11, [srcq+ssq*0] + movu m6, [srcq+ssq*1] + pmaddwd m0, m3 + pmaddwd m7, m3 + pmaddwd m1, m4 + pmaddwd m8, m4 + pmaddwd m2, m5 + pmaddwd m9, m5 + paddd m1, m0 + paddd m8, m7 +%if ARCH_X86_64 + movu m0, [srcq+r4] + movu m7, [srcq+r6] +%else + movu m0, [r4+ssq*0] + movu m7, [r4+ssq*1] + lea r4, [r4+ssq*2] +%endif + lea srcq, [srcq+ssq*2] + paddd m1, m2 + paddd m8, m9 + pshufb m11, m12 + pshufb m6, m12 + pmaddwd m11, m13 + pmaddwd m6, m13 + pshufb m0, m14 + pshufb m7, m14 + pmaddwd m0, m15 + pmaddwd m7, m15 + phaddd m11, m0 + phaddd m6, m7 + paddd m11, hrnd_mem + paddd m6, hrnd_mem + psrad m11, hsh_mem + psrad m6, hsh_mem + packssdw m11, m6 ; 7 8 +%if ARCH_X86_64 + shufps m9, [stk+0x40], m11, q1032 ; 6 7 + mova m0, [stk+0x00] + mova [stk+0x40], m11 +%else + shufps m9, [stk+0xa0], m11, q1032 ; 6 7 + mova m0, [stk+0x60] + mova [stk+0xa0], m11 +%endif + punpcklwd m2, m9, m11 ; 67 + punpckhwd m9, m11 ; 78 + pmaddwd m6, m2, m10 + pmaddwd m7, m9, m10 +%if isput + movd m11, vsh_mem +%endif + paddd m1, vrnd_mem + paddd m8, vrnd_mem + paddd m1, m6 + paddd m8, m7 +%if ARCH_X86_64 + mova m7, [stk+0x10] +%else + mova m7, [stk+0x80] +%endif +%if isput + psrad m1, m11 + psrad m8, m11 +%else + psrad m1, 6 + psrad m8, 6 +%endif + packssdw m1, m8 +%if ARCH_X86_64 + mova m8, [stk+0x30] +%else + mova m8, [stk+0x90] 
+%endif +%if isput + pxor m6, m6 + pmaxsw m1, m6 + pminsw m1, pxmaxm + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m1 + add tmpq, 16 +%endif +%if ARCH_X86_64 + mova m1, [stk+0x20] + mova [stk+0x10], m8 + mova [stk+0x00], m1 + mova [stk+0x20], m2 + mova [stk+0x30], m9 +%else + mova m1, [stk+0x70] + mova [stk+0x80], m8 + mova [stk+0x60], m1 + mova [stk+0x70], m2 + mova [stk+0x90], m9 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET ; why not jz .ret? +INIT_XMM ssse3 +.dy1_w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %define hround m11 + %if isput + mova [rsp+0x30], m13 + %else + mova m13, [base+pd_m524256] + %endif + shr t0d, 16 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d +%else + %define hround [esp+0x00] + %define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %xdefine m15 m3 + %if isprep + %define ssq ssm + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 + movq m3, r4q +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + SWAP m3, m5 +%endif + punpcklbw m3, m3 + psraw m3, 8 + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 +%if ARCH_X86_64 + mova [stk+0x140], m0 + mova [stk+0x150], m1 + mova [stk+0x160], m2 + mova [stk+0x170], m3 + %if UNIX64 + mov hm, hd + %endif +%else + mova [stk+0x180], m0 + mova [stk+0x190], m1 + mova [stk+0x1a0], m2 + mova [stk+0x1b0], m3 + SWAP m5, m3 + mov r5, hm + mov [stk+0x134], r5 +%endif + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + mov hd, hm +%else + add dword [stk+0x130], 16 + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.dy1_hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, 
[base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova m10, [stk+0x140] + mova m11, [stk+0x150] + mova m14, [stk+0x160] + mova m15, [stk+0x170] + mova [stk+0x90], m4 + mova [stk+0xa0], m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] + %define shift [rsp+0x20] + %if isput + %define vround [rsp+0x30] + %else + %define vround [base+pd_m524256] + %endif +.dy1_vloop: + pmaddwd m4, m0, m10 + pmaddwd m5, m1, m10 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m11 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m14 + pmaddwd m7, [stk+0xa0], m14 + pmaddwd m8, [stk+0xb0], m15 + pmaddwd m9, [stk+0xc0], m15 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova 
[stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, [stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + punpcklbw m4, m0, m0 + punpckhbw m0, m0 + punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov r0, r0m + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.dy1_vloop: + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep +%if ARCH_X86_64 + movu m8, [srcq+r10*2] + movu m9, [srcq+r11*2] + movu m12, [srcq+r13*2] + movu m13, [srcq+ rX*2] + movu m4, [srcq+ r4*2] + movu m5, [srcq+ r6*2] + movu m6, [srcq+ r7*2] + movu m7, [srcq+ r9*2] + add srcq, ssq + pmaddwd m8, [stk+0x50] + pmaddwd m9, [stk+0x60] + pmaddwd m12, [stk+0x70] + pmaddwd m13, [stk+0x80] + pmaddwd m4, [stk+0x10] + pmaddwd m5, [stk+0x20] + pmaddwd m6, [stk+0x30] + pmaddwd m7, [stk+0x40] + phaddd m8, m9 + phaddd m12, m13 + mova m9, [base+unpckw] + mova m13, hround + phaddd m4, m5 + phaddd m6, m7 + phaddd m8, m12 + phaddd m4, m6 + pshufd m5, m9, q1032 + pshufb m0, m9 ; 0a 1a + pshufb m1, m9 ; 0b 1b + pshufb m2, m5 ; 3a 2a + pshufb m3, m5 ; 3b 2b + mova m12, shift + paddd m4, m13 + paddd m8, m13 + psrad m4, m12 + psrad m8, m12 + packssdw m4, m8 + pshufb m6, [stk+0x90], m9 ; 4a 5a + pshufb m7, [stk+0xa0], m9 ; 4b 5b + pshufb m8, [stk+0xb0], m5 ; 7a 6a + pshufb m13, [stk+0xc0], m5 ; 7b 6b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + 
punpcklwd m2, m6 ; 34a + punpcklwd m3, m7 ; 34b + punpckhwd m6, m8 ; 56a + punpckhwd m7, m13 ; 56b + punpcklwd m8, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m13, m4 ; 78b + mova [stk+0x90], m6 + mova [stk+0xa0], m7 + mova [stk+0xb0], m8 + mova [stk+0xc0], m13 + mova m13, vround +%else + mov r0m, r0 + mov r3, r3m + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + mova m7, [base+unpckw] + pshufd m4, m7, q1032 + pshufb m0, [stk+0x20], m7 ; 0a 1a + pshufb m1, [stk+0x30], m7 ; 0b 1b + pshufb m2, [stk+0x40], m4 ; 3a 2a + pshufb m3, [stk+0x50], m4 ; 3b 2b + pshufb m5, [stk+0x60], m7 ; 4a 5a + pshufb m6, [stk+0x70], m7 ; 4b 5b + pshufb m7, [stk+0x80], m4 ; 7a 6a + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + punpckhwd m5, m7 ; 56a + mova [stk+0x60], m5 + pshufb m5, [stk+0x90], m4 ; 7b 6b + punpcklwd m7, [stk+0xe0] ; 78a + mova m4, [stk+0x180] + punpckhwd m6, m5 ; 56b + mova [stk+0x70], m6 + movq m6, [stk+0xe8] + mova [stk+0x80], m7 + mova m7, [stk+0x1b0] + punpcklwd m5, m6 + mova m6, [stk+0x1a0] + mova [stk+0x90], m5 + mova m5, [stk+0x190] + mov r0, r0m +%endif + jmp .dy1_vloop +INIT_XMM ssse3 +%if ARCH_X86_64 + %define stk rsp+0x20 +%endif +.dy2: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy2_w2: + %if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m13 + %define vrnd_mem [rsp+0x10] + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + %define m11 [esp+0x00] + %define m12 [esp+0x10] + %define vrnd_mem [esp+0x20] + mov r1, r1m + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 + %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*2] + movu m2, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*1] + movu m5, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + mov myd, mym + mov r5, [esp+0x1f4] + xor r3, r3 + shr myd, 6 + lea r5, [r5+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + mov [stk+0x20], r3 + mov r3, r3m + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2 + REPX {pmaddwd x, m15}, m0, m1, m2 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6 + REPX {pmaddwd x, m15}, m4, m5, m6 + phaddd m0, m1 + phaddd m1, m2 + phaddd m4, m5 + phaddd m5, m6 + 
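+ ; finish the horizontal pass: add the rounding, shift, then pack the
+ ; even rows (0 2 2 4) and odd rows (1 3 3 5) for the dy2 vertical filter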
REPX {paddd x, m11}, m0, m1, m4, m5 + REPX {psrad x, m12}, m0, m1, m4, m5 + packssdw m0, m1 ; 0 2 2 4 + packssdw m4, m5 ; 1 3 3 5 + SWAP m2, m4 + movq m10, r4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m1, m2 + movu m2, [srcq+ssq*1] + movu m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + REPX {pshufb x, m14}, m2, m7, m6 + REPX {pmaddwd x, m15}, m2, m7, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + phaddd m2, m7 + phaddd m7, m6 + REPX {paddd x, m11}, m0, m1, m2, m7 + REPX {psrad x, m12}, m0, m1, m2, m7 + packssdw m0, m1 + packssdw m2, m7 + %define m8 m6 + %define m9 m4 + %define m10 m5 + movd m10, r4 + movd m9, [stk+0x20] + punpckldq m10, m9 + %endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + %if ARCH_X86_32 + mova [stk+0x50], m7 + mova [stk+0x60], m8 + mova [stk+0x70], m9 + mova [stk+0x80], m10 + %xdefine m13 m7 + %define m7 [stk+0x50] + %define m8 [stk+0x60] + %define m9 [stk+0x70] + %define m10 [stk+0x80] + %endif + punpcklwd m1, m0, m2 ; 01 23 + punpckhwd m3, m0, m2 ; 23 45 + %if ARCH_X86_32 + mov r4, r0m + %define dstq r4 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + %endif +.dy2_w2_loop: + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m13, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd m3, m8 + REPX {pshufb x, m14}, m4, m5, m6, m13 + REPX {pmaddwd x, m15}, m4, m5, m6, m13 + phaddd m4, m5 + phaddd m6, m13 + pmaddwd m5, m1, m7 + paddd m4, m11 + paddd m6, m11 + psrad m4, m12 + psrad m6, m12 + packssdw m4, m6 ; 6 7 8 9 + paddd m5, m3 + pshufd m3, m4, q2200 + pshufd m4, m4, q3311 + palignr m3, m0, 12 ; 4 6 6 8 + palignr m4, m2, 12 ; 5 7 7 9 + mova m0, m3 + mova m2, m4 + punpcklwd m1, m3, m4 + punpckhwd m3, m4 + pmaddwd m6, m1, m9 + pmaddwd m4, m3, m10 + paddd m5, vrnd_mem + paddd m6, m4 + paddd m5, m6 + pshufd m4, m12, q1032 + pxor m6, m6 + psrad m5, m4 + packssdw m5, m5 + pmaxsw m5, m6 + pminsw m5, pxmaxm + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q1032 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +INIT_XMM ssse3 +.dy2_w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %define vrnd_mem [rsp+0x30] + %define stk rsp+0x40 + %else + %define vrnd_mem [base+pd_m524256] + %define stk rsp+0x30 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m9 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq r3 + %endif + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m9, [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r1, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r3, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, [base+subpel_filters+r1*8+2] + movd m2, [base+subpel_filters+r3*8+2] + movd m3, 
[base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + SWAP m4, m7 + mov r3, r3m + %if isprep + lea ss3q, [ssq*3] + %endif + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw m15, m13, m13 + punpcklbw m13, m13 + psraw m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x40], m13 + mova [stk+0x50], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m1, [srcq+ssq*0] + movu m8, [srcq+ssq*2] + movu m9, [srcq+ssq*1] + movu m10, [srcq+ss3q ] + movu m7, [srcq+r4 ] + movu m2, [srcq+r11 ] + movu m3, [srcq+r6 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m1, m9, m8, m10 + REPX {pmaddwd x, m13}, m1, m9, m8, m10 + REPX {pshufb x, m14}, m7, m3, m2, m4 + REPX {pmaddwd x, m15}, m7, m3, m2, m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m1, m7 + phaddd m8, m2 + phaddd m9, m3 + phaddd m10, m4 + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + REPX {paddd x, m5}, m1, m9, m8, m10 + REPX {psrad x, xm6}, m1, m9, m8, m10 + packssdw m1, m8 ; 0 2 + packssdw m9, m10 ; 1 3 + movu m0, [srcq+r4 ] + movu m8, [srcq+r6 ] + lea srcq, [srcq+ssq*2] + REPX {pshufb x, m12}, m2, m3 + REPX {pmaddwd x, m13}, m2, m3 + REPX {pshufb x, m14}, m0, m8 + REPX {pmaddwd x, m15}, m0, m8 + phaddd m2, m0 + phaddd m3, m8 + shr myd, 6 + mov r9d, 64 << 24 + lea myd, [t1+myq] + cmovnz r9q, [base+subpel_filters+myq*8] + REPX {paddd x, m5}, m2, m3 + REPX {psrad x, xm6}, m2, m3 + packssdw m2, m3 ; 4 5 + pshufd m3, m2, q1032 ; 5 _ + punpcklwd m0, m1, m9 ; 01 + punpckhwd m1, m9 ; 23 + punpcklwd m2, m3 ; 45 + movq m10, r9 + %define hrnd_mem [rsp+0x10] + %define hsh_mem [rsp+0x20] + %define vsh_mem [rsp+0x28] + %if isput + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%else + mova [stk+0x20], m12 + mova [stk+0x30], m14 + add r4, srcq + MC_4TAP_SCALED_H 0x60 ; 0 1 + MC_4TAP_SCALED_H 0x70 ; 2 3 + MC_4TAP_SCALED_H 0x80 ; 4 5 + mov [stk+0xe0], r4 + mova m3, [base+spel_s_shuf8] + mova m0, [stk+0x60] + mova m1, [stk+0x70] + mova m2, [stk+0x80] + mov myd, mym + mov rX, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea rX, [rX+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+rX*8+0] + cmovnz r5, [base+subpel_filters+rX*8+4] + mov r3, r3m + pshufb m0, m3 ; 01 + pshufb m1, m3 ; 23 + pshufb m2, m3 ; 45 + movd m7, r4 + movd m4, r5 + mov r5, r0m + %if isput + mov r1, r1m + %endif + mov r4, [stk+0xe0] + %define dstq r5 + %define tmpq r5 + %define m12 [stk+0x20] + %define m14 [stk+0x30] + %define m13 [stk+0x40] + %define m15 [stk+0x50] + %define hrnd_mem [esp+0x00] + %define hsh_mem [esp+0x10] + %define vsh_mem [esp+0x18] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem [base+pd_m524256] + %endif + %define m10 m7 + punpckldq m10, m4 +%endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m3, m10, q0000 + pshufd m4, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 +%if ARCH_X86_32 + %xdefine m8 m3 + %xdefine m9 m6 + %xdefine m11 m5 + %xdefine m6 m4 + mova [stk+0x100], m3 
+ mova [stk+0x110], m4 + mova [stk+0x120], m5 + mova [stk+0x130], m10 + %define m3 [stk+0x100] + %define m4 [stk+0x110] + %define m5 [stk+0x120] + %define m10 [stk+0x130] +%endif +.dy2_w4_loop: + pmaddwd m8, m0, m3 + pmaddwd m9, m1, m3 + mova m0, m2 + pmaddwd m1, m4 + pmaddwd m11, m2, m4 + paddd m8, vrnd_mem + paddd m9, vrnd_mem + pmaddwd m2, m5 + paddd m8, m1 + paddd m9, m11 + paddd m8, m2 + movu m6, [srcq+ssq*0] + movu m1, [srcq+ssq*2] +%if ARCH_X86_64 + movu m11, [srcq+r4 ] + movu m2, [srcq+r11] +%else + movu m11, [r4+ssq*0] + movu m2, [r4+ssq*2] +%endif + pshufb m6, m12 + pshufb m1, m12 + pmaddwd m6, m13 + pmaddwd m1, m13 + pshufb m11, m14 + pshufb m2, m14 + pmaddwd m11, m15 + pmaddwd m2, m15 + phaddd m6, m11 + phaddd m1, m2 + paddd m6, hrnd_mem + paddd m1, hrnd_mem + psrad m6, hsh_mem + psrad m1, hsh_mem + movu m7, [srcq+ssq*1] + movu m11, [srcq+ss3q ] + packssdw m6, m1 ; 6 8 +%if ARCH_X86_64 + movu m2, [srcq+r6 ] + movu m1, [srcq+r13] +%else + movu m2, [r4+ssq*1] + movu m1, [r4+ss3q ] +%endif + pshufb m7, m12 + pshufb m11, m12 + pmaddwd m7, m13 + pmaddwd m11, m13 + pshufb m2, m14 + pshufb m1, m14 + pmaddwd m2, m15 + pmaddwd m1, m15 + phaddd m7, m2 + phaddd m11, m1 + paddd m7, hrnd_mem + paddd m11, hrnd_mem + psrad m7, hsh_mem + psrad m11, hsh_mem + packssdw m7, m11 ; 7 9 +%if ARCH_X86_32 + lea r4, [r4+ssq*4] +%endif + lea srcq, [srcq+ssq*4] + punpcklwd m1, m6, m7 ; 67 + punpckhwd m6, m7 ; 89 + mova m2, m6 + pmaddwd m11, m1, m5 + pmaddwd m7, m1, m10 + pmaddwd m6, m10 + paddd m9, m11 +%if isput + movd m11, vsh_mem +%endif + paddd m8, m7 + paddd m9, m6 +%if isput + psrad m8, m11 + psrad m9, m11 + packssdw m8, m9 + pxor m7, m7 + pmaxsw m8, m7 + pminsw m8, pxmaxm + movq [dstq+dsq*0], m8 + movhps [dstq+dsq*1], m8 + lea dstq, [dstq+dsq*2] +%else + psrad m8, 6 + psrad m9, 6 + packssdw m8, m9 + mova [tmpq], m8 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET ; why not jz .ret? 
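+; The w8..w128 cases below share one code path: the width is processed as w/8
+; column tiles, with dword [stk+0xf0] holding the remaining tile count (and,
+; for prep, tmp_stridem the row stride in bytes). .dy2_hloop_prep advances the
+; saved dst/tmp pointer by 8 pixels, reloads the saved src pointer and the
+; running mx vector, and re-enters .dy2_hloop for the next tile.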
+INIT_XMM ssse3 +.dy2_w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %define hround m11 + %if isput + mova [rsp+0x30], m13 + %else + mova m13, [base+pd_m524256] + %endif + shr t0d, 16 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d +%else + %define hround [esp+0x00] + %define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %xdefine m15 m3 + %if isput + %define dstq r0 + %else + %define tmpq r0 + %define ssq ssm + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 + movq m3, r4q +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + SWAP m3, m5 +%endif + punpcklbw m3, m3 + psraw m3, 8 + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 +%if ARCH_X86_64 + mova [stk+0x140], m0 + mova [stk+0x150], m1 + mova [stk+0x160], m2 + mova [stk+0x170], m3 + %if UNIX64 + mov hm, hd + %endif +%else + mova [stk+0x180], m0 + mova [stk+0x190], m1 + mova [stk+0x1a0], m2 + mova [stk+0x1b0], m3 + SWAP m5, m3 + mov r5, hm + mov [stk+0x134], r5 +%endif + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + mov hd, hm +%else + add dword [stk+0x130], 16 + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.dy2_hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd 
r10d, m15 + movd r11d, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova m10, [stk+0x140] + mova m11, [stk+0x150] + mova m14, [stk+0x160] + mova m15, [stk+0x170] + mova [stk+0x90], m4 + mova [stk+0xa0], m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] + %define shift [rsp+0x20] + %if isput + %define vround [rsp+0x30] + %else + %define vround [base+pd_m524256] + %endif +.dy2_vloop: + pmaddwd m4, m0, m10 + pmaddwd m5, m1, m10 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m11 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m14 + pmaddwd m7, [stk+0xa0], m14 + pmaddwd m8, [stk+0xb0], m15 + pmaddwd m9, [stk+0xc0], m15 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, [stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + 
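+ ; sign-extend the selected 8-bit filter taps to words (each byte is
+ ; duplicated into both halves of a word, then shifted right arithmetically)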
punpcklbw m4, m0, m0 + punpckhbw m0, m0 + punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov r0, r0m + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.dy2_vloop: + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep +%if ARCH_X86_64 + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1 + mova [stk+0xd0], m4 + MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1 + mova m4, [stk+0xd0] + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [stk+0x90] ; 23a + mova m3, [stk+0xa0] ; 23b + mova m5, [stk+0xb0] ; 45a + mova m6, [stk+0xc0] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m4 +%else + mov r0m, r0 + mov r3, r3m + MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8 + MC_8TAP_SCALED_H 0xa0, 0 ; 9 + mova m7, [stk+0xe0] + mova m2, [stk+0x60] ; 23a + mova m3, [stk+0x70] ; 23b + mova m4, [stk+0x80] ; 45a + mova m5, [stk+0x90] ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova m0, [stk+0x40] ; 01a + mova m1, [stk+0x50] ; 01b + mova [stk+0x40], m2 + mova [stk+0x50], m3 + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mov r0, r0m +%endif + jmp .dy2_vloop +INIT_XMM ssse3 +.ret: + MC_8TAP_SCALED_RET 0 +%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT + %define r0m [rstk+stack_offset+ 4] + %define r1m [rstk+stack_offset+ 8] + %define r2m [rstk+stack_offset+12] + %define r3m [rstk+stack_offset+16] +%endif +%undef isput +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled_16bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, (5*15 << 16) | 5*15 + jmp mangle(private_prefix %+ 
_%1_8tap_scaled_16bpc %+ SUFFIX) +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%elif ARCH_X86_64 +DECLARE_REG_TMP 6, 8 +%else +DECLARE_REG_TMP 1, 2 +%endif +BILIN_SCALED_FN put +FN put_8tap_scaled, sharp, SHARP, SHARP +FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN put_8tap_scaled, smooth, SMOOTH, SMOOTH +FN put_8tap_scaled, sharp_regular, SHARP, REGULAR +FN put_8tap_scaled, regular_sharp, REGULAR, SHARP +FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN put_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%elif ARCH_X86_64 +DECLARE_REG_TMP 6, 7 +%else +DECLARE_REG_TMP 1, 2 +%endif +BILIN_SCALED_FN prep +FN prep_8tap_scaled, sharp, SHARP, SHARP +FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH +FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR +FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP +FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN prep_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED prep + +%if ARCH_X86_64 +DECLARE_REG_TMP 6 +%else +DECLARE_REG_TMP 2 +%endif + +%if ARCH_X86_64 +; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that +; by allocating 16 bytes more stack space so that stack offsets match up. +%if WIN64 && STACK_ALIGNMENT == 16 +%assign stksz 16*14 +%else +%assign stksz 16*13 +%endif +cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +%assign stack_size_padded_8x8t stack_size_padded +%else +cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%define m8 [esp+16*13] +%define m9 [esp+16*14] +%define cntd dword [esp+4*63] +%define dstq tmpq +%define dsq 0 +%if STACK_ALIGNMENT < 16 +%define dstm [esp+4*65] +%define dsm [esp+4*66] +%else +%define dstm r0m +%define dsm r1m +%endif +%endif +%define base filterq-$$ + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8t_rnd] +%else + movddup m1, [base+warp8x8t_rnd] + mov r1, r1m + add r1, r1 + mova m8, m1 + mov r1m, r1 ; ds *= 2 +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*4] +%else + add dstq, dsm + mov dstm, dstq +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*0], m1 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*2], m1 + dec cntd + jg .loop + RET + +%if ARCH_X86_64 +cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +ASSERT stack_size_padded == stack_size_padded_8x8t +%else +cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%endif + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8_rnd2+t0*8] + movd m9, r7m ; pixel_max + pshufb m9, [base+pw_256] +%else + movddup m1, [base+warp8x8_rnd2+t0*8] + movd m2, r7m ; pixel_max + pshufb m2, 
[base+pw_256] + mova m8, m1 + mova m9, m2 +%endif + call .main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*2] +%else + add dstq, dsm + mov dstm, dstq +%endif + call .main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*0], m1 + call .main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*1], m1 + dec cntd + jg .loop + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) due to call +%if WIN64 + mov deltaq, r5m + mov mxd, r6m +%endif + movd m0, [base+warp8x8_shift+t0*4] + movddup m7, [base+warp8x8_rnd1+t0*8] + add filterq, mc_warp_filter-$$ +%if ARCH_X86_64 + movsx alphad, word [deltaq+2*0] + movsx betad, word [deltaq+2*1] + movsx gammad, word [deltaq+2*2] + movsx deltad, word [deltaq+2*3] + lea tmpq, [ssq*3] + add mxd, 512+(64<<10) + sub srcq, tmpq ; src -= ss*3 + imul tmpd, alphad, -7 + mov myd, r7m + add betad, tmpd ; beta -= alpha*7 + imul tmpd, gammad, -7 + add myd, 512+(64<<10) + mov cntd, 4 + add deltad, tmpd ; delta -= gamma*7 +%else +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset - gprsize +%endif + mov r3d, r5m ; abcd +%if STACK_ALIGNMENT < 16 + mov r0, r1m ; dst + mov r1, r2m ; ds + mov [esp+gprsize+4*65], r0 + mov [esp+gprsize+4*66], r1 +%endif + movsx alphad, word [r3+2*0] + movsx r2d, word [r3+2*1] + movsx gammad, word [r3+2*2] + movsx r3d, word [r3+2*3] + imul r5d, alphad, -7 + add r2d, r5d ; beta -= alpha*7 + imul r5d, gammad, -7 + mov [esp+gprsize+4*60], r2d + add r3d, r5d ; delta -= gamma*7 + mov [esp+gprsize+4*61], r3d + mov r3d, r4m ; ss + mov srcq, r3m + mov mxd, r6m + mov myd, r7m + mov dword [esp+gprsize+4*63], 4 ; cnt + mov [esp+gprsize+4*62], r3 + lea r3, [r3*3] + add mxd, 512+(64<<10) + add myd, 512+(64<<10) + sub srcq, r3 ; src -= ss*3 +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset + gprsize +%endif +%endif + mova [rsp+gprsize], m0 + pxor m6, m6 + call .h + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 01 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 1], m1 + mova [rsp+gprsize+16* 4], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 12 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 7], m1 + mova [rsp+gprsize+16*10], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 23 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 2], m1 + mova [rsp+gprsize+16* 5], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 34 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 8], m1 + mova [rsp+gprsize+16*11], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 45 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 3], m1 + mova [rsp+gprsize+16* 6], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 56 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 9], m1 + mova [rsp+gprsize+16*12], m5 + mova m5, m0 +.main2: + call .h +%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m4, [filterq+myq*8] ; a + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m2, [filterq+tmpq*8] ; b + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m3, [filterq+myq*8] ; c + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m1, [filterq+tmpq*8] ; d + lea tmpd, [myq+gammaq] + shr myd, 10 + punpcklwd m4, m2 + punpcklwd m3, m1 + punpckldq m2, m4, m3 + punpckhdq m4, m3 + punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 + pmaddwd m1, [rsp+gprsize+16*%1] + punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 + 
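+ ; the interleaved source row pairs (01/23/45, low halves in stack slots
+ ; %1-%3, high halves in %4-%6) are multiplied by the per-column vertical
+ ; taps, and the slots are rotated so the next output row reuses them
+ ; shifted down by one pair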
mova m2, [rsp+gprsize+16*%2] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%1], m2 + paddd m1, m3 + punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 + mova m2, [rsp+gprsize+16*%3] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%2], m2 + paddd m1, m3 + punpcklwd m3, m5, m0 ; 67 + punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 + pmaddwd m2, m3 + mova [rsp+gprsize+16*%3], m3 + paddd m1, m2 + movq m4, [filterq+myq*8] ; e + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] ; f + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m2, [filterq+myq*8] ; g +%if ARCH_X86_64 + lea myd, [tmpq+deltaq] ; my += delta +%else + mov myd, [esp+gprsize+4*61] + add myd, tmpd +%endif + shr tmpd, 10 + punpcklwd m4, m3 + movq m3, [filterq+tmpq*8] ; h + punpcklwd m2, m3 + punpckldq m3, m4, m2 + punpckhdq m4, m2 + punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 + pmaddwd m2, [rsp+gprsize+16*%4] + punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 + mova m3, [rsp+gprsize+16*%5] + pmaddwd m6, m3 + mova [rsp+gprsize+16*%4], m3 + pxor m3, m3 + paddd m2, m6 + punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 + mova m6, [rsp+gprsize+16*%6] + pmaddwd m3, m6 + mova [rsp+gprsize+16*%5], m6 + punpckhwd m5, m0 + pxor m6, m6 + paddd m2, m3 + punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 + pmaddwd m3, m5 + mova [rsp+gprsize+16*%6], m5 + mova m5, m0 + paddd m2, m3 +%endmacro + WARP_V 1, 2, 3, 4, 5, 6 + ret +.main3: + call .h + WARP_V 7, 8, 9, 10, 11, 12 + ret +ALIGN function_align +.h: + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + punpcklbw m0, m6, m3 + movu m3, [srcq-6] + pmaddwd m0, m3 ; 0 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m2, m6, m3 + movu m3, [srcq-4] + pmaddwd m2, m3 ; 1 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m0, m2 ; 0 1 + punpcklbw m2, m6, m3 + movu m3, [srcq-2] + pmaddwd m2, m3 ; 2 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m1, m6, m3 + movu m3, [srcq+0] + pmaddwd m1, m3 ; 3 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m2, m1 ; 2 3 + punpcklbw m1, m6, m3 + movu m3, [srcq+2] + pmaddwd m1, m3 ; 4 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + phaddd m0, m2 ; 0 1 2 3 + punpcklbw m2, m6, m3 + movu m3, [srcq+4] + pmaddwd m2, m3 ; 5 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m1, m2 ; 4 5 + punpcklbw m2, m6, m3 + movu m3, [srcq+6] + pmaddwd m2, m3 ; 6 +%if ARCH_X86_64 + lea mxd, [tmpq+betaq] ; mx += beta +%else + mov mxd, [esp+gprsize*2+4*60] + add mxd, tmpd +%endif + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m4, m6, m3 + movu m3, [srcq+8] +%if ARCH_X86_64 + add srcq, ssq +%else + add srcq, [esp+gprsize*2+4*62] +%endif + pmaddwd m3, m4 ; 7 + phaddd m2, m3 ; 6 7 + phaddd m1, m2 ; 4 5 6 7 + paddd m0, m7 + paddd m1, m7 + psrad m0, [rsp+gprsize*2] + psrad m1, [rsp+gprsize*2] + packssdw m0, m1 + ret + +%macro BIDIR_FN 0 + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.ret: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jne .w8_loop + RET +.w16_loop: + call .main + add dstq, strideq +.w16: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + dec hd + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, 
strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h +%define base r6-avg_ssse3_table + LEA r6, avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + movddup m2, [base+bidir_rnd+t0*8] + movddup m3, [base+bidir_mul+t0*8] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+16*0] + paddsw m0, [tmp2q+16*0] + mova m1, [tmp1q+16*1] + paddsw m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaxsw m0, m2 + pmaxsw m1, m2 + psubsw m0, m2 + psubsw m1, m2 + pmulhw m0, m3 + pmulhw m1, m3 + ret + +cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h +%define base r6-w_avg_ssse3_table + LEA r6, w_avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; weight + movd m6, r7m ; pixel_max + movddup m5, [base+pd_65538] + movsxd wq, [r6+wq*4] + pshufb m6, [base+pw_256] + add wq, r6 + lea r6d, [t0-16] + shl t0d, 16 + sub t0d, r6d ; 16-weight, weight + paddw m5, m6 + mov r6d, t0d + shl t0d, 2 + test dword r7m, 0x800 + cmovnz r6d, t0d + movifnidn hd, hm + movd m4, r6d + pslld m5, 7 + pxor m7, m7 + pshufd m4, m4, q0000 + BIDIR_FN +ALIGN function_align +.main: + mova m2, [tmp1q+16*0] + mova m0, [tmp2q+16*0] + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + mova m2, [tmp1q+16*1] + mova m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaddwd m3, m4 + pmaddwd m0, m4 + paddd m3, m5 + paddd m0, m5 + psrad m3, 8 + psrad m0, 8 + packssdw m0, m3 + punpckhwd m3, m1, m2 + punpcklwd m1, m2 + pmaddwd m3, m4 + pmaddwd m1, m4 + paddd m3, m5 + paddd m1, m5 + psrad m3, 8 + psrad m1, 8 + packssdw m1, m3 + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m7 + pmaxsw m1, m7 + ret + +%if ARCH_X86_64 +cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask +%else +cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask +%define hd dword r5m +%define m8 [base+pw_64] +%endif +%define base r6-mask_ssse3_table + LEA r6, mask_ssse3_table + tzcnt wd, wm + mov t0d, r7m ; pixel_max + shr t0d, 11 + movsxd wq, [r6+wq*4] + movddup m6, [base+bidir_rnd+t0*8] + movddup m7, [base+bidir_mul+t0*8] +%if ARCH_X86_64 + mova m8, [base+pw_64] + movifnidn hd, hm +%endif + add wq, r6 + mov maskq, r6mp + BIDIR_FN +ALIGN function_align +.main: + movq m3, [maskq+8*0] + mova m0, [tmp1q+16*0] + mova m4, [tmp2q+16*0] + pxor m5, m5 + punpcklbw m3, m5 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + psubw m1, m8, m3 + punpckhwd m4, m3, m1 ; m, 64-m + punpcklwd m3, m1 + pmaddwd m2, m4 ; tmp1 * m + 
tmp2 * (64-m) + pmaddwd m0, m3 + movq m3, [maskq+8*1] + mova m1, [tmp1q+16*1] + mova m4, [tmp2q+16*1] + add maskq, 8*2 + add tmp1q, 16*2 + add tmp2q, 16*2 + psrad m2, 5 + psrad m0, 5 + packssdw m0, m2 + punpcklbw m3, m5 + punpckhwd m2, m1, m4 + punpcklwd m1, m4 + psubw m5, m8, m3 + punpckhwd m4, m3, m5 ; m, 64-m + punpcklwd m3, m5 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m1, m3 + psrad m2, 5 + psrad m1, 5 + packssdw m1, m2 + pmaxsw m0, m6 + pmaxsw m1, m6 + psubsw m0, m6 + psubsw m1, m6 + pmulhw m0, m7 + pmulhw m1, m7 + ret + +cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_420_ssse3_table + LEA t0, w_mask_420_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m0, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 + %define m8 [rsp+gprsize+16*0] + %define m9 [rsp+gprsize+16*1] + %define m10 [rsp+gprsize+16*2] + %define m11 [rsp+gprsize+16*3] +%endif + movd m7, [base+pw_2] + psubw m7, m0 + pshufb m7, [base+pw_256] + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w4: + movq [dstq+strideq*0], m0 + phaddw m2, m3 + movhps [dstq+strideq*1], m0 + phaddd m2, m2 + lea dstq, [dstq+strideq*2] + paddw m2, m7 + movq [dstq+strideq*0], m1 + psrlw m2, 2 + movhps [dstq+strideq*1], m1 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w8: + mova [dstq+strideq*0], m0 + paddw m2, m3 + phaddw m2, m2 + mova [dstq+strideq*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 8 +.w16: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movq [maskq], m2 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16 +.w32: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*0+16*2], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*3], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*1], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + paddw m2, [dstq+strideq*1+16*3] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*2 +.w64: + mova [dstq+strideq*1+16*1], m2 + mova [dstq+strideq*0+16*0], m0 + mova 
[dstq+strideq*1+16*2], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*1+16*4], m3 + mova [dstq+strideq*0+16*3], m1 + call .main + mova [dstq+strideq*1+16*5], m2 + mova [dstq+strideq*0+16*4], m0 + mova [dstq+strideq*1+16*6], m3 + mova [dstq+strideq*0+16*5], m1 + call .main + mova [dstq+strideq*0+16*6], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*7], m2 + mova [dstq+strideq*0+16*7], m1 + call .main + paddw m2, [dstq+strideq*1+16*1] + paddw m3, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*3] + paddw m3, [dstq+strideq*1+16*4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16*5] + paddw m3, [dstq+strideq*1+16*6] + mova [dstq+strideq*1+16*4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*6], m2 + mova [dstq+strideq*1+16*5], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*6] + paddw m2, [dstq+strideq*1+16*7] + mova [dstq+strideq*1+16*6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*4 +.w128: + mova [dstq+strideq*1+16* 1], m2 + mova [dstq+strideq*0+16* 0], m0 + mova [dstq+strideq*1+16* 2], m3 + mova [dstq+strideq*0+16* 1], m1 + call .main + mova [dstq+strideq*1+16* 3], m2 + mova [dstq+strideq*0+16* 2], m0 + mova [dstq+strideq*1+16* 4], m3 + mova [dstq+strideq*0+16* 3], m1 + call .main + mova [dstq+strideq*1+16* 5], m2 + mova [dstq+strideq*0+16* 4], m0 + mova [dstq+strideq*1+16* 6], m3 + mova [dstq+strideq*0+16* 5], m1 + call .main + mova [dstq+strideq*1+16* 7], m2 + mova [dstq+strideq*0+16* 6], m0 + mova [dstq+strideq*1+16* 8], m3 + mova [dstq+strideq*0+16* 7], m1 + call .main + mova [dstq+strideq*1+16* 9], m2 + mova [dstq+strideq*0+16* 8], m0 + mova [dstq+strideq*1+16*10], m3 + mova [dstq+strideq*0+16* 9], m1 + call .main + mova [dstq+strideq*1+16*11], m2 + mova [dstq+strideq*0+16*10], m0 + mova [dstq+strideq*1+16*12], m3 + mova [dstq+strideq*0+16*11], m1 + call .main + mova [dstq+strideq*1+16*13], m2 + mova [dstq+strideq*0+16*12], m0 + mova [dstq+strideq*1+16*14], m3 + mova [dstq+strideq*0+16*13], m1 + call .main + mova [dstq+strideq*0+16*14], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*15], m2 + mova [dstq+strideq*0+16*15], m1 + call .main + paddw m2, [dstq+strideq*1+16* 1] + paddw m3, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 2], m2 + mova [dstq+strideq*1+16* 1], m1 + call .main + paddw m2, [dstq+strideq*1+16* 3] + paddw m3, [dstq+strideq*1+16* 4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16* 5] + paddw m3, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 6], m2 + mova [dstq+strideq*1+16* 5], m1 + call .main + paddw m2, [dstq+strideq*1+16* 7] + paddw m3, [dstq+strideq*1+16* 8] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 6], m0 + paddw 
m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + call .main + paddw m2, [dstq+strideq*1+16* 9] + paddw m3, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16* 8], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*10], m2 + mova [dstq+strideq*1+16* 9], m1 + call .main + paddw m2, [dstq+strideq*1+16*11] + paddw m3, [dstq+strideq*1+16*12] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16*10], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*11], m1 + packuswb m3, m2 + mova [maskq+16*2], m3 + call .main + paddw m2, [dstq+strideq*1+16*13] + paddw m3, [dstq+strideq*1+16*14] + mova [dstq+strideq*1+16*12], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*14], m2 + mova [dstq+strideq*1+16*13], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*14] + paddw m2, [dstq+strideq*1+16*15] + mova [dstq+strideq*1+16*14], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*15], m1 + packuswb m3, m2 + mova [maskq+16*3], m3 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: +%macro W_MASK 2 ; dst/tmp_offset, mask + mova m%1, [tmp1q+16*%1] + mova m%2, [tmp2q+16*%1] + punpcklwd m4, m%2, m%1 + punpckhwd m5, m%2, m%1 + psubsw m%1, m%2 + pabsw m%1, m%1 + psubusw m6, m8, m%1 + psrlw m6, 10 ; 64-m + psubw m%2, m9, m6 ; m + punpcklwd m%1, m6, m%2 + punpckhwd m6, m%2 + pmaddwd m%1, m4 + pmaddwd m6, m5 + psrad m%1, 5 + psrad m6, 5 + packssdw m%1, m6 + pmaxsw m%1, m10 + psubsw m%1, m10 + pmulhw m%1, m11 +%endmacro + W_MASK 0, 2 + W_MASK 1, 3 + add tmp1q, 16*2 + add tmp2q, 16*2 + ret + +cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_422_ssse3_table + LEA t0, w_mask_422_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m7, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 +%endif + pxor m0, m0 + add wq, t0 + pshufb m7, m0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq 
+.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + phaddw m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + packuswb m2, m2 + pxor m3, m3 + psubb m2, m7 + pavgb m2, m3 + movq [maskq], m2 + add maskq, 8 + ret + +cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_444_ssse3_table + LEA t0, w_mask_444_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m7, [base+bidir_mul+r6*8] + ALLOC_STACK -16*3 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + %define m11 m7 +%endif + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + packuswb m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + mova [maskq], m2 + add maskq, 16 + ret + +; (a * (64 - m) + b * m + 32) >> 6 +; = (((b - a) * m + 32) >> 6) + a +; = (((b - a) * (m << 9) + 16384) >> 15) + a +; except m << 9 overflows int16_t when m == 64 (which is possible), +; but if we negate m it works out (-64 << 9 == -32768). 
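+; (numeric check: a=100, b=200, m=16 gives (100*48 + 200*16 + 32) >> 6 = 125
+;  directly, and ((100-200)*(16*-512) + 16384) >> 15 + 100 = 25 + 100 = 125
+;  via the negated pmulhrsw form below)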
+; = (((a - b) * (m * -512) + 16384) >> 15) + a +cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3 +%define base r6-blend_ssse3_table + LEA r6, blend_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + mova m7, [base+pw_m512] + add wq, r6 + lea stride3q, [strideq*3] + pxor m6, m6 + jmp wq +.w4: + mova m5, [maskq] + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + movq m1, [dstq+strideq*2] + movhps m1, [dstq+stride3q ] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + mova m5, [maskq] + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m5, [maskq] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m5, [maskq+16*0] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova m5, [maskq+16*1] + mova m0, [dstq+16*2] + mova m1, [dstq+16*3] + psubw m2, m0, [tmpq+16*2] + psubw m3, m1, [tmpq+16*3] + add maskq, 32 + add tmpq, 64 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + add dstq, strideq + dec hd + jg .w32 + RET + +cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h +%define base r5-blend_v_ssse3_table + LEA r5, blend_v_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + movd m4, [base+obmc_masks+2*2] +.w2_loop: + movd m0, [dstq+strideq*0] + movd m2, [tmpq+4*0] + movd m1, [dstq+strideq*1] + movd m3, [tmpq+4*1] + add tmpq, 4*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + movddup m2, [base+obmc_masks+4*2] +.w4_loop: + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova m4, [base+obmc_masks+8*2] +.w8_loop: + mova m0, [dstq+strideq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+strideq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 
+ paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + mova m4, [base+obmc_masks+16*2] + movq m5, [base+obmc_masks+16*3] +.w16_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16_loop + RET +.w32: +%if WIN64 + movaps [rsp+8], m6 +%endif + mova m4, [base+obmc_masks+16*4] + mova m5, [base+obmc_masks+16*5] + mova m6, [base+obmc_masks+16*6] +.w32_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + mova m2, [dstq+16*2] + paddw m1, m3 + mova m3, [tmpq+16*2] + add tmpq, 16*4 + psubw m3, m2 + pmulhrsw m3, m6 + paddw m2, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps m6, [rsp+8] +%endif + RET + +%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp + mova m0, [dstq+16*(%1+0)] + mova m2, [tmpq+16*(%2+0)] + mova m1, [dstq+16*(%1+1)] + mova m3, [tmpq+16*(%2+1)] +%if %3 + add tmpq, 16*%3 +%endif + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*(%1+0)], m0 + mova [dstq+16*(%1+1)], m1 +%endmacro + +cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base r6-blend_h_ssse3_table + LEA r6, blend_h_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m4, [base+blend_shuf] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + add wq, r6 + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + movd m2, [dstq+dsq*1] + movd m3, [maskq+hq*2] + movq m1, [tmpq] + add tmpq, 4*2 + punpckldq m0, m2 + punpcklwd m3, m3 + psubw m1, m0 + pmulhrsw m1, m3 + paddw m0, m1 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova m3, [base+blend_shuf] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + movd m2, [maskq+hq*2] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + movddup m5, [base+blend_shuf+8] +%if WIN64 + movaps [rsp+ 8], m6 + movaps [rsp+24], m7 +%endif +.w8_loop: + movd m7, [maskq+hq*2] + mova m0, [dstq+dsq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+dsq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + pshufb m6, m7, m4 + psubw m2, m0 + pshufb m7, m5 + psubw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m3, m7 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop +%if WIN64 + movaps m6, [rsp+ 8] + movaps m7, [rsp+24] +%endif + RET +.w16: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0, 2 + add dstq, dsq + inc hq + jl .w16 + RET +.w32: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 4 + add dstq, dsq + inc hq + jl .w32 + RET +.w64: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + BLEND_H_ROW 6, 6, 8 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + 
BLEND_H_ROW 6, 6, 16 + BLEND_H_ROW 8, -8 + BLEND_H_ROW 10, -6 + BLEND_H_ROW 12, -4 + BLEND_H_ROW 14, -2 + add dstq, dsq + inc hq + jl .w128 + RET + +; emu_edge args: +; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, +; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, +; const pixel *ref, const ptrdiff_t ref_stride +; +; bw, bh total filled size +; iw, ih, copied block -> fill bottom, right +; x, y, offset in bw/bh -> fill top, left +cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \ + y, dst, dstride, src, sstride, \ + bottomext, rightext, blk + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + +%if ARCH_X86_64 + %define reg_zero r12q + %define reg_tmp r10 + %define reg_src srcq + %define reg_bottomext bottomextq + %define reg_rightext rightextq + %define reg_blkm r9m +%else + %define reg_zero r6 + %define reg_tmp r0 + %define reg_src r1 + %define reg_bottomext r0 + %define reg_rightext r1 + %define reg_blkm r2m +%endif + ; + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor reg_zero, reg_zero + lea reg_tmp, [ihq-1] + cmp yq, ihq + cmovs reg_tmp, yq + test yq, yq + cmovs reg_tmp, reg_zero +%if ARCH_X86_64 + imul reg_tmp, sstrideq + add srcq, reg_tmp +%else + imul reg_tmp, sstridem + mov reg_src, srcm + add reg_src, reg_tmp +%endif + ; + ; ref += iclip(x, 0, iw - 1) + lea reg_tmp, [iwq-1] + cmp xq, iwq + cmovs reg_tmp, xq + test xq, xq + cmovs reg_tmp, reg_zero + lea reg_src, [reg_src+reg_tmp*2] +%if ARCH_X86_32 + mov srcm, reg_src +%endif + ; + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) +%if ARCH_X86_32 + mov r1, r1m ; restore bh +%endif + lea reg_bottomext, [yq+bhq] + sub reg_bottomext, ihq + lea r3, [bhq-1] + cmovs reg_bottomext, reg_zero + ; + + DEFINE_ARGS bw, bh, iw, ih, x, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, reg_zero + cmp reg_bottomext, bhq + cmovns reg_bottomext, r3 + cmp topextq, bhq + cmovg topextq, r3 + %if ARCH_X86_32 + mov r4m, reg_bottomext + ; + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + mov r0, r0m ; restore bw + %endif + lea reg_rightext, [xq+bwq] + sub reg_rightext, iwq + lea r2, [bwq-1] + cmovs reg_rightext, reg_zero + + DEFINE_ARGS bw, bh, iw, ih, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, reg_zero + cmp reg_rightext, bwq + cmovns reg_rightext, r2 + %if ARCH_X86_32 + mov r3m, r1 + %endif + cmp leftextq, bwq + cmovns leftextq, r2 + +%undef reg_zero +%undef reg_tmp +%undef reg_src +%undef reg_bottomext +%undef reg_rightext + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; center_h = bh - top_ext - bottom_ext +%if ARCH_X86_64 + lea r3, [bottomextq+topextq] + sub centerhq, r3 +%else + mov r1, centerhm ; restore r1 + sub centerhq, topextq + sub centerhq, r4m + mov r1m, centerhq +%endif + ; + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq +%if ARCH_X86_64 + imul r2, dstrideq +%else + mov r6, r6m ; restore dstq + imul r2, dstridem +%endif + add dstq, r2 + mov reg_blkm, dstq ; save pointer for ext + ; + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq +%if ARCH_X86_64 + lea r3, [rightextq+leftextq] + sub centerwq, r3 +%else + sub centerwq, r3m + sub centerwq, leftextq +%endif + +; vloop Macro +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix 
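+; One v_loop iteration behaves like this scalar sketch (added illustration,
+; not part of the original source; indexing is in 16-bit pixels and the
+; extension counts may be zero when the corresponding flag is off):
+;   for (i = 0; i < left_ext;  i++) dst[i]                        = src[0];
+;   for (i = 0; i < center_w;  i++) dst[left_ext + i]             = src[i];
+;   for (i = 0; i < right_ext; i++) dst[left_ext + center_w + i]  = src[center_w - 1];
+;   dst += dst_stride; src += src_stride;   // repeated center_h times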
+ %if ARCH_X86_64 + %define reg_tmp r12 + %else + %define reg_tmp r0 + %endif +.v_loop_%3: + %if ARCH_X86_32 + mov r0, r0m + mov r1, r1m + %endif +%if %1 + ; left extension + %if ARCH_X86_64 + movd m0, [srcq] + %else + mov r3, srcm + movd m0, [r3] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.left_loop_%3: + mova [dstq+r3*2], m0 + add r3, mmsize/2 + cmp r3, leftextq + jl .left_loop_%3 + ; body + lea reg_tmp, [dstq+leftextq*2] +%endif + xor r3, r3 +.body_loop_%3: + %if ARCH_X86_64 + movu m0, [srcq+r3*2] + %else + mov r1, srcm + movu m0, [r1+r3*2] + %endif +%if %1 + movu [reg_tmp+r3*2], m0 +%else + movu [dstq+r3*2], m0 +%endif + add r3, mmsize/2 + cmp r3, centerwq + jl .body_loop_%3 +%if %2 + ; right extension +%if %1 + lea reg_tmp, [reg_tmp+centerwq*2] +%else + lea reg_tmp, [dstq+centerwq*2] +%endif + %if ARCH_X86_64 + movd m0, [srcq+centerwq*2-2] + %else + mov r3, srcm + movd m0, [r3+centerwq*2-2] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.right_loop_%3: + movu [reg_tmp+r3*2], m0 + add r3, mmsize/2 + %if ARCH_X86_64 + cmp r3, rightextq + %else + cmp r3, r3m + %endif + jl .right_loop_%3 +%endif + %if ARCH_X86_64 + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 + %else + add dstq, dstridem + mov r0, sstridem + add srcm, r0 + sub dword centerhm, 1 + jg .v_loop_%3 + mov r0, r0m ; restore r0 + %endif +%endmacro ; vloop MACRO + + test leftextq, leftextq + jnz .need_left_ext + %if ARCH_X86_64 + test rightextq, rightextq + jnz .need_right_ext + %else + cmp leftextq, r3m ; leftextq == 0 + jne .need_right_ext + %endif + v_loop 0, 0, 0 + jmp .body_done + + ;left right extensions +.need_left_ext: + %if ARCH_X86_64 + test rightextq, rightextq + %else + mov r3, r3m + test r3, r3 + %endif + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: +; r0 ; bw +; r1 ;; x loop +; r4 ;; y loop +; r5 ; topextq +; r6 ;dstq +; r7 ;dstrideq +; r8 ; srcq +%if ARCH_X86_64 + %define reg_dstride dstrideq +%else + %define reg_dstride r2 +%endif + ; + ; bottom edge extension + %if ARCH_X86_64 + test bottomextq, bottomextq + jz .top + %else + xor r1, r1 + cmp r1, r4m + je .top + %endif + ; + %if ARCH_X86_64 + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 + %else + mov r3, dstq + mov reg_dstride, dstridem + sub r3, reg_dstride + mov srcm, r3 + %endif + ; +.bottom_x_loop: + %if ARCH_X86_64 + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, bottomextq + %else + mov r3, srcm + mova m0, [r3+r1*2] + lea r3, [dstq+r1*2] + mov r4, r4m + %endif + ; +.bottom_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .bottom_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end +%if ARCH_X86_64 + mov srcq, reg_blkm +%else + mov r3, reg_blkm + mov reg_dstride, dstridem +%endif + mov dstq, dstm + xor r1, r1 + ; +.top_x_loop: +%if ARCH_X86_64 + mova m0, [srcq+r1*2] +%else + mov r3, reg_blkm + mova m0, [r3+r1*2] +%endif + lea r3, [dstq+r1*2] + mov r4, topextq + ; +.top_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .top_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +%undef reg_dstride +%undef reg_blkm +%undef reg_tmp + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +%if ARCH_X86_64 +cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, 
+%if ARCH_X86_64
+cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
+                                       dst_w, h, src_w, dx, mx0, pxmax
+%elif STACK_ALIGNMENT >= 16
+cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
+                                     dst_w, h, src_w, dx, mx0, pxmax
+%else
+cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
+                                     dst_w, h, src_w, dx, mx0, pxmax
+%endif
+    movifnidn dstq, dstmp
+    movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+    movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+    movifnidn hd, hm
+%endif
+    sub dword mx0m, 4<<14
+    sub dword src_wm, 8
+    movd m4, pxmaxm
+    movd m7, dxm
+    movd m6, mx0m
+    movd m5, src_wm
+    punpcklwd m4, m4
+    pshufd m4, m4, q0000
+    pshufd m7, m7, q0000
+    pshufd m6, m6, q0000
+    pshufd m5, m5, q0000
+    mova [rsp+16*3*ARCH_X86_32], m4
+%if ARCH_X86_64
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+    LEA r7, $$
+ %define base r7-$$
+%else
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+ %define hd dword r5m
+ %if STACK_ALIGNMENT >= 16
+    LEA r6, $$
+  %define base r6-$$
+ %else
+    LEA r4, $$
+  %define base r4-$$
+ %endif
+%endif
+%if ARCH_X86_64
+    mova m12, [base+pd_64]
+    mova m11, [base+pd_63]
+%else
+ %define m12 [base+pd_64]
+ %define m11 [base+pd_63]
+%endif
+    pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
+    pslld m7, 2                        ; dx*4
+    pslld m5, 14
+    paddd m6, m4                       ; mx+[0..3]*dx
+    SCRATCH 7, 15, 0
+    SCRATCH 6, 14, 1
+    SCRATCH 5, 13, 2
+    pxor m1, m1
+.loop_y:
+    xor xd, xd
+    mova m0, m14 ; per-line working version of mx
+.loop_x:
+    pcmpgtd m1, m0
+    pandn m1, m0
+    psrad m2, m0, 8 ; filter offset (unmasked)
+    pcmpgtd m3, m13, m1
+    pand m1, m3
+    pandn m3, m13
+    por m1, m3
+    psubd m3, m0, m1 ; pshufb offset
+    psrad m1, 14     ; clipped src_x offset
+    psrad m3, 14     ; pshufb edge_emu offset
+    pand m2, m11     ; filter offset (masked)
+    ; load source pixels
+%if ARCH_X86_64
+    movd r8d, m1
+    pshuflw m1, m1, q3232
+    movd r9d, m1
+    punpckhqdq m1, m1
+    movd r10d, m1
+    psrlq m1, 32
+    movd r11d, m1
+    movu m4, [srcq+r8*2]
+    movu m5, [srcq+r9*2]
+    movu m6, [srcq+r10*2]
+    movu m7, [srcq+r11*2]
+    ; if no emulation is required, we don't need to shuffle or emulate edges
+    packssdw m3, m3
+    movq r11, m3
+    test r11, r11
+    jz .filter
+    movsx r8, r11w
+    sar r11, 16
+    movsx r9, r11w
+    sar r11, 16
+    movsx r10, r11w
+    sar r11, 16
+    movu m1, [base+resize_shuf+8+r8*2]
+    movu m3, [base+resize_shuf+8+r9*2]
+    movu m8, [base+resize_shuf+8+r10*2]
+    movu m9, [base+resize_shuf+8+r11*2]
+    pshufb m4, m1
+    pshufb m5, m3
+    pshufb m6, m8
+    pshufb m7, m9
+.filter:
+    movd r8d, m2
+    pshuflw m2, m2, q3232
+    movd r9d, m2
+    punpckhqdq m2, m2
+    movd r10d, m2
+    psrlq m2, 32
+    movd r11d, m2
+    movq m8, [base+resize_filter+r8*8]
+    movq m2, [base+resize_filter+r9*8]
+    pxor m9, m9
+    punpcklbw m1, m9, m8
+    punpcklbw m3, m9, m2
+    psraw m1, 8
+    psraw m3, 8
+    movq m10, [base+resize_filter+r10*8]
+    movq m2, [base+resize_filter+r11*8]
+    punpcklbw m8, m9, m10
+    punpcklbw m9, m2
+    psraw m8, 8
+    psraw m9, 8
+    pmaddwd m4, m1
+    pmaddwd m5, m3
+    pmaddwd m6, m8
+    pmaddwd m7, m9
+    phaddd m4, m5
+%else
+    movd r3, m1
+    pshuflw m1, m1, q3232
+    movd r1, m1
+    punpckhqdq m1, m1
+    movu m4, [srcq+r3*2]
+    movu m5, [srcq+r1*2]
+    movd r3, m1
+    psrlq m1, 32
+    movd r1, m1
+    movu m6, [srcq+r3*2]
+    movu m7, [srcq+r1*2]
+    ; if no emulation is required, we don't need to shuffle or emulate edges
+    pxor m1, m1
+    pcmpeqb m1, m3
+    pmovmskb r3d, m1
+    cmp r3d, 0xffff
+    je .filter
+    movd r3, m3
+    movu m1, [base+resize_shuf+8+r3*2]
+    pshuflw m3, m3, q3232
+    movd r1, m3
+    pshufb m4, m1
+    movu m1, [base+resize_shuf+8+r1*2]
+    punpckhqdq m3, m3
+    movd r3, m3
+    pshufb m5, m1
+    movu m1, [base+resize_shuf+8+r3*2]
+    psrlq m3, 32
+    movd r1, m3
+    pshufb m6, m1
+    movu m1, [base+resize_shuf+8+r1*2]
+    pshufb m7, m1
+.filter:
+    mova [esp+4*16], m6
+    mova [esp+5*16], m7
+    movd r3, m2
+    pshuflw m2, m2, q3232
+    movd r1, m2
+    movq m6, [base+resize_filter+r3*8]
+    movq m7, [base+resize_filter+r1*8]
+    pxor m3, m3
+    punpcklbw m1, m3, m6
+    punpcklbw m3, m7
+    psraw m1, 8
+    psraw m3, 8
+    pmaddwd m4, m1
+    pmaddwd m5, m3
+    punpckhqdq m2, m2
+    movd r3, m2
+    psrlq m2, 32
+    movd r1, m2
+    phaddd m4, m5
+    movq m2, [base+resize_filter+r3*8]
+    movq m5, [base+resize_filter+r1*8]
+    mova m6, [esp+4*16]
+    mova m7, [esp+5*16]
+    pxor m3, m3
+    punpcklbw m1, m3, m2
+    punpcklbw m3, m5
+    psraw m1, 8
+    psraw m3, 8
+    pmaddwd m6, m1
+    pmaddwd m7, m3
+%endif
+    phaddd m6, m7
+    phaddd m4, m6
+    pxor m1, m1
+    psubd m2, m12, m4
+    psrad m2, 7
+    packssdw m2, m2
+    pmaxsw m2, m1
+    pminsw m2, [rsp+16*3*ARCH_X86_32]
+    movq [dstq+xq*2], m2
+    paddd m0, m15
+    add xd, 4
+%if STACK_ALIGNMENT >= 16
+    cmp xd, dst_wd
+%else
+    cmp xd, dst_wm
+%endif
+    jl .loop_x
+    add dstq, dst_stridemp
+    add srcq, src_stridemp
+    dec hd
+    jg .loop_y
+    RET