; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

filter_shuf:    db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
pal_pred_shuf:  db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
z_base_inc:     dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
z_base_inc_z2:  dw  7*64,  6*64,  5*64,  4*64,  3*64,  2*64,  1*64,  0*64
z_upsample:     db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
z2_upsample_l:  db -1, -1, -2, -1, -3, -1, -4, -1,  8,  9,  8,  9, 10, 11, 12, 13
                db  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
z2_top_shufA:   db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
z2_top_shufB:   db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
z2_left_shufA:  db 14, 15, 12, 13, 10, 11,  8,  9, 12, 13, 10, 11,  8,  9,  6,  7
z2_left_shufB:  db 14, 15, 10, 11,  6,  7,  2,  3, 12, 13,  8,  9,  4,  5,  0,  1
z_filt_wh16:    db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
z_filt_t_w48:   db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
                db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
z_filt_t_w16:   db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
z_filt_wh4:     db  7,  7, 19,  7,
z_filt_wh8:     db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
ALIGN 8
pb_2_3:         times 4 db 2, 3
z2_dy_offset:   dw 96*64, 96*64, 95*64, 95*64
z_filt_k:       times 4 dw 8
                times 4 dw 6
                times 4 dw 4
                times 4 dw 5
pw_m3584:       times 4 dw -3584
pw_m3072:       times 4 dw -3072
pw_m2560:       times 4 dw -2560
pw_m2048:       times 4 dw -2048
pw_m1536:       times 4 dw -1536
pw_m1024:       times 4 dw -1024
pw_m512:        times 4 dw -512
pw_1:           times 4 dw 1
pw_2:           times 4 dw 2
pw_3:           times 4 dw 3
pw_62:          times 4 dw 62
pw_256:         times 4 dw 256
pw_512:         times 4 dw 512
pw_2048:        times 4 dw 2048

%define pw_4 (z_filt_k+8*2)
%define pw_8 (z_filt_k+8*0)
%define pw_m1to4 z2_upsample_l

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4)
%define ipred_cfl_splat_16bpc_ssse3_table
(ipred_cfl_16bpc_ssse3_table + 8*4) JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_XMM ssse3 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_16bpc_ssse3_table movd m4, wm tzcnt wd, wm add tlq, 2 movifnidn hd, hm pxor m3, m3 pavgw m4, m3 movd m5, wd movu m0, [tlq] movsxd r6, [r5+wq*4] add r6, r5 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_16bpc_ssse3_table mov hd, hm movd m4, hm tzcnt r6d, hd sub tlq, hq tzcnt wd, wm pxor m3, m3 sub tlq, hq pavgw m4, m3 movd m5, r6d movu m0, [tlq] movsxd r6, [r5+r6*4] add r6, r5 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m2, [tlq+112] movu m1, [tlq+ 96] paddw m0, m2 movu m2, [tlq+ 80] paddw m1, m2 movu m2, [tlq+ 64] paddw m0, m2 paddw m0, m1 .h32: movu m1, [tlq+ 48] movu m2, [tlq+ 32] paddw m1, m2 paddw m0, m1 .h16: movu m1, [tlq+ 16] paddw m0, m1 .h8: movhlps m1, m0 paddw m0, m1 .h4: punpcklwd m0, m3 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 lea stride3q, [strideq*3] pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm tzcnt r6d, hd lea r5d, [wq+hq] movd m4, r5d tzcnt r5d, r5d movd m5, r5d LEA r5, ipred_dc_16bpc_ssse3_table tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pxor m3, m3 psrlw m4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movq m0, [tlq-8] jmp wq .w4: movq m1, [tlq+2] paddw m1, m0 punpckhwd m0, m3 punpcklwd m1, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 cmp hd, 4 jg .w4_mul psrlw m0, 3 jmp .w4_end .w4_mul: mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 16 cmove r2d, r3d psrld m0, 2 movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w4_end: pshuflw m0, m0, q0000 .s4: movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET .h8: mova m0, [tlq-16] jmp wq .w8: movu m1, [tlq+2] paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 8 je .w8_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 32 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET .h16: mova m0, [tlq-32] paddw m0, [tlq-16] jmp wq 
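; Explanatory note: for rectangular blocks the DC sum is divided by w+h,
; which is 2^n, 2^n*3 or 2^n*5. A shift removes the power-of-two factor
; (psrld by m5 = tzcnt(w+h)); the leftover /3 or /5 is done with a
; fixed-point reciprocal multiply. As a rough C sketch (dc and sum are
; illustrative names, not dav1d identifiers):
;
;   dc = sum >> n;              // n = tzcnt(w + h)
;   dc = (dc * 0xAAAB) >> 17;   // /3: 0xAAAB = ceil(2^17 / 3)
;   dc = (dc * 0x6667) >> 17;   // /5: 0x6667 = ceil(2^17 / 5)
;
; pmulhuw supplies the >>16 and the trailing psrlw by 1 the final bit.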
.w16: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 paddw m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 16 je .w16_end mov r2d, 0xAAAB mov r3d, 0x6667 test hd, 8|32 cmovz r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16c: mova m1, m0 .s16: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*2+16*0], m0 mova [dstq+strideq*2+16*1], m1 mova [dstq+stride3q +16*0], m0 mova [dstq+stride3q +16*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET .h32: mova m0, [tlq-64] paddw m0, [tlq-48] paddw m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w32: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 movu m2, [tlq+34] paddw m0, m2 movu m2, [tlq+50] paddw m1, m2 paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 32 je .w32_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 8 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32c: mova m1, m0 mova m2, m0 mova m3, m0 .s32: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 mova [dstq+strideq*0+16*2], m2 mova [dstq+strideq*0+16*3], m3 mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*1+16*2], m2 mova [dstq+strideq*1+16*3], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s32 RET .h64: mova m0, [tlq-128] mova m1, [tlq-112] paddw m0, [tlq- 96] paddw m1, [tlq- 80] paddw m0, [tlq- 64] paddw m1, [tlq- 48] paddw m0, [tlq- 32] paddw m1, [tlq- 16] paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 2] movu m2, [tlq+ 18] paddw m1, m2 movu m2, [tlq+ 34] paddw m0, m2 movu m2, [tlq+ 50] paddw m1, m2 movu m2, [tlq+ 66] paddw m0, m2 movu m2, [tlq+ 82] paddw m1, m2 movu m2, [tlq+ 98] paddw m0, m2 movu m2, [tlq+114] paddw m1, m2 paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 64 je .w64_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 16 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w64_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s64: mova [dstq+16*0], m0 mova [dstq+16*1], m0 mova [dstq+16*2], m0 mova [dstq+16*3], m0 mova [dstq+16*4], m0 mova [dstq+16*5], m0 mova [dstq+16*6], m0 mova [dstq+16*7], m0 add dstq, strideq dec hd jg .s64 RET cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 mov r6d, r8m LEA r5, ipred_dc_128_16bpc_ssse3_table tzcnt wd, wm shr r6d, 11 movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_16bpc_ssse3_table movifnidn hd, hm movu m0, [tlq+ 2] movu m1, [tlq+ 18] movu m2, [tlq+ 34] movu m3, [tlq+ 50] cmp wd, 64 je .w64 tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w64: WIN64_SPILL_XMM 8 movu m4, [tlq+ 66] movu m5, [tlq+ 82] movu m6, [tlq+ 98] movu m7, [tlq+114] .w64_loop: mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 mova [dstq+16*4], m4 mova [dstq+16*5], m5 mova [dstq+16*6], m6 mova [dstq+16*7], m7 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 %define base 
r5-ipred_h_16bpc_ssse3_table tzcnt wd, wm LEA r5, ipred_h_16bpc_ssse3_table movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m2, [base+pw_256] movddup m3, [base+pb_2_3] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: sub tlq, 8 movq m3, [tlq] pshuflw m0, m3, q3333 pshuflw m1, m3, q2222 pshuflw m2, m3, q1111 pshuflw m3, m3, q0000 movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m1 movq [dstq+strideq*2], m2 movq [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: sub tlq, 8 movq m3, [tlq] punpcklwd m3, m3 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: sub tlq, 4 movd m1, [tlq] pshufb m0, m1, m3 pshufb m1, m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m0 mova [dstq+strideq*1+16*0], m1 mova [dstq+strideq*1+16*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16 RET .w32: sub tlq, 4 movd m1, [tlq] pshufb m0, m1, m3 pshufb m1, m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m0 mova [dstq+strideq*0+16*2], m0 mova [dstq+strideq*0+16*3], m0 mova [dstq+strideq*1+16*0], m1 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*1+16*2], m1 mova [dstq+strideq*1+16*3], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: sub tlq, 2 movd m0, [tlq] pshufb m0, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m0 mova [dstq+16*2], m0 mova [dstq+16*3], m0 mova [dstq+16*4], m0 mova [dstq+16*5], m0 mova [dstq+16*6], m0 mova [dstq+16*7], m0 add dstq, strideq dec hd jg .w64 RET cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left %define base r5-ipred_paeth_16bpc_ssse3_table movifnidn hd, hm pshuflw m4, [tlq], q0000 mov leftq, tlq add hd, hd punpcklqdq m4, m4 ; topleft sub leftq, hq and wd, ~7 jnz .w8 movddup m5, [tlq+2] ; top psubw m6, m5, m4 pabsw m7, m6 .w4_loop: movd m1, [leftq+hq-4] punpcklwd m1, m1 punpckldq m1, m1 ; left %macro PAETH 0 paddw m0, m6, m1 psubw m2, m4, m0 ; tldiff psubw m0, m5 ; tdiff pabsw m2, m2 pabsw m0, m0 pminsw m2, m0 pcmpeqw m0, m2 pand m3, m5, m0 pandn m0, m4 por m0, m3 pcmpgtw m3, m7, m2 pand m0, m3 pandn m3, m1 por m0, m3 %endmacro PAETH movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2*2 jg .w4_loop RET .w8: %if ARCH_X86_32 PUSH r6 %define r7d hm %assign regs_used 7 %elif WIN64 movaps r4m, m8 PUSH r7 %assign regs_used 8 %endif %if ARCH_X86_64 movddup m8, [pw_256] %endif lea tlq, [tlq+wq*2+2] neg wq mov r7d, hd .w8_loop0: movu m5, [tlq+wq*2] mov r6, dstq add dstq, 16 psubw m6, m5, m4 pabsw m7, m6 .w8_loop: movd m1, [leftq+hq-2] %if ARCH_X86_64 pshufb m1, m8 %else pshuflw m1, m1, q0000 punpcklqdq m1, m1 %endif PAETH mova [r6], m0 add r6, strideq sub hd, 1*2 jg .w8_loop mov hd, r7d add wq, 8 jl .w8_loop0 %if WIN64 movaps m8, r4m %endif RET %if ARCH_X86_64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 4 %endif cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights LEA weightsq, smooth_weights_1d_16bpc mov hd, hm lea weightsq, [weightsq+hq*4] neg hq movd m5, [tlq+hq*2] ; bottom pshuflw m5, m5, q0000 punpcklqdq m5, m5 cmp wd, 4 jne .w8 movddup m4, [tlq+2] ; top lea r3, [strideq*3] psubw m4, m5 ; top - bottom .w4_loop: movq m1, [weightsq+hq*2] punpcklwd m1, m1 pshufd m0, m1, q1100 punpckhdq m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] add hq, 4 jl 
.w4_loop RET .w8: %if ARCH_X86_32 PUSH r6 %assign regs_used 7 mov hm, hq %define hq hm %elif WIN64 PUSH r7 %assign regs_used 8 %endif .w8_loop0: mov t0, hq movu m4, [tlq+2] add tlq, 16 mov r6, dstq add dstq, 16 psubw m4, m5 .w8_loop: movq m3, [weightsq+t0*2] punpcklwd m3, m3 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [r6+strideq*0], m0 mova [r6+strideq*1], m1 lea r6, [r6+strideq*2] mova [r6+strideq*0], m2 mova [r6+strideq*1], m3 lea r6, [r6+strideq*2] add t0, 4 jl .w8_loop sub wd, 8 jg .w8_loop0 RET cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights LEA weightsq, smooth_weights_1d_16bpc mov wd, wm movifnidn hd, hm movd m5, [tlq+wq*2] ; right sub tlq, 8 add hd, hd pshuflw m5, m5, q0000 sub tlq, hq punpcklqdq m5, m5 cmp wd, 4 jne .w8 movddup m4, [weightsq+4*2] lea r3, [strideq*3] .w4_loop: movq m1, [tlq+hq] ; left punpcklwd m1, m1 psubw m1, m5 ; left - right pshufd m0, m1, q3322 punpckldq m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movhps [dstq+strideq*2], m1 movq [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w4_loop RET .w8: lea weightsq, [weightsq+wq*4] neg wq %if ARCH_X86_32 PUSH r6 %assign regs_used 7 %define hd hm %elif WIN64 PUSH r7 %assign regs_used 8 %endif .w8_loop0: mov t0d, hd mova m4, [weightsq+wq*2] mov r6, dstq add dstq, 16 .w8_loop: movq m3, [tlq+t0*(1+ARCH_X86_32)] punpcklwd m3, m3 psubw m3, m5 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [r6+strideq*0], m0 mova [r6+strideq*1], m1 lea r6, [r6+strideq*2] mova [r6+strideq*0], m2 mova [r6+strideq*1], m3 lea r6, [r6+strideq*2] sub t0d, 4*(1+ARCH_X86_64) jg .w8_loop add wq, 8 jl .w8_loop0 RET %if ARCH_X86_64 DECLARE_REG_TMP 10 %else DECLARE_REG_TMP 3 %endif cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ h_weights, v_weights, top LEA h_weightsq, smooth_weights_2d_16bpc mov wd, wm mov hd, hm movd m7, [tlq+wq*2] ; right lea v_weightsq, [h_weightsq+hq*8] neg hq movd m6, [tlq+hq*2] ; bottom pshuflw m7, m7, q0000 pshuflw m6, m6, q0000 cmp wd, 4 jne .w8 movq m4, [tlq+2] ; top mova m5, [h_weightsq+4*4] punpcklwd m4, m6 ; top, bottom pxor m6, m6 .w4_loop: movq m1, [v_weightsq+hq*4] sub tlq, 4 movd m3, [tlq] ; left pshufd m0, m1, q0000 pshufd m1, m1, q1111 pmaddwd m0, m4 punpcklwd m3, m7 ; left, right pmaddwd m1, m4 pshufd m2, m3, q1111 pshufd m3, m3, q0000 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pavgw m0, m6 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] add hq, 2 jl .w4_loop RET .w8: %if ARCH_X86_32 lea h_weightsq, [h_weightsq+wq*4] mov t0, tlq mov r1m, tlq mov r2m, hq %define m8 [h_weightsq+16*0] %define m9 [h_weightsq+16*1] %else %if WIN64 movaps r4m, m8 movaps r6m, m9 PUSH r7 PUSH r8 %endif PUSH r9 PUSH r10 %assign regs_used 11 lea h_weightsq, [h_weightsq+wq*8] lea topq, [tlq+wq*2] neg wq mov r8, tlq mov r9, hq %endif punpcklqdq m6, m6 .w8_loop0: %if ARCH_X86_32 movu m5, [t0+2] add t0, 16 mov r0m, t0 %else movu m5, [topq+wq*2+2] mova m8, [h_weightsq+wq*4+16*0] mova m9, [h_weightsq+wq*4+16*1] %endif mov t0, dstq add dstq, 16 punpcklwd m4, m5, m6 punpckhwd m5, m6 .w8_loop: movd m1, [v_weightsq+hq*4] sub tlq, 2 movd m3, [tlq] ; left pshufd m1, m1, q0000 pmaddwd m0, m4, m1 pshuflw m3, m3, q0000 
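; Explanatory note: this loop evaluates the 2-D SMOOTH blend per pixel.
; The 2d weights table stores (w, 256-w) word pairs, so a single pmaddwd
; forms each weighted pair-sum on the interleaved (top,bottom) and
; (left,right) words. Roughly, in C (v_w and h_w are illustrative names):
;
;   p = v_w*top + (256-v_w)*bottom + h_w*left + (256-h_w)*right;
;   out = (p + 256) >> 9;   // psrld by 8, then pavgw against zero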
pmaddwd m1, m5 punpcklwd m3, m7 ; left, right pmaddwd m2, m8, m3 pmaddwd m3, m9 paddd m0, m2 paddd m1, m3 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pxor m1, m1 pavgw m0, m1 mova [t0], m0 add t0, strideq inc hq jl .w8_loop %if ARCH_X86_32 mov t0, r0m mov tlq, r1m add h_weightsq, 16*2 mov hq, r2m sub dword wm, 8 jg .w8_loop0 %else mov tlq, r8 mov hq, r9 add wq, 8 jl .w8_loop0 %endif %if WIN64 movaps m8, r4m movaps m9, r6m %endif RET %if ARCH_X86_64 cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx %define base r7-$$ %define bdmaxm r8m lea r7, [$$] %else cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx %define base r1-$$ %define stridemp [rsp+4*0] %define bdmaxm [rsp+4*1] mov r3, r8m mov stridemp, r1 mov bdmaxm, r3 LEA r1, $$ %endif tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm add tlq, 2 movsxd wq, [base+ipred_z1_16bpc_ssse3_table+wq*4] mov dxd, angled movddup m0, [base+pw_256] and dxd, 0x7e movddup m7, [base+pw_62] add angled, 165 ; ~90 lea wq, [base+wq+ipred_z1_16bpc_ssse3_table] movzx dxd, word [base+dr_intra_derivative+dxq] xor angled, 0x4ff ; d = 90 - angle jmp wq .w4: lea r3d, [angleq+88] test r3d, 0x480 jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r3d, 9 add r3d, hd cmp r3d, 8 jg .w4_no_upsample ; h > 8 || (w == h && is_sm) movd m3, [tlq+14] movu m2, [tlq+ 0] ; 1 2 3 4 5 6 7 8 movd m1, bdmaxm pshufb m3, m0 palignr m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8 paddw m4, [tlq- 2] ; 0 1 2 3 4 5 6 7 add dxd, dxd mova [rsp+32], m3 palignr m3, m2, 2 ; 2 3 4 5 6 7 8 8 pshufb m1, m0 paddw m3, m2 ; -1 * a + 9 * b + 9 * c + -1 * d psubw m5, m3, m4 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 movd m4, dxd psraw m5, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 paddw m3, m5 pxor m5, m5 pmaxsw m3, m5 mov r3d, dxd pavgw m3, m5 pshufb m4, m0 pminsw m3, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 mova m3, [base+z_upsample] movifnidn strideq, stridemp mova [rsp+ 0], m1 paddw m5, m4, m4 mova [rsp+16], m2 punpcklqdq m4, m5 ; xpos0 xpos1 .w4_upsample_loop: lea r2d, [r3+dxq] shr r3d, 6 ; base0 movu m1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base1 movu m2, [rsp+r2*2] pshufb m1, m3 pshufb m2, m3 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m7, m4 ; frac psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) paddw m4, m5 ; xpos += dx paddw m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_upsample_loop RET .w4_no_upsample: mov r3d, 7 ; max_base test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea r3d, [hq+3] movd m1, r3d movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 pcmpeqb m1, [base+z_filt_wh4] pand m1, m3 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] pmovmskb r5d, m1 mov r3d, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x55555555 movd m3, [tlq+r3*2] shr r5d, 30 ; filter_strength movd [rsp+12], m1 pshuflw m3, m3, q0000 mova [rsp+16*1], m2 lea r2d, [r3+2] movq [rsp+r3*2+18], m3 cmp hd, 8 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w4_main: lea tlq, [tlq+r3*2] movd m4, dxd movddup m1, [base+z_base_inc] ; base_inc << 6 movd m6, [tlq] ; top[max_base_x] shl r3d, 6 movd m3, r3d pshufb m4, m0 mov r5d, dxd ; xpos pshufb m6, m0 sub r5, r3 pshufb m3, m0 paddw m5, m4, m4 psubw m3, m1 ; max_base_x punpcklqdq m4, m5 ; xpos0 xpos1 movifnidn strideq, stridemp .w4_loop: lea r3, 
[r5+dxq] sar r5, 6 ; base0 movq m0, [tlq+r5*2+0] movq m1, [tlq+r5*2+2] lea r5, [r3+dxq] sar r3, 6 ; base1 movhps m0, [tlq+r3*2+0] movhps m1, [tlq+r3*2+2] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 ; xpos < max_base_x paddw m4, m5 ; xpos += dx paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jz .w4_end lea dstq, [dstq+strideq*2] test r5d, r5d jl .w4_loop .w4_end_loop: movq [dstq+strideq*0], m6 movq [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_end_loop .w4_end: RET .w8: lea r3d, [angleq+88] and r3d, ~0x7f or r3d, hd cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 movu m1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 movu m5, [tlq+ 2] ; 2 3 4 5 6 7 8 9 movu m3, [tlq+ 4] ; 3 4 5 6 7 8 9 a paddw m5, m1 paddw m3, [tlq- 2] ; 0 1 2 3 4 5 6 7 psubw m2, m5, m3 movu m6, [tlq+18] ; a b c d e f g _ psraw m2, 3 movu m3, [tlq+20] ; b c d e f g _ _ paddw m5, m2 movu m2, [tlq+16] ; 9 a b c d e f g paddw m6, m2 add dxd, dxd cmp hd, 4 jne .w8_upsample_h8 ; awkward single-pixel edge case pshuflw m3, m3, q1110 ; b c c _ _ _ _ _ .w8_upsample_h8: paddw m3, [tlq+14] ; 8 9 a b c d e f psubw m4, m6, m3 movd m3, bdmaxm psraw m4, 3 mov r3d, dxd paddw m6, m4 pxor m4, m4 pmaxsw m5, m4 pmaxsw m6, m4 pshufb m3, m0 pavgw m5, m4 pavgw m6, m4 movd m4, dxd pminsw m5, m3 pminsw m6, m3 mova m3, [base+z_upsample] pshufb m4, m0 movifnidn strideq, stridemp punpcklwd m0, m1, m5 mova [rsp+ 0], m0 punpckhwd m1, m5 mova [rsp+16], m1 punpcklwd m0, m2, m6 mova [rsp+32], m0 punpckhwd m2, m6 mova [rsp+48], m2 mova m5, m4 .w8_upsample_loop: mov r2d, r3d shr r2d, 6 movu m1, [rsp+r2*2+ 0] movu m2, [rsp+r2*2+16] add r3d, dxd pshufb m1, m3 pshufb m2, m3 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w8_upsample_loop RET .w8_no_upsample: lea r3d, [hq+7] movd m1, r3d and r3d, 7 or r3d, 8 ; imin(h+7, 15) test angled, 0x400 jnz .w8_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movu m2, [base+z_filt_wh8] psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pcmpeqb m2, m1 pand m2, m3 pcmpgtb m2, m4 pmovmskb r5d, m2 test r5d, r5d jz .w8_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x55555555 movu m3, [tlq+16*1] movd m4, [tlq+r3*2] shr r5d, 30 ; filter_strength movd [rsp+12], m1 mova [rsp+16*1], m2 pshuflw m4, m4, q0000 mova [rsp+16*2], m3 lea r2d, [r3+2] movq [rsp+r3*2+18], m4 cmp hd, 16 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w8_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 movifnidn strideq, stridemp .w8_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+0] movu m1, [tlq+r3*2+2] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m4, 15 ; xpos < max_base_x paddw m4, m5 ; xpos += dx paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [dstq], m0 dec hd jz .w8_end add dstq, strideq add r5, dxq jl .w8_loop .w8_end_loop: mova [dstq], m6 add dstq, strideq dec hd jg .w8_end_loop .w8_end: RET .w16: %if ARCH_X86_32 %define strideq r3 %endif lea r3d, [hq+15] movd m1, r3d and r3d, 15 or r3d, 16 ; imin(h+15, 31) test angled, 0x400 jnz .w16_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, 
m2 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m1, [base+z_filt_wh16] pand m1, m3 pcmpgtb m1, m4 pmovmskb r5d, m1 test r5d, r5d jz .w16_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x24924924 movu m3, [tlq+16*1] movu m4, [tlq+16*2] shr r5d, 30 movu m5, [tlq+16*3] movd m6, [tlq+r3*2] adc r5d, -1 ; filter_strength movd [rsp+12], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 pshuflw m6, m6, q0000 mova [rsp+16*3], m4 mova [rsp+16*4], m5 lea r2d, [r3+2] movq [rsp+r3*2+18], m6 cmp hd, 32 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w16_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w16_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 movddup m3, [base+pw_m512] paddw m1, m2 psraw m2, m4, 15 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*0], m0 por m1, m3 mova [dstq+16*1], m1 dec hd jz .w16_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w16_loop .w16_end_loop: mova [dstq+16*0], m6 mova [dstq+16*1], m6 add dstq, strideq dec hd jg .w16_end_loop .w16_end: RET .w32: lea r3d, [hq+31] and r3d, 31 or r3d, 32 ; imin(h+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main call .filter_copy lea r5d, [r3+2] cmp hd, 64 cmove r3d, r5d call .filter_edge_s3 .w32_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w32_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 psraw m2, m4, 15 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m512] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*0], m0 por m1, m2 mova [dstq+16*1], m1 movu m0, [tlq+r3*2+32] movu m2, [tlq+r3*2+34] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+48] paddw m0, m2 movu m2, [tlq+r3*2+50] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m1024] movddup m3, [base+pw_m1536] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*2], m0 por m1, m3 mova [dstq+16*3], m1 dec hd jz .w32_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w32_loop .w32_end_loop: REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: lea r3d, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main call .filter_copy call .filter_edge_s3 .w64_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w64_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 psraw m2, m4, 15 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, 
[base+pw_m512] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*0], m0 por m1, m2 mova [dstq+16*1], m1 movu m0, [tlq+r3*2+32] movu m2, [tlq+r3*2+34] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+48] paddw m0, m2 movu m2, [tlq+r3*2+50] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m1024] pcmpgtw m2, m4 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m1536] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*2], m0 por m1, m2 mova [dstq+16*3], m1 movu m0, [tlq+r3*2+64] movu m2, [tlq+r3*2+66] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+80] paddw m0, m2 movu m2, [tlq+r3*2+82] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m2048] pcmpgtw m2, m4 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m2560] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*4], m0 por m1, m2 mova [dstq+16*5], m1 movu m0, [tlq+r3*2+96] movu m2, [tlq+r3*2+98] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+112] paddw m0, m2 movu m2, [tlq+r3*2+114] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m3072] movddup m3, [base+pw_m3584] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*6], m0 por m1, m3 mova [dstq+16*7], m1 dec hd jz .w64_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w64_loop .w64_end_loop: REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET ALIGN function_align .filter_copy: pshuflw m2, [tlq-2], q0000 pshuflw m3, [tlq+r3*2], q0000 xor r5d, r5d movd [rsp+gprsize+12], m2 .filter_copy_loop: movu m1, [tlq+r5*2+16*0] movu m2, [tlq+r5*2+16*1] add r5d, 16 mova [rsp+r5*2+gprsize-16*1], m1 mova [rsp+r5*2+gprsize-16*0], m2 cmp r5d, r3d jle .filter_copy_loop lea tlq, [rsp+gprsize+16*1] movq [tlq+r3*2+2], m3 ret .filter_edge: cmp r5d, 3 je .filter_edge_s3 movddup m4, [base+z_filt_k+r5*8-8] movddup m5, [base+z_filt_k+r5*8+8] xor r5d, r5d movddup m6, [base+pw_8] movu m2, [tlq-2] jmp .filter_edge_start .filter_edge_loop: movu m2, [tlq+r5*2-2] mova [tlq+r5*2-16], m1 .filter_edge_start: pmullw m1, m4, [tlq+r5*2] movu m3, [tlq+r5*2+2] paddw m2, m3 pmullw m2, m5 add r5d, 8 paddw m1, m6 paddw m1, m2 psrlw m1, 4 cmp r5d, r3d jl .filter_edge_loop mova [tlq+r5*2-16], m1 ret .filter_edge_s3: movddup m5, [base+pw_3] xor r5d, r5d movu m2, [tlq-2] movu m3, [tlq-4] jmp .filter_edge_s3_start .filter_edge_s3_loop: movu m2, [tlq+r5*2-2] movu m3, [tlq+r5*2-4] mova [tlq+r5*2-16], m1 .filter_edge_s3_start: paddw m2, [tlq+r5*2+0] paddw m3, m5 movu m1, [tlq+r5*2+2] movu m4, [tlq+r5*2+4] add r5d, 8 paddw m1, m2 pavgw m3, m4 paddw m1, m3 psrlw m1, 2 cmp r5d, r3d jl .filter_edge_s3_loop mova [tlq+r5*2-16], m1 ret %if ARCH_X86_64 cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy %define base r7-$$ %define maxwm r6m %define maxhm r7m %define bdmaxm r8m lea r7, [$$] mov hd, hm movddup m8, [base+pw_62] lea r9d, [wq-4] shl r9d, 6 mova m9, [base+z2_top_shufA] or r9d, hd mova m10, [base+z2_left_shufA] %else cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx %define base r1-$$ %define r9b byte [rsp+16*26+4*0] %define r9d dword [rsp+16*26+4*0] %define r10d dword [rsp+16*26+4*1] %define r11d dword [rsp+16*26+4*2] %define maxwm [rsp+16*2+4*0] %define maxhm [rsp+16*2+4*1] %define bdmaxm [rsp+16*2+4*2] %define stridemp [rsp+16*26+4*3] %define strideq r3 %define dyd r4 %define dyq r4 mov stridemp, r1 mov r1d, r6m mov r4d, r7m mov r5d, r8m mov maxwm, r1d mov maxhm, r4d mov bdmaxm, r5d LEA r1, 
$$ lea hd, [wq-4] mova m0, [base+z2_top_shufA] shl hd, 6 mova m1, [base+z2_left_shufA] or hd, hm mova [rsp+16*24], m0 mov r9d, hd mova [rsp+16*25], m1 %endif tzcnt wd, wd movifnidn angled, anglem mova m0, [tlq-16*8] mova m1, [tlq-16*7] mova m2, [tlq-16*6] mova m3, [tlq-16*5] movsxd wq, [base+ipred_z2_16bpc_ssse3_table+wq*4] %if ARCH_X86_64 movzx dxd, angleb %else movzx dxd, byte anglem %endif mova m4, [tlq-16*4] mova m5, [tlq-16*3] mova m6, [tlq-16*2] mova m7, [tlq-16*1] mova [rsp+16* 5], m0 xor angled, 0x400 mova [rsp+16* 6], m1 mov dyd, dxd mova [rsp+16* 7], m2 neg dxq mova [rsp+16* 8], m3 and dyd, ~1 mova [rsp+16* 9], m4 and dxq, ~1 mova [rsp+16*10], m5 lea wq, [base+ipred_z2_16bpc_ssse3_table+wq] mova [rsp+16*11], m6 pxor m3, m3 mova [rsp+16*12], m7 movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle movddup m0, [base+pw_256] ; 4<<6 movd m4, [tlq] movu m5, [tlq+16*0+2] movu m6, [tlq+16*1+2] movsldup m1, [base+z2_dy_offset] pshufb m4, m0 movq m7, [base+z_base_inc+2] mov r11d, (112-4)<<6 mova [rsp+16*13], m4 neg dxd mova [rsp+16*14], m5 or dyd, 4<<16 mova [rsp+16*15], m6 %if ARCH_X86_64 lea r10d, [dxq+(112<<6)] ; xpos %else mov [rsp+8*3], dyd lea r4d, [dxq+(112<<6)] mov r10d, r4d movzx hd, r9b %endif movq [rsp+8*0], m1 movq [rsp+8*1], m0 movq [rsp+8*2], m7 jmp wq .w4: test angled, 0x400 jnz .w4_main lea r3d, [hq+2] add angled, 1022 pshuflw m1, m5, q3333 shl r3d, 6 movq [rsp+16*14+8], m1 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle movd m2, r3d movd m7, angled shr angled, 8 ; is_sm << 1 pshufb m2, m3 pshufb m7, m3 pcmpeqb m2, [base+z_filt_wh4] pand m7, m2 pcmpgtb m7, [base+z_filt_t_w48+angleq*8] jmp .w8_filter_left .upsample_above: ; w4/w8 paddw m2, m5, [tlq] movu m1, [rsp+gprsize+16*14+2] movu m4, [rsp+gprsize+16*14-4] %if ARCH_X86_64 movd m6, r9m ; bdmax, offset due to call %else movd m6, [rsp+gprsize+16*2+4*2] %endif paddw m4, m1 psubw m1, m2, m4 pshufb m6, m0 psraw m1, 3 paddw m2, m1 add dxd, dxd pmaxsw m2, m3 paddw m7, m7 pavgw m2, m3 pminsw m2, m6 %if ARCH_X86_64 mova m9, [base+z2_top_shufB] lea r10d, [dxq+(113<<6)] mov r11d, (112-7)<<6 %else mova m1, [base+z2_top_shufB] lea r3d, [dxq+(113<<6)] mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6 mov [rsp+gprsize+16*26+4*1], r3d mova [rsp+gprsize+16*24], m1 %endif punpcklwd m1, m2, m5 punpckhwd m2, m5 movq [rsp+gprsize+8*2], m7 mova [rsp+gprsize+16*14], m1 mova [rsp+gprsize+16*15], m2 ret .w4_no_upsample_above: lea r3d, [hq+3] mov [rsp+16*4], angled sub angled, 1112 ; angle - 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 mova m4, [base+z_filt_wh4] movd m7, r3d mova m5, [base+z_filt_t_w48+angleq*8] mov r3d, 4 call .w8_filter_top mov angled, [rsp+16*4] lea r3d, [hq+2] sub angled, 139 shl r3d, 6 test r3d, angled jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) .upsample_left: ; w4/w8 mova m2, [tlq-16] lea r3d, [hq-4] movu m3, [tlq-14] movu m4, [rsp+16*12+4] pshufb m1, m2, [base+z2_upsample_l+r3*4] movd m6, bdmaxm pxor m5, m5 paddw m3, m2 paddw m4, m1 psubw m1, m3, m4 movshdup m4, [base+z2_dy_offset] psraw m1, 3 pshufb m6, m0 paddw m3, m1 pmaxsw m3, m5 pavgw m3, m5 pminsw m3, m6 %if ARCH_X86_64 mova m10, [base+z2_left_shufB] add dyd, dyd %else mova m1, [base+z2_left_shufB] shl dword [rsp+8*3], 1 mova [rsp+16*25], m1 %endif punpckhwd m1, m2, m3 punpcklwd m2, m3 
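; Explanatory note: edge upsampling doubles the left-edge resolution with
; the 4-tap (-1 9 9 -1)/16 interpolator before sampling, and dy is doubled
; to step through the finer grid. For the inserted samples (in and out are
; illustrative names):
;
;   out[2*i]   = in[i];
;   out[2*i+1] = clip((-in[i-1] + 9*in[i] + 9*in[i+1] - in[i+2] + 8) >> 4);
;
; computed here as b + c + ((b+c-a-d) >> 3), a pavgw with zero for the
; final rounded >>1, and pmaxsw/pminsw against 0/bdmax for the clip.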
movq [rsp+8*0], m4 mova [rsp+16*12], m1 mova [rsp+16*11], m2 .w4_main: movd m6, dxd %if ARCH_X86_64 movd m3, dyd %else movd m3, [rsp+8*3] %endif pshufb m6, m0 movddup m0, [rsp+8*2] paddw m7, m6, m6 movq m5, [base+pw_m1to4] pshuflw m4, m3, q0000 punpcklqdq m6, m7 pmullw m4, m5 pshuflw m3, m3, q1111 paddw m6, m0 mov r2d, r10d pshuflw m0, m4, q3333 psubw m4, [rsp+8*0] movq [rsp+8*3], m3 movq [rsp+8*5], m0 ; dy*4 mov r5, dstq .w4_loop0: mova [rsp+16*4], m6 movq [rsp+8*4], m4 %if ARCH_X86_64 pand m0, m8, m4 %else movq m0, [base+pw_62] pand m0, m4 %endif psraw m4, 6 psllw m0, 9 ; frac_y << 9 movq [rsp+8*7], m0 pabsw m4, m4 movq [rsp+8*6], m4 movzx hd, r9b .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu m2, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu m1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movu m3, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movu m4, [rsp+r3*2] %if ARCH_X86_64 REPX {pshufb x, m9}, m2, m1, m3, m4 %else mova m0, [rsp+16*24] REPX {pshufb x, m0}, m2, m1, m3, m4 %endif punpcklqdq m0, m2, m1 punpckhqdq m2, m1 punpcklqdq m1, m3, m4 punpckhqdq m3, m4 %if ARCH_X86_64 pand m5, m8, m6 %else movddup m5, [base+pw_62] pand m5, m6 %endif psllw m5, 9 psubw m2, m0 pmulhrsw m2, m5 paddw m5, m6, m7 psubw m3, m1 paddw m0, m2 %if ARCH_X86_64 pand m2, m8, m5 %else movddup m2, [base+pw_62] pand m2, m5 %endif psllw m2, 9 pmulhrsw m3, m2 paddw m1, m3 cmp r3d, 111 ; topleft jge .w4_toponly mova [rsp+16*22], m0 mova [rsp+16*23], m1 movzx r3d, byte [rsp+8*6+0] ; base_y0 movu m3, [rsp+r3*2] movzx r3d, byte [rsp+8*6+2] ; base_y1 movu m2, [rsp+r3*2] movzx r3d, byte [rsp+8*6+4] ; base_y2 movu m4, [rsp+r3*2] movzx r3d, byte [rsp+8*6+6] ; base_y3 movu m0, [rsp+r3*2] %if ARCH_X86_64 REPX {pshufb x, m10}, m3, m2, m4, m0 %else mova m1, [rsp+16*25] REPX {pshufb x, m1}, m3, m2, m4, m0 %endif punpcklwd m1, m3, m2 punpckhwd m3, m2 ; 01 punpcklwd m2, m4, m0 punpckhwd m4, m0 ; 23 punpckldq m0, m1, m2 ; y0 d1 punpckhdq m1, m2 ; y2 y3 punpckldq m2, m3, m4 punpckhdq m3, m4 movddup m4, [rsp+8*7] psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m4 psraw m6, 15 ; base_x < topleft psraw m4, m5, 15 paddw m0, m2 paddw m1, m3 pand m0, m6 pandn m6, [rsp+16*22] pand m1, m4 pandn m4, [rsp+16*23] por m0, m6 por m1, m4 .w4_toponly: movifnidn strideq, stridemp movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jz .w4_end movq m4, [rsp+8*6] paddsw m6, m5, m7 ; xpos += dx movq m5, [rsp+8*3] psubw m4, m5 lea dstq, [dstq+strideq*2] movq [rsp+8*6], m4 cmp r2d, r11d jge .w4_loop .w4_leftonly_loop: movzx r2d, byte [rsp+8*6+0] ; base_y0 movu m3, [rsp+r2*2] movzx r2d, byte [rsp+8*6+2] ; base_y1 movu m2, [rsp+r2*2] movzx r2d, byte [rsp+8*6+4] ; base_y2 movu m6, [rsp+r2*2] movzx r2d, byte [rsp+8*6+6] ; base_y3 movu m0, [rsp+r2*2] psubw m4, m5 %if ARCH_X86_64 REPX {pshufb x, m10}, m3, m2, m6, m0 %else mova m1, [rsp+16*25] REPX {pshufb x, m1}, m3, m2, m6, m0 %endif movq [rsp+8*6], m4 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m6, m0 punpckhwd m6, m0 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m6 punpckhdq m3, m6 movddup m6, [rsp+8*7] psubw m2, m0 psubw m3, m1 pmulhrsw m2, m6 pmulhrsw m3, m6 paddw m0, m2 paddw m1, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w4_leftonly_loop .w4_end: sub r9d, 1<<8 jl .w4_ret movq m4, [rsp+8*5] add r5, 8 mov dstq, r5 paddw m4, 
[rsp+8*4] ; base_y += 4*dy movzx r2d, word [rsp+8*1] movddup m6, [rsp+8*1] paddw m6, [rsp+16*4] ; base_x += (4 << upsample_above) add r2d, r10d mov r10d, r2d jmp .w4_loop0 .w4_ret: RET .w8: test angled, 0x400 jnz .w4_main lea r3d, [angleq+126] pshufhw m1, m5, q3333 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif movhps [rsp+16*15], m1 cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm call .upsample_above sub angled, 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle movu m1, [base+z_filt_wh8] movd m2, r3d movd m7, angled shr angled, 8 ; is_sm << 1 psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pshufb m2, m3 pshufb m7, m3 pcmpeqb m2, m1 movq m1, [base+pw_512] pand m7, m2 pcmpgtb m7, m4 movq [rsp+8*1], m1 ; 8<<6 jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] mov [rsp+16*4], angled sub angled, 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m4, [base+z_filt_wh8] movd m7, r3d psrldq m5, [base+z_filt_t_w48+angleq*8], 4 mov r3d, 8 call .w8_filter_top mov r3d, [rsp+16*4] sub r3d, 141 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif cmp r3d, 8 jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm .w8_filter_left: pmovmskb r5d, m7 test r5d, r5d jz .w4_main imul r5d, 0x55555555 neg hq mov r3, tlq movd m1, [tlq+hq*2] shr r5d, 30 ; filter_strength lea tlq, [rsp+16*13-2] pshuflw m1, m1, q0000 movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge jmp .filter_left_end .w8_filter_top: REPX {pshufb x, m3}, m2, m1, m7 pcmpeqb m2, m4 pand m1, m2 pand m7, m2 pcmpgtb m1, m5 pcmpgtb m7, m5 pmovmskb r5d, m1 test r5d, r5d jz .w8_filter_top_end ; filter_strength == 0 imul r5d, 0x55555555 mov [dstq], tlq lea tlq, [rsp+16*14+gprsize] shr r5d, 30 ; filter_strength call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge %if ARCH_X86_64 mov r3d, r7m ; maxw, offset due to call %else mov r3d, [rsp+16*2+4*1] %endif mov tlq, [dstq] cmp r3d, 8 jge .w8_filter_top_end movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14+gprsize], m1 movu [rsp+r3*2+16*15+gprsize], m2 .w8_filter_top_end: ret .w16: test angled, 0x400 jnz .w4_main lea r3d, [hq+15] sub angled, 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movd m7, r3d REPX {pshufb x, m3}, m2, m1, m7 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m2, [base+z_filt_wh16] pand m1, m2 pand m7, m2 pcmpgtb m1, m4 pcmpgtb m7, m4 pmovmskb r5d, m1 test r5d, r5d jz .w16_filter_left ; filter_strength == 0 imul r5d, 0x24924924 pshufhw m6, m6, q3333 mov [dstq], tlq lea tlq, [rsp+16*14] shr r5d, 30 movhps [tlq+16*2], m6 adc r5d, -1 ; filter_strength mov r3d, 16 call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge mov r3d, maxwm mov tlq, [dstq] cmp r3d, 16 jge .w16_filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 .w16_filter_left: pmovmskb r5d, m7 test r5d, r5d jz .w4_main imul r5d, 0x24924924 neg hq mov r3, tlq movd m1, [tlq+hq*2] shr r5d, 30 lea tlq, [rsp+16*13-2] pshuflw m1, m1, q0000 adc r5d, -1 ; filter_strength movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge jmp .filter_left_end .w32: movu m1, [tlq+16*2+2] movu m2, [tlq+16*3+2] mova [rsp+16*16], m1 mova [rsp+16*17], m2 test angled, 0x400 jnz .w4_main mov [dstq], tlq lea tlq, [rsp+16*14] pshufhw m2, m2, q3333 mov r3d, 32 movhps [tlq+16*4], m2 call mangle(private_prefix %+ 
_ipred_z1_16bpc_ssse3).filter_edge_s3 mov r3d, maxwm mov tlq, [dstq] cmp r3d, 32 jge .filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 cmp r3d, 16 jge .filter_left movu m1, [tlq+r3*2+16*2+2] movu m2, [tlq+r3*2+16*3+2] movu [rsp+r3*2+16*16], m1 movu [rsp+r3*2+16*17], m2 .filter_left: neg hq mov r3, tlq pshuflw m1, [tlq+hq*2], q0000 lea tlq, [rsp+16*13-2] movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3 .filter_left_end: mov r2d, maxhm cmp r2d, hd jge .w4_main neg r2 movu m1, [r3+r2*2-16*1] movu m2, [r3+r2*2-16*2] movu [rsp+r2*2+16*12], m1 movu [rsp+r2*2+16*11], m2 cmp r2d, -48 jle .w4_main movu m1, [r3+r2*2-16*3] movu m2, [r3+r2*2-16*4] movu [rsp+r2*2+16*10], m1 movu [rsp+r2*2+16* 9], m2 cmp r2d, -32 jle .w4_main movu m1, [r3+r2*2-16*5] movu m2, [r3+r2*2-16*6] movu [rsp+r2*2+16* 8], m1 movu [rsp+r2*2+16* 7], m2 cmp r2d, -16 jle .w4_main movu m1, [r3+r2*2-16*7] movu m2, [r3+r2*2-16*8] movu [rsp+r2*2+16* 6], m1 movu [rsp+r2*2+16* 5], m2 jmp .w4_main .w64: movu m1, [tlq+16*2+2] movu m2, [tlq+16*3+2] movu m3, [tlq+16*4+2] movu m4, [tlq+16*5+2] movu m5, [tlq+16*6+2] movu m6, [tlq+16*7+2] mov [dstq], tlq lea tlq, [rsp+16*14] mova [tlq+16*2], m1 mova [tlq+16*3], m2 mova [tlq+16*4], m3 mova [tlq+16*5], m4 mova [tlq+16*6], m5 mova [tlq+16*7], m6 test angled, 0x400 jnz .w4_main pshufhw m6, m6, q3333 mov r3d, 64 movhps [tlq+16*8], m6 call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 mov r3d, maxwm mov tlq, [dstq] cmp r3d, 64 jge .filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 cmp r3d, 48 jge .filter_left movu m1, [tlq+r3*2+16*2+2] movu m2, [tlq+r3*2+16*3+2] movu [rsp+r3*2+16*16], m1 movu [rsp+r3*2+16*17], m2 cmp r3d, 32 jge .filter_left movu m1, [tlq+r3*2+16*4+2] movu m2, [tlq+r3*2+16*5+2] movu [rsp+r3*2+16*18], m1 movu [rsp+r3*2+16*19], m2 cmp r3d, 16 jge .filter_left movu m1, [tlq+r3*2+16*6+2] movu m2, [tlq+r3*2+16*7+2] movu [rsp+r3*2+16*20], m1 movu [rsp+r3*2+16*21], m2 jmp .filter_left %if ARCH_X86_64 cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w %define base r7-$$ lea r7, [$$] mov org_wd, wd %else cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy %define base r1-$$ %define org_wd r5 %define org_wq r5 movd m6, r8m ; pixel_max mov [dstq+4*0], strideq LEA r1, $$ mov [dstq+4*1], wd %endif tzcnt hd, hm movifnidn angled, anglem sub tlq, 2 movsxd hq, [base+ipred_z3_16bpc_ssse3_table+hq*4] sub angled, 180 movddup m0, [base+pw_256] mov dyd, angled neg dyd xor angled, 0x400 movddup m7, [base+pw_62] or dyq, ~0x7e lea hq, [base+ipred_z3_16bpc_ssse3_table+hq] movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] jmp hq .h4: lea r4d, [angleq+88] test r4d, 0x480 jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r4d, 9 add r4d, wd cmp r4d, 8 jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) mova m2, [tlq-14] ; 7 6 5 4 3 2 1 0 movu m3, [tlq-12] ; 8 7 6 5 4 3 2 1 %if ARCH_X86_64 movd m6, r8m %endif pshufb m4, m2, m0 mov tlq, rsp palignr m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2 add dyd, dyd palignr m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3 paddw m1, m2 paddw m3, m5 psubw m5, m1, m3 mova m3, [base+z_upsample] mova [tlq+ 0], m4 movd m4, dyd psraw m5, 3 neg dyd paddw m1, m5 pxor m5, m5 lea r5d, [dyq+(16<<6)+63] ; ypos pmaxsw m1, m5 pshufb m6, m0 shl wd, 3 pavgw m1, m5 pshufb m4, m0 pminsw m1, m6 sub rsp, wq punpckhwd m0, m1, m2 paddw m5, m4, m4 punpcklwd m1, m2 
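; Explanatory note: Z3 (bottom-left angles) mirrors Z1 but steps along the
; left edge with dy instead of dx, writing the block in column order into
; a stack buffer; .end_transpose then rotates those columns into rows in dst.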
mova [tlq+32], m0 movsd m4, m5 mova [tlq+16], m1 .h4_upsample_loop: lea r4d, [r5+dyq] sar r5d, 6 movu m2, [tlq+r5*2] lea r5d, [r4+dyq] sar r4d, 6 movu m1, [tlq+r4*2] pshufb m2, m3 pshufb m1, m3 punpckhqdq m0, m1, m2 punpcklqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jg .h4_upsample_loop or r3d, 4*2 jmp .end_transpose .h4_no_upsample: mov r4d, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea r4d, [wq+3] movd m1, r4d movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 pcmpeqb m1, [base+z_filt_wh4] pand m1, m3 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] pmovmskb r5d, m1 mov r4d, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 imul r5d, 0x55555555 mova m2, [tlq-14] neg r4 movd m3, [tlq+r4*2] shr r5d, 30 movd [rsp+16*17], m1 pshuflw m3, m3, q0000 mova [rsp+16*16], m2 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m3 cmp wd, 8 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h4_main: movd m4, dyd sub tlq, r4 movddup m1, [base+z_base_inc_z2+8] ; base_inc << 6 sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m4, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] ; ypos pshufb m3, m0 shl wd, 3 paddw m5, m4, m4 sub rsp, wq psubw m3, m1 ; max_base_y movsd m4, m5 ; ypos1 ypos0 .h4_loop: lea r4, [r5+dyq] sar r5, 6 movddup m0, [tlq+r5*2-6] movddup m1, [tlq+r5*2-8] lea r5, [r4+dyq] sar r4, 6 movlps m0, [tlq+r4*2-6] movlps m1, [tlq+r4*2-8] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 paddw m4, m5 paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [rsp+wq-16], m0 sub wd, 16 jz .h4_transpose test r5d, r5d jg .h4_loop .h4_end_loop: mova [rsp+wq-16], m6 sub wd, 16 jg .h4_end_loop .h4_transpose: or r3d, 4*2 jmp .end_transpose .h8: lea r4d, [angleq+88] and r4d, ~0x7f or r4d, wd cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 mova m2, [tlq-30] ; g f e d c b a 9 movu m1, [tlq-32] ; _ g f e d c b a movu m3, [tlq-16] ; 9 8 7 6 5 4 3 2 paddw m3, [tlq-14] ; 8 7 6 5 4 3 2 1 pshufd m4, m2, q2100 ; _ _ g f e d c b paddw m1, m2 movu m5, [tlq-28] ; f e d c b a 9 8 add dyd, dyd cmp wd, 8 je .h8_upsample_w8 pshufhw m4, m2, q1000 ; _ _ _ _ c c c b .h8_upsample_w8: paddw m4, m5 psubw m5, m1, m4 movu m4, [tlq-18] ; a 9 8 7 6 5 4 3 psraw m5, 3 paddw m1, m5 movu m5, [tlq-12] ; 7 6 5 4 3 2 1 0 %if ARCH_X86_64 movd m6, r8m ; pixel_max %endif paddw m4, m5 shl wd, 4 psubw m5, m3, m4 movd m4, dyd psraw m5, 3 neg dyd paddw m3, m5 pshufb m6, m0 mova m5, [tlq-14] pshufb m4, m0 pxor m0, m0 pmaxsw m1, m0 pmaxsw m3, m0 mov tlq, rsp pavgw m1, m0 pavgw m3, m0 sub rsp, wq pminsw m1, m6 pminsw m6, m3 mova m3, [base+z_upsample] lea r5d, [dyq+(16<<6)+63] ; ypos punpcklwd m0, m1, m2 mova [tlq+16*0], m0 punpckhwd m1, m2 mova [tlq+16*1], m1 punpcklwd m0, m6, m5 mova [tlq+16*2], m0 punpckhwd m6, m5 mova [tlq+16*3], m6 mova m5, m4 .h8_upsample_loop: mov r4d, r5d sar r4d, 6 movu m1, [tlq+r4*2+16*0] movu m2, [tlq+r4*2+16*1] add r5d, dyd pshufb m2, m3 pshufb m1, m3 punpckhqdq m0, m1, m2 punpcklqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jg .h8_upsample_loop or r3d, 8*2 jmp .end_transpose .h8_no_upsample: lea r4d, [wq+7] movd m1, r4d and r4d, 7 or r4d, 8 ; imin(w+7, 15) test angled, 0x400 jnz .h8_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movu m2, [base+z_filt_wh8] psrldq m4, 
[base+z_filt_t_w48+angleq*8], 4 pcmpeqb m2, m1 pand m2, m3 pcmpgtb m2, m4 pmovmskb r5d, m2 test r5d, r5d jz .h8_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 imul r5d, 0x55555555 mova m2, [tlq-16*1+2] neg r4 mova m3, [tlq-16*2+2] shr r5d, 30 movd m4, [tlq+r4*2] movd [rsp+16*17], m1 mova [rsp+16*16], m2 pshuflw m4, m4, q0000 mova [rsp+16*15], m3 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m4 cmp wd, 16 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h8_main: sub tlq, r4 movd m4, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m4, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 shl wd, 4 mova m5, m4 sub rsp, wq psubw m3, [base+z_base_inc_z2] .h8_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m1, [tlq+r4*2-16] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 paddw m4, m5 paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [rsp+wq-16], m0 sub wd, 8*2 jz .h8_transpose add r5, dyq jg .h8_loop .h8_end_loop: mova [rsp+wq-16], m6 sub wd, 8*2 jg .h8_end_loop .h8_transpose: or r3d, 8*2 jmp .end_transpose .h16: lea r4d, [wq+15] movd m1, r4d and r4d, 15 or r4d, 16 ; imin(w+15, 31) test angled, 0x400 jnz .h16_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m1, [base+z_filt_wh16] pand m1, m3 pcmpgtb m1, m4 pmovmskb r5d, m1 test r5d, r5d jz .h16_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 mova m2, [tlq-16*1+2] imul r5d, 0x24924924 mova m3, [tlq-16*2+2] neg r4 mova m4, [tlq-16*3+2] shr r5d, 30 mova m5, [tlq-16*4+2] movd m6, [tlq+r4*2] adc r5d, -1 ; filter_strength movd [rsp+16*17], m1 mova [rsp+16*16], m2 mova [rsp+16*15], m3 pshuflw m6, m6, q0000 mova [rsp+16*14], m4 mova [rsp+16*13], m5 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m6 cmp wd, 32 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h16_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 shl wd, 5 paddw m4, m5, [base+z_base_inc_z2] sub rsp, wq psubw m4, m3 .h16_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m2, [tlq+r4*2-16] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r4*2-30] paddw m0, m2 movu m2, [tlq+r4*2-32] psubw m2, m1 pmulhrsw m2, m3 movddup m3, [base+pw_m512] paddw m1, m2 psraw m2, m4, 15 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+wq-16*1], m0 por m1, m3 mova [rsp+wq-16*2], m1 sub wd, 16*2 jz .h16_transpose add r5, dyq jg .h16_loop .h16_end_loop: mova [rsp+wq-16*1], m6 mova [rsp+wq-16*2], m6 sub wd, 16*2 jg .h16_end_loop .h16_transpose: or r3d, 16*2 jmp .end_transpose .h32: lea r4d, [wq+31] and r4d, 31 or r4d, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main call .filter_copy lea r5, [r4-2] cmp wd, 64 cmove r4, r5 call .filter_edge_s3 .h32_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 paddw m4, m5, [base+z_base_inc_z2] psubw m4, m3 .h32_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m3, [tlq+r4*2-16] pand m2, m7, m4 psllw m2, 9 psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-30] paddw m0, m3 movu m3, [tlq+r4*2-32] psubw m3, m1 pmulhrsw m3, m2 sub rsp, 16*4 paddw m1, m3 psraw m3, m4, 15 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m512] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*3], m0 por m1, m3 mova [rsp+16*2], 
m1 movu m0, [tlq+r4*2-46] movu m3, [tlq+r4*2-48] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-62] paddw m0, m3 movu m3, [tlq+r4*2-64] psubw m3, m1 pmulhrsw m3, m2 movddup m2, [base+pw_m1024] paddw m1, m3 movddup m3, [base+pw_m1536] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+16*1], m0 por m1, m3 mova [rsp+16*0], m1 dec wd jz .h32_transpose add r5, dyq jg .h32_loop .h32_end_loop: sub rsp, 16*4 REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0 dec wd jg .h32_end_loop .h32_transpose: or r3d, 32*2 jmp .end_transpose .h64: lea r4d, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main call .filter_copy call .filter_edge_s3 .h64_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 paddw m4, m5, [base+z_base_inc_z2] psubw m4, m3 .h64_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2- 14] movu m3, [tlq+r4*2- 16] pand m2, m7, m4 psllw m2, 9 psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 30] paddw m0, m3 movu m3, [tlq+r4*2- 32] psubw m3, m1 pmulhrsw m3, m2 sub rsp, 16*8 paddw m1, m3 psraw m3, m4, 15 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m512] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*7], m0 por m1, m3 mova [rsp+16*6], m1 movu m0, [tlq+r4*2- 46] movu m3, [tlq+r4*2- 48] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 62] paddw m0, m3 movu m3, [tlq+r4*2- 64] psubw m3, m1 pmulhrsw m3, m2 paddw m1, m3 movddup m3, [base+pw_m1024] pcmpgtw m3, m4 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m1536] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*5], m0 por m1, m3 mova [rsp+16*4], m1 movu m0, [tlq+r4*2- 78] movu m3, [tlq+r4*2- 80] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 94] paddw m0, m3 movu m3, [tlq+r4*2- 96] psubw m3, m1 pmulhrsw m3, m2 paddw m1, m3 movddup m3, [base+pw_m2048] pcmpgtw m3, m4 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m2560] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*3], m0 por m1, m3 mova [rsp+16*2], m1 movu m0, [tlq+r4*2-110] movu m3, [tlq+r4*2-112] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-126] paddw m0, m3 movu m3, [tlq+r4*2-128] psubw m3, m1 pmulhrsw m3, m2 movddup m2, [base+pw_m3072] paddw m1, m3 movddup m3, [base+pw_m3584] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+16*1], m0 por m1, m3 mova [rsp+16*0], m1 dec wd jz .h64_transpose add r5, dyq jg .h64_loop .h64_end_loop: sub rsp, 16*8 REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0 dec wd jg .h64_end_loop .h64_transpose: add r3d, 64*2 .end_transpose: %if ARCH_X86_64 lea r7, [strideq*3] %else mov strideq, [dstq+4*0] mov org_wd, [dstq+4*1] %endif lea r4d, [r3*3] .end_transpose_loop: lea r2, [rsp+r3-8] lea r6, [dstq+org_wq*2-8] .end_transpose_loop_y: movq m0, [r2+r4 ] movq m1, [r2+r3*2] movq m2, [r2+r3*1] movq m3, [r2+r3*0] sub r2, 8 punpcklwd m0, m1 punpcklwd m2, m3 punpckhdq m1, m0, m2 punpckldq m0, m2 movhps [r6+strideq*0], m1 movq [r6+strideq*1], m1 %if ARCH_X86_64 movhps [r6+strideq*2], m0 movq [r6+r7 ], m0 lea r6, [r6+strideq*4] %else lea r6, [r6+strideq*2] movhps [r6+strideq*0], m0 movq [r6+strideq*1], m0 lea r6, [r6+strideq*2] %endif cmp r2, rsp jae .end_transpose_loop_y lea rsp, [rsp+r3*4] sub org_wd, 4 jg .end_transpose_loop RET .filter_copy: neg r4 pshuflw m2, [tlq+2], q0000 xor r5d, r5d pshuflw m3, [tlq+r4*2], q0000 movq [rsp+gprsize+16*17], m2 .filter_copy_loop: mova m1, [tlq+r5*2-16*1+2] mova m2, 
[tlq+r5*2-16*2+2] sub r5, 16 mova [rsp+r5*2+gprsize+16*18], m1 mova [rsp+r5*2+gprsize+16*17], m2 cmp r5d, r4d jg .filter_copy_loop lea tlq, [rsp+gprsize+16*17-2] movq [tlq+r4*2-8], m3 ret .filter_edge: cmp r5d, 3 je .filter_edge_s3 movddup m4, [base+z_filt_k+r5*8-8] movddup m5, [base+z_filt_k+r5*8+8] xor r5d, r5d movddup m6, [base+pw_8] movu m2, [tlq-12] jmp .filter_edge_start .filter_edge_loop: movu m2, [tlq+r5*2-12] mova [tlq+r5*2+2], m1 .filter_edge_start: pmullw m1, m4, [tlq+r5*2-14] movu m3, [tlq+r5*2-16] sub r5, 8 paddw m2, m3 pmullw m2, m5 paddw m1, m6 paddw m1, m2 psrlw m1, 4 cmp r5d, r4d jg .filter_edge_loop mova [tlq+r5*2+2], m1 neg r4d ret .filter_edge_s3: movddup m5, [base+pw_3] xor r5d, r5d movu m2, [tlq-12] movu m3, [tlq-10] jmp .filter_edge_s3_start .filter_edge_s3_loop: movu m2, [tlq+r5*2-12] movu m3, [tlq+r5*2-10] mova [tlq+r5*2+2], m1 .filter_edge_s3_start: paddw m2, [tlq+r5*2-14] paddw m3, m5 movu m1, [tlq+r5*2-16] movu m4, [tlq+r5*2-18] sub r5, 8 paddw m1, m2 pavgw m3, m4 paddw m1, m3 psrlw m1, 2 cmp r5d, r4d jg .filter_edge_s3_loop mova [tlq+r5*2+2], m1 neg r4d ret %if ARCH_X86_64 cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter %else cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter %define m8 [esp+16*0] %define m9 [esp+16*1] %define m10 [esp+16*2] %define m11 [esp+16*3] %define m12 [esp+16*4] %define m13 [esp+16*5] %define m14 [esp+16*6] %define m15 [esp+16*7] %endif %define base r6-$$ movifnidn hd, hm movd m6, r8m ; bitdepth_max %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif LEA r6, $$ shl filterd, 6 movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 mova m1, [base+filter_intra_taps+filterq+16*0] mova m2, [base+filter_intra_taps+filterq+16*1] mova m3, [base+filter_intra_taps+filterq+16*2] mova m4, [base+filter_intra_taps+filterq+16*3] pxor m5, m5 %if ARCH_X86_64 punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid punpcklbw m10, m5, m2 ; having to perform sign-extension. 
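; (Note: with the taps in the high byte, every pmaddwd product comes out
; scaled by 1<<8, which the later psrad by 11 instead of 3 undoes.)
; FILTER_PRED works on 4x2 sub-blocks: each sub-block is a 7-tap
; combination of its top-left, four top and two left neighbours, and
; already-predicted pixels feed the sub-blocks below and to the right.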
.filter_edge:
    cmp        r5d, 3
    je .filter_edge_s3
    movddup    m4, [base+z_filt_k+r5*8-8]
    movddup    m5, [base+z_filt_k+r5*8+8]
    xor        r5d, r5d
    movddup    m6, [base+pw_8]
    movu       m2, [tlq-12]
    jmp .filter_edge_start
.filter_edge_loop:
    movu       m2, [tlq+r5*2-12]
    mova       [tlq+r5*2+2], m1
.filter_edge_start:
    pmullw     m1, m4, [tlq+r5*2-14]
    movu       m3, [tlq+r5*2-16]
    sub        r5, 8
    paddw      m2, m3
    pmullw     m2, m5
    paddw      m1, m6
    paddw      m1, m2
    psrlw      m1, 4
    cmp        r5d, r4d
    jg .filter_edge_loop
    mova       [tlq+r5*2+2], m1
    neg        r4d
    ret
.filter_edge_s3:
    movddup    m5, [base+pw_3]
    xor        r5d, r5d
    movu       m2, [tlq-12]
    movu       m3, [tlq-10]
    jmp .filter_edge_s3_start
.filter_edge_s3_loop:
    movu       m2, [tlq+r5*2-12]
    movu       m3, [tlq+r5*2-10]
    mova       [tlq+r5*2+2], m1
.filter_edge_s3_start:
    paddw      m2, [tlq+r5*2-14]
    paddw      m3, m5
    movu       m1, [tlq+r5*2-16]
    movu       m4, [tlq+r5*2-18]
    sub        r5, 8
    paddw      m1, m2
    pavgw      m3, m4
    paddw      m1, m3
    psrlw      m1, 2
    cmp        r5d, r4d
    jg .filter_edge_s3_loop
    mova       [tlq+r5*2+2], m1
    neg        r4d
    ret

%if ARCH_X86_64
cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
%else
cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
%define m8  [esp+16*0]
%define m9  [esp+16*1]
%define m10 [esp+16*2]
%define m11 [esp+16*3]
%define m12 [esp+16*4]
%define m13 [esp+16*5]
%define m14 [esp+16*6]
%define m15 [esp+16*7]
%endif
%define base r6-$$
    movifnidn  hd, hm
    movd       m6, r8m ; bitdepth_max
%ifidn filterd, filterm
    movzx      filterd, filterb
%else
    movzx      filterd, byte filterm
%endif
    LEA        r6, $$
    shl        filterd, 6
    movu       m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
    mova       m1, [base+filter_intra_taps+filterq+16*0]
    mova       m2, [base+filter_intra_taps+filterq+16*1]
    mova       m3, [base+filter_intra_taps+filterq+16*2]
    mova       m4, [base+filter_intra_taps+filterq+16*3]
    pxor       m5, m5
%if ARCH_X86_64
    punpcklbw  m8, m5, m1 ; place 8-bit coefficients in the upper
    punpckhbw  m9, m5, m1 ; half of each 16-bit word to avoid
    punpcklbw  m10, m5, m2 ; having to perform sign-extension.
    punpckhbw  m11, m5, m2
    punpcklbw  m12, m5, m3
    punpckhbw  m13, m5, m3
    punpcklbw  m14, m5, m4
    punpckhbw  m15, m5, m4
%else
    punpcklbw  m7, m5, m1
    mova       m8, m7
    punpckhbw  m7, m5, m1
    mova       m9, m7
    punpcklbw  m7, m5, m2
    mova       m10, m7
    punpckhbw  m7, m5, m2
    mova       m11, m7
    punpcklbw  m7, m5, m3
    mova       m12, m7
    punpckhbw  m7, m5, m3
    mova       m13, m7
    punpcklbw  m7, m5, m4
    mova       m14, m7
    punpckhbw  m7, m5, m4
    mova       m15, m7
%endif
    mova       m7, [base+filter_shuf]
    add        hd, hd
    mov        r5, dstq
    pshuflw    m6, m6, q0000
    mov        r6, tlq
    punpcklqdq m6, m6
    sub        tlq, hq
.left_loop:
    pshufb     m0, m7 ; tl t0 t1 t2 t3 l0 l1 __
    pshufd     m1, m0, q0000
    pmaddwd    m2, m8, m1
    pmaddwd    m1, m9
    pshufd     m4, m0, q1111
    pmaddwd    m3, m10, m4
    pmaddwd    m4, m11
    paddd      m2, m3
    paddd      m1, m4
    pshufd     m4, m0, q2222
    pmaddwd    m3, m12, m4
    pmaddwd    m4, m13
    paddd      m2, m3
    paddd      m1, m4
    pshufd     m3, m0, q3333
    pmaddwd    m0, m14, m3
    pmaddwd    m3, m15
    paddd      m0, m2
    paddd      m1, m3
    psrad      m0, 11 ; x >> 3
    psrad      m1, 11
    packssdw   m0, m1
    pmaxsw     m0, m5
    pavgw      m0, m5 ; (x + 8) >> 4
    pminsw     m0, m6
    movq       [dstq+strideq*0], m0
    movhps     [dstq+strideq*1], m0
    movlps     m0, [tlq+hq-10]
    lea        dstq, [dstq+strideq*2]
    sub        hd, 2*2
    jg .left_loop
    sub        wd, 4
    jz .end
    sub        tld, r6d ; -h*2
    sub        r6, r5 ; tl-dst
.right_loop0:
    add        r5, 8
    mov        hd, tld
    movu       m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
    mov        dstq, r5
.right_loop:
    pshufd     m2, m0, q0000
    pmaddwd    m1, m8, m2
    pmaddwd    m2, m9
    pshufd     m4, m0, q1111
    pmaddwd    m3, m10, m4
    pmaddwd    m4, m11
    pinsrw     m0, [dstq+strideq*0-2], 5
    paddd      m1, m3
    paddd      m2, m4
    pshufd     m0, m0, q2222
    movddup    m4, [dstq+strideq*1-8]
    pmaddwd    m3, m12, m0
    pmaddwd    m0, m13
    paddd      m1, m3
    paddd      m0, m2
    pshuflw    m2, m4, q3333
    punpcklwd  m2, m5
    pmaddwd    m3, m14, m2
    pmaddwd    m2, m15
    paddd      m1, m3
    paddd      m0, m2
    psrad      m1, 11
    psrad      m0, 11
    packssdw   m0, m1
    pmaxsw     m0, m5
    pavgw      m0, m5
    pminsw     m0, m6
    movhps     [dstq+strideq*0], m0
    movq       [dstq+strideq*1], m0
    palignr    m0, m4, 14
    lea        dstq, [dstq+strideq*2]
    add        hd, 2*2
    jl .right_loop
    sub        wd, 4
    jg .right_loop0
.end:
    RET

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
    LEA        t0, ipred_cfl_left_16bpc_ssse3_table
    movd       m4, wd
    tzcnt      wd, wd
    movifnidn  hd, hm
    add        tlq, 2
    movsxd     r6, [t0+wq*4]
    movd       m5, wd
    jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)

cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn  hd, hm
    LEA        t0, ipred_cfl_left_16bpc_ssse3_table
    tzcnt      wd, wm
    lea        r6d, [hq*2]
    movd       m4, hd
    sub        tlq, r6
    tzcnt      r6d, hd
    movd       m5, r6d
    movsxd     r6, [t0+r6*4]
.start:
    movd       m7, r7m
    movu       m0, [tlq]
    add        r6, t0
    add        t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
    movsxd     wq, [t0+wq*4]
    pxor       m6, m6
    pshuflw    m7, m7, q0000
    pcmpeqw    m3, m3
    add        wq, t0
    movifnidn  acq, acmp
    pavgw      m4, m6
    punpcklqdq m7, m7
    jmp        r6
.h32:
    movu       m1, [tlq+48]
    movu       m2, [tlq+32]
    paddw      m0, m1
    paddw      m0, m2
.h16:
    movu       m1, [tlq+16]
    paddw      m0, m1
.h8:
    pshufd     m1, m0, q1032
    paddw      m0, m1
.h4:
    pmaddwd    m0, m3
    psubd      m4, m0
    pshuflw    m0, m4, q1032
    paddd      m0, m4
    psrld      m0, m5
    pshuflw    m0, m0, q0000
    punpcklqdq m0, m0
    jmp        wq
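; CfL prediction: dst = clip(dc + ((alpha * ac + 32) >> 6), 0, bitdepth_max).
; m0 = dc, m1 = alpha, m2 = |alpha| << 9 (pmulhrsw by this value implements
; the rounded shift by 6), m6 = 0, m7 = bitdepth_max. The multiply is done on
; absolute values, with psignw reapplying the sign afterwards, which keeps
; the rounding symmetric around zero.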
%macro IPRED_CFL 2 ; dst, src
    pabsw      m%1, m%2
    pmulhrsw   m%1, m2
    psignw     m%2, m1
    psignw     m%1, m%2
    paddw      m%1, m0
    pmaxsw     m%1, m6
    pminsw     m%1, m7
%endmacro

cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn  hd, hm
    tzcnt      r6d, hd
    lea        t0d, [wq+hq]
    movd       m4, t0d
    tzcnt      t0d, t0d
    movd       m5, t0d
    LEA        t0, ipred_cfl_16bpc_ssse3_table
    tzcnt      wd, wd
    movd       m7, r7m
    movsxd     r6, [t0+r6*4]
    movsxd     wq, [t0+wq*4+4*4]
    psrlw      m4, 1
    pxor       m6, m6
    pshuflw    m7, m7, q0000
    add        r6, t0
    add        wq, t0
    movifnidn  acq, acmp
    pcmpeqw    m3, m3
    punpcklqdq m7, m7
    jmp        r6
.h4:
    movq       m0, [tlq-8]
    jmp        wq
.w4:
    movq       m1, [tlq+2]
    paddw      m0, m1
    pmaddwd    m0, m3
    psubd      m4, m0
    pshufd     m0, m4, q1032
    paddd      m0, m4
    pshuflw    m4, m0, q1032
    paddd      m0, m4
    cmp        hd, 4
    jg .w4_mul
    psrld      m0, 3
    jmp .w4_end
.w4_mul:
    ; w+h is not a power of two here; divide via fixed-point reciprocals:
    ; 0xAAAB ~= 2^17/3, 0x6667 ~= 2^18/10
    mov        r6d, 0xAAAB
    mov        r2d, 0x6667
    cmp        hd, 16
    cmove      r6d, r2d
    movd       m1, r6d
    psrld      m0, 2
    pmulhuw    m0, m1
    psrlw      m0, 1
.w4_end:
    pshuflw    m0, m0, q0000
    punpcklqdq m0, m0
.s4:
    movd       m1, alpham
    lea        r6, [strideq*3]
    pshuflw    m1, m1, q0000
    punpcklqdq m1, m1
    pabsw      m2, m1
    psllw      m2, 9
.s4_loop:
    mova       m4, [acq+16*0]
    mova       m5, [acq+16*1]
    add        acq, 16*2
    IPRED_CFL  3, 4
    IPRED_CFL  4, 5
    movq       [dstq+strideq*0], m3
    movhps     [dstq+strideq*1], m3
    movq       [dstq+strideq*2], m4
    movhps     [dstq+r6       ], m4
    lea        dstq, [dstq+strideq*4]
    sub        hd, 4
    jg .s4_loop
    RET
.h8:
    mova       m0, [tlq-16]
    jmp        wq
.w8:
    movu       m1, [tlq+2]
    paddw      m0, m1
    pmaddwd    m0, m3
    psubd      m4, m0
    pshufd     m0, m4, q1032
    paddd      m0, m4
    pshuflw    m4, m0, q1032
    paddd      m0, m4
    psrld      m0, m5
    cmp        hd, 8
    je .w8_end
    mov        r6d, 0xAAAB
    mov        r2d, 0x6667
    cmp        hd, 32
    cmove      r6d, r2d
    movd       m1, r6d
    pmulhuw    m0, m1
    psrlw      m0, 1
.w8_end:
    pshuflw    m0, m0, q0000
    punpcklqdq m0, m0
.s8:
    movd       m1, alpham
    pshuflw    m1, m1, q0000
    punpcklqdq m1, m1
    pabsw      m2, m1
    psllw      m2, 9
.s8_loop:
    mova       m4, [acq+16*0]
    mova       m5, [acq+16*1]
    add        acq, 16*2
    IPRED_CFL  3, 4
    IPRED_CFL  4, 5
    mova       [dstq+strideq*0], m3
    mova       [dstq+strideq*1], m4
    lea        dstq, [dstq+strideq*2]
    sub        hd, 2
    jg .s8_loop
    RET
.h16:
    mova       m0, [tlq-32]
    paddw      m0, [tlq-16]
    jmp        wq
.w16:
    movu       m1, [tlq+ 2]
    movu       m2, [tlq+18]
    paddw      m1, m2
    paddw      m0, m1
    pmaddwd    m0, m3
    psubd      m4, m0
    pshufd     m0, m4, q1032
    paddd      m0, m4
    pshuflw    m4, m0, q1032
    paddd      m0, m4
    psrld      m0, m5
    cmp        hd, 16
    je .w16_end
    mov        r6d, 0xAAAB
    mov        r2d, 0x6667
    test       hd, 8|32
    cmovz      r6d, r2d
    movd       m1, r6d
    pmulhuw    m0, m1
    psrlw      m0, 1
.w16_end:
    pshuflw    m0, m0, q0000
    punpcklqdq m0, m0
.s16:
    movd       m1, alpham
    pshuflw    m1, m1, q0000
    punpcklqdq m1, m1
    pabsw      m2, m1
    psllw      m2, 9
.s16_loop:
    mova       m4, [acq+16*0]
    mova       m5, [acq+16*1]
    add        acq, 16*2
    IPRED_CFL  3, 4
    IPRED_CFL  4, 5
    mova       [dstq+16*0], m3
    mova       [dstq+16*1], m4
    add        dstq, strideq
    dec        hd
    jg .s16_loop
    RET
.h32:
    mova       m0, [tlq-64]
    paddw      m0, [tlq-48]
    paddw      m0, [tlq-32]
    paddw      m0, [tlq-16]
    jmp        wq
.w32:
    movu       m1, [tlq+ 2]
    movu       m2, [tlq+18]
    paddw      m1, m2
    movu       m2, [tlq+34]
    paddw      m1, m2
    movu       m2, [tlq+50]
    paddw      m1, m2
    paddw      m0, m1
    pmaddwd    m0, m3
    psubd      m4, m0
    pshufd     m0, m4, q1032
    paddd      m0, m4
    pshuflw    m4, m0, q1032
    paddd      m0, m4
    psrld      m0, m5
    cmp        hd, 32
    je .w32_end
    mov        r6d, 0xAAAB
    mov        r2d, 0x6667
    cmp        hd, 8
    cmove      r6d, r2d
    movd       m1, r6d
    pmulhuw    m0, m1
    psrlw      m0, 1
.w32_end:
    pshuflw    m0, m0, q0000
    punpcklqdq m0, m0
.s32:
    movd       m1, alpham
    pshuflw    m1, m1, q0000
    punpcklqdq m1, m1
    pabsw      m2, m1
    psllw      m2, 9
.s32_loop:
    mova       m4, [acq+16*0]
    mova       m5, [acq+16*1]
    IPRED_CFL  3, 4
    IPRED_CFL  4, 5
    mova       [dstq+16*0], m3
    mova       [dstq+16*1], m4
    mova       m4, [acq+16*2]
    mova       m5, [acq+16*3]
    add        acq, 16*4
    IPRED_CFL  3, 4
    IPRED_CFL  4, 5
    mova       [dstq+16*2], m3
    mova       [dstq+16*3], m4
    add        dstq, strideq
    dec        hd
    jg .s32_loop
    RET

cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
    tzcnt      wd, wm
    LEA        t0, ipred_cfl_splat_16bpc_ssse3_table
    mov        r6d, r7m
    movifnidn  hd, hm
    shr        r6d, 11
    movd       m7, r7m
    movsxd     wq, [t0+wq*4]
    movddup    m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
    pshuflw    m7, m7, q0000
    pxor       m6, m6
    add        wq, t0
    movifnidn  acq, acmp
    punpcklqdq m7, m7
    jmp        wq
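; The ipred_cfl_ac_* functions below store the downsampled luma ("AC") values
; in q3 fixed-point format (420: 2x2 sums * 2, 422: horizontal pair sums * 4,
; 444: samples << 3) while accumulating the running sum in m4; the shared .dc
; tail then subtracts the rounded block average so the AC buffer is zero-mean.
; wpad/hpad replicate the rightmost column / bottom row into the padded area,
; so the padding also contributes to the average.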
cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn  hpadd, hpadm
%if ARCH_X86_32 && PIC
    pcmpeqw    m5, m5
    pabsw      m5, m5
    paddw      m5, m5
%else
    movddup    m5, [pw_2]
%endif
    mov        hd, hm
    shl        hpadd, 2
    pxor       m4, m4
    sub        hd, hpadd
    cmp        dword wm, 8
    mov        r5, acq
    jg .w16
    je .w8
    lea        r3, [strideq*3]
.w4_loop:
    pmaddwd    m0, m5, [ypxq+strideq*0]
    pmaddwd    m1, m5, [ypxq+strideq*1]
    pmaddwd    m2, m5, [ypxq+strideq*2]
    pmaddwd    m3, m5, [ypxq+r3       ]
    lea        ypxq, [ypxq+strideq*4]
    paddd      m0, m1
    paddd      m2, m3
    paddd      m4, m0
    packssdw   m0, m2
    paddd      m4, m2
    mova       [acq], m0
    add        acq, 16
    sub        hd, 2
    jg .w4_loop
    test       hpadd, hpadd
    jz .dc
    punpckhqdq m0, m0
    pslld      m2, 2
.w4_hpad:
    mova       [acq+16*0], m0
    paddd      m4, m2
    mova       [acq+16*1], m0
    add        acq, 16*2
    sub        hpadd, 4
    jg .w4_hpad
    jmp .dc
.w8:
%if ARCH_X86_32
    cmp        dword wpadm, 0
%else
    test       wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    pmaddwd    m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd    m2, m5, [ypxq+strideq*1+16*0]
    pmaddwd    m1, m5, [ypxq+strideq*0+16*1]
    pmaddwd    m3, m5, [ypxq+strideq*1+16*1]
    lea        ypxq, [ypxq+strideq*2]
    paddd      m0, m2
    paddd      m1, m3
    paddd      m2, m0, m1
    packssdw   m0, m1
    paddd      m4, m2
    mova       [acq], m0
    add        acq, 16
    dec        hd
    jg .w8_loop
.w8_hpad:
    test       hpadd, hpadd
    jz .dc
    pslld      m2, 2
    mova       m1, m0
    jmp .hpad
.w8_wpad1:
    pmaddwd    m0, m5, [ypxq+strideq*0]
    pmaddwd    m1, m5, [ypxq+strideq*1]
    lea        ypxq, [ypxq+strideq*2]
    paddd      m0, m1
    pshufd     m1, m0, q3333
    paddd      m2, m0, m1
    packssdw   m0, m1
    paddd      m4, m2
    mova       [acq], m0
    add        acq, 16
    dec        hd
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad3:
    pshufd     m3, m0, q3333
    mova       m1, m3
    mova       m2, m3
    jmp .w16_wpad_end
.w16_wpad2:
    pshufd     m1, m3, q3333
    mova       m2, m1
    jmp .w16_wpad_end
.w16_wpad1:
    pshufd     m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn  wpadd, wpadm
    WIN64_SPILL_XMM 7
.w16_loop:
    pmaddwd    m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd    m6, m5, [ypxq+strideq*1+16*0]
    paddd      m0, m6
    cmp        wpadd, 2
    jg .w16_wpad3
    pmaddwd    m3, m5, [ypxq+strideq*0+16*1]
    pmaddwd    m6, m5, [ypxq+strideq*1+16*1]
    paddd      m3, m6
    je .w16_wpad2
    pmaddwd    m1, m5, [ypxq+strideq*0+16*2]
    pmaddwd    m6, m5, [ypxq+strideq*1+16*2]
    paddd      m1, m6
    jp .w16_wpad1
    pmaddwd    m2, m5, [ypxq+strideq*0+16*3]
    pmaddwd    m6, m5, [ypxq+strideq*1+16*3]
    paddd      m2, m6
.w16_wpad_end:
    lea        ypxq, [ypxq+strideq*2]
    paddd      m6, m0, m3
    packssdw   m0, m3
    paddd      m6, m1
    mova       [acq+16*0], m0
    packssdw   m1, m2
    paddd      m2, m6
    mova       [acq+16*1], m1
    add        acq, 16*2
    paddd      m4, m2
    dec        hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add        hpadd, hpadd
    jz .dc
    paddd      m2, m2
.hpad:
    mova       [acq+16*0], m0
    mova       [acq+16*1], m1
    paddd      m4, m2
    mova       [acq+16*2], m0
    mova       [acq+16*3], m1
    add        acq, 16*4
    sub        hpadd, 4
    jg .hpad
.dc:
    sub        r5, acq ; -w*h*2
    pshufd     m2, m4, q1032
    tzcnt      r1d, r5d
    paddd      m2, m4
    sub        r1d, 2
    pshufd     m4, m2, q2301
    movd       m0, r1d
    paddd      m2, m4
    psrld      m2, m0
    pxor       m0, m0
    pavgw      m2, m0
    packssdw   m2, m2
.dc_loop:
    mova       m0, [acq+r5+16*0]
    mova       m1, [acq+r5+16*1]
    psubw      m0, m2
    psubw      m1, m2
    mova       [acq+r5+16*0], m0
    mova       [acq+r5+16*1], m1
    add        r5, 16*2
    jl .dc_loop
    RET
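; 422 uses the same scheme with no vertical subsampling: pw_4 weights on
; horizontal pairs instead of pw_2 on 2x2 blocks, and the .dc/.hpad tails of
; the 420 version are reused via tail jumps.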
cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn  hpadd, hpadm
%if ARCH_X86_32 && PIC
    pcmpeqw    m5, m5
    pabsw      m5, m5
    psllw      m5, 2
%else
    movddup    m5, [pw_4]
%endif
    mov        hd, hm
    shl        hpadd, 2
    pxor       m4, m4
    sub        hd, hpadd
    cmp        dword wm, 8
    mov        r5, acq
    jg .w16
    je .w8
    lea        r3, [strideq*3]
.w4_loop:
    pmaddwd    m0, m5, [ypxq+strideq*0]
    pmaddwd    m3, m5, [ypxq+strideq*1]
    pmaddwd    m1, m5, [ypxq+strideq*2]
    pmaddwd    m2, m5, [ypxq+r3       ]
    lea        ypxq, [ypxq+strideq*4]
    paddd      m4, m0
    packssdw   m0, m3
    paddd      m3, m1
    packssdw   m1, m2
    paddd      m4, m2
    paddd      m4, m3
    mova       [acq+16*0], m0
    mova       [acq+16*1], m1
    add        acq, 16*2
    sub        hd, 4
    jg .w4_loop
    test       hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    punpckhqdq m1, m1
    pslld      m2, 3
    mova       [acq+16*0], m1
    mova       [acq+16*1], m1
    paddd      m4, m2
    mova       [acq+16*2], m1
    mova       [acq+16*3], m1
    add        acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
%if ARCH_X86_32
    cmp        dword wpadm, 0
%else
    test       wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    pmaddwd    m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd    m2, m5, [ypxq+strideq*0+16*1]
    pmaddwd    m1, m5, [ypxq+strideq*1+16*0]
    pmaddwd    m3, m5, [ypxq+strideq*1+16*1]
    lea        ypxq, [ypxq+strideq*2]
    paddd      m4, m0
    packssdw   m0, m2
    paddd      m4, m2
    mova       [acq+16*0], m0
    paddd      m2, m1, m3
    packssdw   m1, m3
    paddd      m4, m2
    mova       [acq+16*1], m1
    add        acq, 16*2
    sub        hd, 2
    jg .w8_loop
.w8_hpad:
    test       hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld      m2, 2
    mova       m0, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w8_wpad1:
    pmaddwd    m0, m5, [ypxq+strideq*0]
    pmaddwd    m1, m5, [ypxq+strideq*1]
    lea        ypxq, [ypxq+strideq*2]
    pshufd     m2, m0, q3333
    pshufd     m3, m1, q3333
    paddd      m4, m0
    packssdw   m0, m2
    paddd      m4, m2
    paddd      m2, m1, m3
    packssdw   m1, m3
    paddd      m4, m2
    mova       [acq+16*0], m0
    mova       [acq+16*1], m1
    add        acq, 16*2
    sub        hd, 2
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad3:
    pshufd     m3, m0, q3333
    mova       m1, m3
    mova       m2, m3
    jmp .w16_wpad_end
.w16_wpad2:
    pshufd     m1, m3, q3333
    mova       m2, m1
    jmp .w16_wpad_end
.w16_wpad1:
    pshufd     m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn  wpadd, wpadm
    WIN64_SPILL_XMM 7
.w16_loop:
    pmaddwd    m0, m5, [ypxq+16*0]
    cmp        wpadd, 2
    jg .w16_wpad3
    pmaddwd    m3, m5, [ypxq+16*1]
    je .w16_wpad2
    pmaddwd    m1, m5, [ypxq+16*2]
    jp .w16_wpad1
    pmaddwd    m2, m5, [ypxq+16*3]
.w16_wpad_end:
    add        ypxq, strideq
    paddd      m6, m0, m3
    packssdw   m0, m3
    mova       [acq+16*0], m0
    paddd      m6, m1
    packssdw   m1, m2
    paddd      m2, m6
    mova       [acq+16*1], m1
    add        acq, 16*2
    paddd      m4, m2
    dec        hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add        hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd      m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
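; 444 performs no subsampling at all; luma is copied at full resolution and
; shifted left by 3 to match the q3 AC format of the 420/422 paths.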
cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
    LEA        r6, ipred_cfl_ac_444_16bpc_ssse3_table
    tzcnt      wd, wm
    movifnidn  hpadd, hpadm
    pxor       m4, m4
    movsxd     wq, [r6+wq*4]
    movddup    m5, [base+pw_1]
    add        wq, r6
    mov        hd, hm
    shl        hpadd, 2
    sub        hd, hpadd
    jmp        wq
.w4:
    lea        r3, [strideq*3]
    mov        r5, acq
.w4_loop:
    movq       m0, [ypxq+strideq*0]
    movhps     m0, [ypxq+strideq*1]
    movq       m1, [ypxq+strideq*2]
    movhps     m1, [ypxq+r3       ]
    lea        ypxq, [ypxq+strideq*4]
    psllw      m0, 3
    psllw      m1, 3
    mova       [acq+16*0], m0
    pmaddwd    m0, m5
    mova       [acq+16*1], m1
    pmaddwd    m2, m5, m1
    add        acq, 16*2
    paddd      m4, m0
    paddd      m4, m2
    sub        hd, 4
    jg .w4_loop
    test       hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    punpckhqdq m1, m1
    mova       [acq+16*0], m1
    pslld      m2, 2
    mova       [acq+16*1], m1
    punpckhqdq m2, m2
    mova       [acq+16*2], m1
    paddd      m4, m2
    mova       [acq+16*3], m1
    add        acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
    mov        r5, acq
.w8_loop:
    mova       m0, [ypxq+strideq*0]
    mova       m1, [ypxq+strideq*1]
    lea        ypxq, [ypxq+strideq*2]
    psllw      m0, 3
    psllw      m1, 3
    mova       [acq+16*0], m0
    pmaddwd    m0, m5
    mova       [acq+16*1], m1
    pmaddwd    m2, m5, m1
    add        acq, 16*2
    paddd      m4, m0
    paddd      m4, m2
    sub        hd, 2
    jg .w8_loop
.w8_hpad:
    test       hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld      m2, 2
    mova       m0, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w16_wpad2:
    pshufhw    m3, m2, q3333
    pshufhw    m1, m0, q3333
    punpckhqdq m3, m3
    punpckhqdq m1, m1
    jmp .w16_wpad_end
.w16:
    movifnidn  wpadd, wpadm
    mov        r5, acq
.w16_loop:
    mova       m2, [ypxq+strideq*0+16*0]
    mova       m0, [ypxq+strideq*1+16*0]
    psllw      m2, 3
    psllw      m0, 3
    test       wpadd, wpadd
    jnz .w16_wpad2
    mova       m3, [ypxq+strideq*0+16*1]
    mova       m1, [ypxq+strideq*1+16*1]
    psllw      m3, 3
    psllw      m1, 3
.w16_wpad_end:
    lea        ypxq, [ypxq+strideq*2]
    mova       [acq+16*0], m2
    pmaddwd    m2, m5
    mova       [acq+16*1], m3
    pmaddwd    m3, m5
    paddd      m4, m2
    pmaddwd    m2, m5, m0
    mova       [acq+16*2], m0
    paddd      m4, m3
    pmaddwd    m3, m5, m1
    mova       [acq+16*3], m1
    add        acq, 16*4
    paddd      m2, m3
    paddd      m4, m2
    sub        hd, 2
    jg .w16_loop
    add        hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd      m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w32_wpad6:
    pshufhw    m1, m0, q3333
    punpckhqdq m1, m1
    mova       m2, m1
    mova       m3, m1
    jmp .w32_wpad_end
.w32_wpad4:
    pshufhw    m2, m1, q3333
    punpckhqdq m2, m2
    mova       m3, m2
    jmp .w32_wpad_end
.w32_wpad2:
    pshufhw    m3, m2, q3333
    punpckhqdq m3, m3
    jmp .w32_wpad_end
.w32:
    movifnidn  wpadd, wpadm
    mov        r5, acq
    WIN64_SPILL_XMM 8
.w32_loop:
    mova       m0, [ypxq+16*0]
    psllw      m0, 3
    cmp        wpadd, 4
    jg .w32_wpad6
    mova       m1, [ypxq+16*1]
    psllw      m1, 3
    je .w32_wpad4
    mova       m2, [ypxq+16*2]
    psllw      m2, 3
    jnp .w32_wpad2
    mova       m3, [ypxq+16*3]
    psllw      m3, 3
.w32_wpad_end:
    add        ypxq, strideq
    pmaddwd    m6, m5, m0
    mova       [acq+16*0], m0
    pmaddwd    m7, m5, m1
    mova       [acq+16*1], m1
    paddd      m6, m7
    pmaddwd    m7, m5, m2
    mova       [acq+16*2], m2
    paddd      m6, m7
    pmaddwd    m7, m5, m3
    mova       [acq+16*3], m3
    add        acq, 16*4
    paddd      m6, m7
    paddd      m4, m6
    dec        hd
    jg .w32_loop
%if WIN64
    mova       m5, m6
    WIN64_RESTORE_XMM
    SWAP       5, 6
%endif
    test       hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w32_hpad_loop:
    mova       [acq+16*0], m0
    mova       [acq+16*1], m1
    paddd      m4, m6
    mova       [acq+16*2], m2
    mova       [acq+16*3], m3
    add        acq, 16*4
    dec        hpadd
    jg .w32_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
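; Palette prediction: pal_pred_shuf splits the eight 16-bit palette entries
; into a low-byte half (m4) and a high-byte half (m5) so that each pixel
; lookup is two pshufbs. The 4-bit packed indices don't need to be fully
; unpacked: pshufb only looks at index bits 0-3 (and bit 7), so the stray
; high nibble left in each byte after the punpck is harmless.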
cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
%define hd r2d
%endif
    mova       m4, [palq]
    LEA        r2, pal_pred_16bpc_ssse3_table
    tzcnt      wd, wm
    pshufb     m4, [base+pal_pred_shuf]
    movsxd     wq, [r2+wq*4]
    pshufd     m5, m4, q1032
    add        wq, r2
    movifnidn  hd, hm
    jmp        wq
.w4:
    movq       m0, [idxq]
    add        idxq, 8
    psrlw      m1, m0, 4
    punpcklbw  m0, m1
    pshufb     m1, m4, m0
    pshufb     m2, m5, m0
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    movq       [dstq+strideq*0], m0
    movhps     [dstq+strideq*1], m0
    lea        dstq, [dstq+strideq*2]
    movq       [dstq+strideq*0], m1
    movhps     [dstq+strideq*1], m1
    lea        dstq, [dstq+strideq*2]
    sub        hd, 4
    jg .w4
    RET
.w8:
    movu       m3, [idxq]
    add        idxq, 16
    psrlw      m1, m3, 4
    punpcklbw  m0, m3, m1
    punpckhbw  m3, m1
    pshufb     m1, m4, m0
    pshufb     m2, m5, m0
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+strideq*0], m0
    mova       [dstq+strideq*1], m1
    lea        dstq, [dstq+strideq*2]
    pshufb     m1, m4, m3
    pshufb     m2, m5, m3
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+strideq*0], m0
    mova       [dstq+strideq*1], m1
    lea        dstq, [dstq+strideq*2]
    sub        hd, 4
    jg .w8
    RET
.w16:
    movu       m3, [idxq]
    add        idxq, 16
    psrlw      m1, m3, 4
    punpcklbw  m0, m3, m1
    punpckhbw  m3, m1
    pshufb     m1, m4, m0
    pshufb     m2, m5, m0
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+ 0], m0
    mova       [dstq+16], m1
    pshufb     m1, m4, m3
    pshufb     m2, m5, m3
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+strideq+ 0], m0
    mova       [dstq+strideq+16], m1
    lea        dstq, [dstq+strideq*2]
    sub        hd, 2
    jg .w16
    RET
.w32:
    movu       m3, [idxq]
    add        idxq, 16
    psrlw      m1, m3, 4
    punpcklbw  m0, m3, m1
    punpckhbw  m3, m1
    pshufb     m1, m4, m0
    pshufb     m2, m5, m0
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+16*0], m0
    mova       [dstq+16*1], m1
    pshufb     m1, m4, m3
    pshufb     m2, m5, m3
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+16*2], m0
    mova       [dstq+16*3], m1
    add        dstq, strideq
    dec        hd
    jg .w32
    RET
.w64:
    movu       m3, [idxq+16*0]
    psrlw      m1, m3, 4
    punpcklbw  m0, m3, m1
    punpckhbw  m3, m1
    pshufb     m1, m4, m0
    pshufb     m2, m5, m0
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+16*0], m0
    mova       [dstq+16*1], m1
    pshufb     m1, m4, m3
    pshufb     m2, m5, m3
    movu       m3, [idxq+16*1]
    add        idxq, 32
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+16*2], m0
    mova       [dstq+16*3], m1
    psrlw      m1, m3, 4
    punpcklbw  m0, m3, m1
    punpckhbw  m3, m1
    pshufb     m1, m4, m0
    pshufb     m2, m5, m0
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+16*4], m0
    mova       [dstq+16*5], m1
    pshufb     m1, m4, m3
    pshufb     m2, m5, m3
    punpcklbw  m0, m1, m2
    punpckhbw  m1, m2
    mova       [dstq+16*6], m0
    mova       [dstq+16*7], m1
    add        dstq, strideq
    dec        hd
    jg .w64
    RET