; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

%macro DUP8 1-*
%rep %0
    times 8 dw %1
    %rotate 1
%endrep
%endmacro

pri_taps:  DUP8 4, 2, 3, 3
dir_table: db  1 * 32 + 0,  2 * 32 + 0
           db  1 * 32 + 0,  2 * 32 - 2
           db -1 * 32 + 2, -2 * 32 + 4
           db  0 * 32 + 2, -1 * 32 + 4
           db  0 * 32 + 2,  0 * 32 + 4
           db  0 * 32 + 2,  1 * 32 + 4
           db  1 * 32 + 2,  2 * 32 + 4
           db  1 * 32 + 0,  2 * 32 + 2
           db  1 * 32 + 0,  2 * 32 + 0
           db  1 * 32 + 0,  2 * 32 - 2
           db -1 * 32 + 2, -2 * 32 + 4
           db  0 * 32 + 2, -1 * 32 + 4
dir_shift: times 4 dw 0x4000
           times 4 dw 0x1000
pw_128:    times 4 dw 128
pw_2048:   times 8 dw 2048
pw_m16384: times 8 dw -16384

cextern cdef_dir_8bpc_ssse3.main
cextern cdef_dir_8bpc_sse4.main
cextern shufw_6543210x

SECTION .text

%if ARCH_X86_32
DECLARE_REG_TMP 5, 3
%elif WIN64
DECLARE_REG_TMP 8, 4
%else
DECLARE_REG_TMP 8, 6
%endif

%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
    mova            m8, [base+pw_2048]
%else
    DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
 %define m8  [base+pw_2048]
 %define m9  [rsp+16*1+gprsize]
 %define m10 [rsp+16*2+gprsize]
%endif
    movifnidn     prid, r5m
    movifnidn     secd, r6m
    test          prid, prid
    jz .sec_only
    movd            m6, r5m
%if ARCH_X86_32
    mov       [rsp+24], pridmpd
%endif
    bsr        pridmpd, prid
    lea           tmpd, [priq*4]
    cmp     dword r10m, 0x3ff  ; if (bpc == 10)
    cmove         prid, tmpd   ; pri <<= 2
    mov           tmpd, r8m    ; damping
    mov           dird, r7m
    and           prid, 16
    pshufb          m6, m7     ; splat
    lea           dirq, [base+dir_table+dirq*2]
    lea           priq, [base+pri_taps+priq*2]
    test          secd, secd
    jz .pri_only
    mova         [rsp], m6
    movd            m6, secd
    tzcnt         secd, secd
    sub        pridmpd, tmpd
    sub           tmpd, secd
    pshufb          m6, m7
    xor           secd, secd
    neg        pridmpd
    cmovs      pridmpd, secd
%if ARCH_X86_32
    mov  [pri_shift+4], secd
    mov  [sec_shift+4], secd
%endif
    mov  [pri_shift+0], pridmpq
    mov  [sec_shift+0], tmpq
    lea           tmpq, [px]
%if WIN64
    movaps         r4m, m9
    movaps         r6m, m10
%elif ARCH_X86_32
    mov        pridmpd, [rsp+24]
%endif
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
    movaps          m9, r4m
    movaps         m10, r6m
%endif
    jmp .end
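
; The .pri/.sec/.pri_sec entry points below all evaluate the CDEF constraint
; function from the AV1 spec on 8 pixels at a time:
;
;   constrain(diff, str, shift) = sign(diff) * min(|diff|, max(0, str - (|diff| >> shift)))
;
; with shift = damping - log2(str) (clamped to >= 0 for the primary strength),
; i.e. the values stored in pri_shift/sec_shift above. Each
; psubusw/pminsw/psignw triplet is one such evaluation: psubusw yields
; max(0, str - (|diff| >> shift)) via unsigned saturation, pminsw clamps it
; to |diff|, and psignw reapplies the sign of diff (zeroing lanes where
; diff == 0).
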
.pri_only:
    sub           tmpd, pridmpd
    cmovs         tmpd, secd
%if ARCH_X86_32
    mov        pridmpd, [rsp+24]
    mov  [pri_shift+4], secd
%endif
    mov  [pri_shift+0], tmpq
    lea           tmpq, [px]
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.end:
    RET
.sec_only:
    mov           tmpd, r8m ; damping
    movd            m6, r6m
    tzcnt         secd, secd
    mov           dird, r7m
    pshufb          m6, m7
    sub           tmpd, secd
    lea           dirq, [base+dir_table+dirq*2]
%if ARCH_X86_32
    mov  [sec_shift+4], prid
%endif
    mov  [sec_shift+0], tmpq
    lea           tmpq, [px]
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
    jmp .end
%if %1 == %2
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
 %else
    DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
 %endif
ALIGN function_align
.pri:
    movsx         offq, byte [dirq+4] ; off_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0p0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0p1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+5] ; off_k1
    psubw           m2, m1 ; diff_k0p0
    psubw           m3, m1 ; diff_k0p1
    pabsw           m4, m2 ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3 ; adiff_k0p1
    pminsw          m0, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m0, m2 ; constrain(diff_k0p0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1p0
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1p1
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    psubw           m4, m1 ; diff_k1p0
    psubw           m5, m1 ; diff_k1p1
    psignw          m2, m3 ; constrain(diff_k0p1)
    pabsw           m3, m4 ; adiff_k1p0
    paddw           m0, m2 ; constrain(diff_k0)
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k1p0)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    psignw          m4, m5 ; constrain(diff_k1p1)
    paddw           m7, m4 ; constrain(diff_k1)
    pmullw          m0, [priq+16*0] ; pri_tap_k0
    pmullw          m7, [priq+16*1] ; pri_tap_k1
    paddw           m0, m7 ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
ALIGN function_align
.sec:
    movsx         offq, byte [dirq+8] ; off1_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0s1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+0] ; off2_k0
    psubw           m2, m1 ; diff_k0s0
    psubw           m3, m1 ; diff_k0s1
    pabsw           m4, m2 ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3 ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2 ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k0s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k0s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+9] ; off1_k1
    psubw           m4, m1 ; diff_k0s2
    psubw           m5, m1 ; diff_k0s3
    psignw          m2, m3 ; constrain(diff_k0s1)
    pabsw           m3, m4 ; adiff_k0s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k1s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k1s1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+1] ; off2_k1
    paddw           m0, m7
    psignw          m4, m5 ; constrain(diff_k0s3)
    paddw           m0, m4 ; constrain(diff_k0)
    psubw           m2, m1 ; diff_k1s0
    psubw           m3, m1 ; diff_k1s1
    paddw           m0, m0 ; sec_tap_k0
    pabsw           m4, m2 ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3 ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2 ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    paddw           m0, m7
    psubw           m4, m1 ; diff_k1s2
    psubw           m5, m1 ; diff_k1s3
    psignw          m2, m3 ; constrain(diff_k1s1)
    pabsw           m3, m4 ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
    psignw          m4, m5 ; constrain(diff_k1s3)
    paddw           m0, m4 ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
ALIGN function_align
.pri_sec:
    movsx         offq, byte [dirq+8] ; off2_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0s1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+0] ; off3_k0
    pabsw           m4, m2
%if ARCH_X86_64
    pabsw          m10, m3
    pmaxsw          m9, m2, m3
    pminsw         m10, m4
%else
    pabsw           m7, m3
    pmaxsw          m5, m2, m3
    pminsw          m4, m7
    mova            m9, m5
    mova           m10, m4
%endif
    psubw           m2, m1 ; diff_k0s0
    psubw           m3, m1 ; diff_k0s1
    pabsw           m4, m2 ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3 ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2 ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k0s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k0s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+9] ; off2_k1
    pabsw           m7, m4
    psignw          m2, m3
    pabsw           m3, m5 ; constrain(diff_k0s1)
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m7
    pmaxsw          m9, m5
    pminsw         m10, m3
%else
    pminsw          m7, m10
    pminsw          m7, m3
    pmaxsw          m3, m9, m4
    pmaxsw          m3, m5
    mova           m10, m7
    mova            m9, m3
%endif
    psubw           m4, m1 ; diff_k0s2
    psubw           m5, m1 ; diff_k0s3
    paddw           m0, m2
    pabsw           m3, m4 ; adiff_k0s2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k1s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k1s1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+1] ; off3_k1
    paddw           m0, m7
    pabsw           m7, m2
    psignw          m4, m5 ; constrain(diff_k0s3)
    pabsw           m5, m3
%if ARCH_X86_64
    pmaxsw          m9, m2
    pminsw         m10, m7
    pmaxsw          m9, m3
    pminsw         m10, m5
%else
    pminsw          m7, m10
    pminsw          m7, m5
    pmaxsw          m5, m9, m2
    pmaxsw          m5, m3
    mova           m10, m7
    mova            m9, m5
%endif
    paddw           m0, m4 ; constrain(diff_k0)
    psubw           m2, m1 ; diff_k1s0
    psubw           m3, m1 ; diff_k1s1
    paddw           m0, m0 ; sec_tap_k0
    pabsw           m4, m2 ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3 ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2 ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+4] ; off1_k0
    paddw           m0, m7
    pabsw           m7, m4
    psignw          m2, m3 ; constrain(diff_k1s1)
    pabsw           m3, m5
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m7
    pmaxsw          m9, m5
    pminsw         m10, m3
%else
    pminsw          m7, m10
    pminsw          m7, m3
    pmaxsw          m3, m9, m4
    pmaxsw          m3, m5
    mova           m10, m7
    mova            m9, m3
%endif
    psubw           m4, m1 ; diff_k1s2
    psubw           m5, m1 ; diff_k1s3
    pabsw           m3, m4 ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k0p0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0p1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+5] ; off1_k1
    pabsw           m7, m2
    psignw          m4, m5 ; constrain(diff_k1s3)
    pabsw           m5, m3
%if ARCH_X86_64
    pmaxsw          m9, m2
    pminsw         m10, m7
    pmaxsw          m9, m3
    pminsw         m10, m5
%else
    pminsw          m7, m10
    pminsw          m7, m5
    pmaxsw          m5, m9, m2
    pmaxsw          m5, m3
    mova           m10, m7
    mova            m9, m5
%endif
    psubw           m2, m1 ; diff_k0p0
    psubw           m3, m1 ; diff_k0p1
    paddw           m0, m4
    pabsw           m4, m2 ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m7, [rsp+gprsize], m5
    pabsw           m5, m3 ; adiff_k0p1
    pminsw          m7, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m7, m2 ; constrain(diff_k0p0)
    psubusw         m2, [rsp+gprsize], m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1p0
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1p1
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    psignw          m2, m3 ; constrain(diff_k0p1)
    pabsw           m3, m4
    paddw           m7, m2 ; constrain(diff_k0)
    pabsw           m2, m5
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m3
    pmaxsw          m9, m5
    pminsw         m10, m2
%else
    pminsw          m3, m10
    pminsw          m3, m2
    pmaxsw          m2, m9, m4
    pmaxsw          m2, m5
    mova           m10, m3
    mova            m9, m2
%endif
    psubw           m4, m1 ; diff_k1p0
    psubw           m5, m1 ; diff_k1p1
    pabsw           m3, m4 ; adiff_k1p0
    pmullw          m7, [priq+16*0] ; pri_tap_k0
    paddw           m0, m7
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, [rsp+16*0+gprsize], m2
    pabsw           m2, m5 ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k1p0)
    psubusw         m4, [rsp+16*0+gprsize], m3
    pminsw          m4, m2
    psignw          m4, m5 ; constrain(diff_k1p1)
    paddw           m7, m4 ; constrain(diff_k1)
    pmullw          m7, [priq+16*1] ; pri_tap_k1
    paddw           m0, m7 ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
%if ARCH_X86_64
    pmaxsw          m9, m1
    pminsw          m0, m9
%else
    pmaxsw          m2, m9, m1
    pminsw          m0, m2
%endif
    pminsw          m1, m10
    pmaxsw          m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
%endif
%endmacro
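
; Layout of the px scratch buffer used by the functions below (as set up in
; their prologues): each row of the block is stored with a fixed stride of
; 32 bytes (16 words), with 2 padding rows above (px-32*2, px-32*1) and
; below, 2 padding pixels to the left (at -4 bytes) and 2 to the right.
; Edges flagged as unavailable in r9m (HAVE_LEFT/RIGHT/TOP/BOTTOM) are
; filled with pw_m16384: -16384 is far enough from any 10/12-bit sample
; that str - (|diff| >> shift) saturates to 0, so padded taps contribute
; nothing, while the difference still fits in a signed 16-bit word; it also
; never wins the pmaxsw upper clamp in .pri_sec.
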
INIT_XMM ssse3
%if ARCH_X86_64
cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
                               pri, sec, edge
 %define px rsp+32*4
%else
cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
 %define botq topq
 %define px rsp+32*5
%endif
%define base t0-dir_table
%define pri_shift px-16*6
%define sec_shift px-16*5
    mov          edged, r9m
    LEA             t0, dir_table
    movu            m0, [dstq+strideq*0]
    movu            m1, [dstq+strideq*1]
    lea             t1, [dstq+strideq*2]
    movu            m2, [t1 +strideq*0]
    movu            m3, [t1 +strideq*1]
    movddup         m7, [base+pw_m16384]
    mova  [px+32*0+0], m0
    mova  [px+32*1+0], m1
    mova  [px+32*2+0], m2
    mova  [px+32*3+0], m3
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    mova  [px-32*2+0], m0
    mova  [px-32*1+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd  [px-32*2-4], m0
    movd  [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova  [px-32*2+0], m7
    mova  [px-32*1+0], m7
.top_no_left:
    movd  [px-32*2-4], m7
    movd  [px-32*1-4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    mova  [px+32*4+0], m0
    mova  [px+32*5+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd  [px+32*4-4], m0
    movd  [px+32*5-4], m1
    jmp .bottom_done
.no_bottom:
    mova  [px+32*4+0], m7
    mova  [px+32*5+0], m7
.bottom_no_left:
    movd  [px+32*4-4], m7
    movd  [px+32*5-4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd  [px+32*0-4], m0
    movd  [px+32*1-4], m1
    movd  [px+32*2-4], m2
    movd  [px+32*3-4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
.padding_done:
    CDEF_FILTER 4, 4

%if ARCH_X86_64
cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
                               pri, sec, edge
%else
cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
    mov          edged, r9m
    LEA             t0, dir_table
    movu            m0, [dstq+strideq*0]
    movu            m1, [dstq+strideq*1]
    lea             t1, [dstq+strideq*2]
    movu            m2, [t1 +strideq*0]
    movu            m3, [t1 +strideq*1]
    lea             t1, [t1 +strideq*2]
    movu            m4, [t1 +strideq*0]
    movu            m5, [t1 +strideq*1]
    lea             t1, [t1 +strideq*2]
    movu            m6, [t1 +strideq*0]
    movu            m7, [t1 +strideq*1]
    mova  [px+32*0+0], m0
    mova  [px+32*1+0], m1
    mova  [px+32*2+0], m2
    mova  [px+32*3+0], m3
    mova  [px+32*4+0], m4
    mova  [px+32*5+0], m5
    mova  [px+32*6+0], m6
    mova  [px+32*7+0], m7
    movddup         m7, [base+pw_m16384]
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    mova  [px-32*2+0], m0
    mova  [px-32*1+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd  [px-32*2-4], m0
    movd  [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova  [px-32*2+0], m7
    mova  [px-32*1+0], m7
.top_no_left:
    movd  [px-32*2-4], m7
    movd  [px-32*1-4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    mova  [px+32*8+0], m0
    mova  [px+32*9+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd  [px+32*8-4], m0
    movd  [px+32*9-4], m1
    jmp .bottom_done
.no_bottom:
    mova  [px+32*8+0], m7
    mova  [px+32*9+0], m7
.bottom_no_left:
    movd  [px+32*8-4], m7
    movd  [px+32*9-4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd  [px+32*0-4], m0
    movd  [px+32*1-4], m1
    movd  [px+32*2-4], m2
    movd  [px+32*3-4], m3
    movd            m0, [leftq+4*4]
    movd            m1, [leftq+4*5]
    movd            m2, [leftq+4*6]
    movd            m3, [leftq+4*7]
    movd  [px+32*4-4], m0
    movd  [px+32*5-4], m1
    movd  [px+32*6-4], m2
    movd  [px+32*7-4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER 4, 8

%if ARCH_X86_64
cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
                               pri, sec, edge
%else
cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
    mov          edged, r9m
    LEA             t0, dir_table
    mova            m0, [dstq+strideq*0+ 0]
    movd            m1, [dstq+strideq*0+16]
    mova            m2, [dstq+strideq*1+ 0]
    movd            m3, [dstq+strideq*1+16]
    lea             t1, [dstq+strideq*2]
    mova            m4, [t1 +strideq*0+ 0]
    movd            m5, [t1 +strideq*0+16]
    mova            m6, [t1 +strideq*1+ 0]
    movd            m7, [t1 +strideq*1+16]
    lea             t1, [t1 +strideq*2]
    mova [px+32*0+ 0], m0
    movd [px+32*0+16], m1
    mova [px+32*1+ 0], m2
    movd [px+32*1+16], m3
    mova [px+32*2+ 0], m4
    movd [px+32*2+16], m5
    mova [px+32*3+ 0], m6
    movd [px+32*3+16], m7
    mova            m0, [t1 +strideq*0+ 0]
    movd            m1, [t1 +strideq*0+16]
    mova            m2, [t1 +strideq*1+ 0]
    movd            m3, [t1 +strideq*1+16]
    lea             t1, [t1 +strideq*2]
    mova            m4, [t1 +strideq*0+ 0]
    movd            m5, [t1 +strideq*0+16]
    mova            m6, [t1 +strideq*1+ 0]
    movd            m7, [t1 +strideq*1+16]
    mova [px+32*4+ 0], m0
    movd [px+32*4+16], m1
    mova [px+32*5+ 0], m2
    movd [px+32*5+16], m3
    mova [px+32*6+ 0], m4
    movd [px+32*6+16], m5
    mova [px+32*7+ 0], m6
    movd [px+32*7+16], m7
    movddup         m7, [base+pw_m16384]
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    mova            m0, [topq+strideq*0+ 0]
    mova            m1, [topq+strideq*0+16]
    mova            m2, [topq+strideq*1+ 0]
    mova            m3, [topq+strideq*1+16]
    mova [px-32*2+ 0], m0
    movd [px-32*2+16], m1
    mova [px-32*1+ 0], m2
    movd [px-32*1+16], m3
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd  [px-32*2-4], m0
    movd  [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova [px-32*2+ 0], m7
    movd [px-32*2+16], m7
    mova [px-32*1+ 0], m7
    movd [px-32*1+16], m7
.top_no_left:
    movd [px-32*2- 4], m7
    movd [px-32*1- 4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    mova            m0, [botq+strideq*0+ 0]
    movd            m1, [botq+strideq*0+16]
    mova            m2, [botq+strideq*1+ 0]
    movd            m3, [botq+strideq*1+16]
    mova [px+32*8+ 0], m0
    movd [px+32*8+16], m1
    mova [px+32*9+ 0], m2
    movd [px+32*9+16], m3
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd [px+32*8- 4], m0
    movd [px+32*9- 4], m1
    jmp .bottom_done
.no_bottom:
    mova [px+32*8+ 0], m7
    movd [px+32*8+16], m7
    mova [px+32*9+ 0], m7
    movd [px+32*9+16], m7
.bottom_no_left:
    movd [px+32*8- 4], m7
    movd [px+32*9- 4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd [px+32*0- 4], m0
    movd [px+32*1- 4], m1
    movd [px+32*2- 4], m2
    movd [px+32*3- 4], m3
    movd            m0, [leftq+4*4]
    movd            m1, [leftq+4*5]
    movd            m2, [leftq+4*6]
    movd            m3, [leftq+4*7]
    movd [px+32*4- 4], m0
    movd [px+32*5- 4], m1
    movd [px+32*6- 4], m2
    movd [px+32*7- 4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER 8, 8
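
; cdef_dir_16bpc reuses the 8-bit direction search: bdmax>>11 selects the
; dir_shift constant (0x4000 for 10bpc, 0x1000 for 12bpc), and pmulhuw by
; that constant is a cheap per-word >>2 or >>4 that scales the samples down
; to 8-bit range. The rescaled rows (and their psadbw row sums) are then
; handed to cdef_dir_8bpc's .main label, declared via cextern at the top of
; the file.
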
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
    lea             r6, [dir_shift]
    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    movddup         m7, [r6+bdmaxq*8]
    lea             r6, [strideq*3]
    mova            m0, [srcq+strideq*0]
    mova            m1, [srcq+strideq*1]
    mova            m2, [srcq+strideq*2]
    mova            m3, [srcq+r6 ]
    lea           srcq, [srcq+strideq*4]
    mova            m4, [srcq+strideq*0]
    mova            m5, [srcq+strideq*1]
    mova            m6, [srcq+strideq*2]
    REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhuw         m7, [srcq+r6 ]
    pxor            m8, m8
    packuswb        m9, m0, m1
    packuswb       m10, m2, m3
    packuswb       m11, m4, m5
    packuswb       m12, m6, m7
    REPX {psadbw x, m8}, m9, m10, m11, m12
    packssdw        m9, m10
    packssdw       m11, m12
    packssdw        m9, m11
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%else
cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
    mov         bdmaxd, bdmaxm
    LEA             r2, dir_shift
    shr         bdmaxd, 11
    movddup         m7, [r2+bdmaxq*8]
    lea             r3, [strideq*3]
    pmulhuw         m3, m7, [srcq+strideq*0]
    pmulhuw         m4, m7, [srcq+strideq*1]
    pmulhuw         m5, m7, [srcq+strideq*2]
    pmulhuw         m6, m7, [srcq+r3 ]
    movddup         m1, [r2-dir_shift+pw_128]
    lea           srcq, [srcq+strideq*4]
    pxor            m0, m0
    packuswb        m2, m3, m4
    psubw           m3, m1
    psubw           m4, m1
    mova    [esp+0x00], m3
    mova    [esp+0x10], m4
    packuswb        m3, m5, m6
    psadbw          m2, m0
    psadbw          m3, m0
    psubw           m5, m1
    psubw           m6, m1
    packssdw        m2, m3
    mova    [esp+0x20], m5
    mova    [esp+0x50], m6
    pmulhuw         m4, m7, [srcq+strideq*0]
    pmulhuw         m5, m7, [srcq+strideq*1]
    pmulhuw         m6, m7, [srcq+strideq*2]
    pmulhuw         m7, [srcq+r3 ]
    packuswb        m3, m4, m5
    packuswb        m1, m6, m7
    psadbw          m3, m0
    psadbw          m1, m0
    packssdw        m3, m1
    movddup         m1, [r2-dir_shift+pw_128]
    LEA             r2, shufw_6543210x
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%endif
%endmacro

INIT_XMM ssse3
CDEF_DIR
INIT_XMM sse4
CDEF_DIR