diff options
Diffstat (limited to 'third_party/dav1d/src/x86/cdef16_sse.asm')
-rw-r--r-- | third_party/dav1d/src/x86/cdef16_sse.asm | 1033 |
1 files changed, 1033 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/cdef16_sse.asm b/third_party/dav1d/src/x86/cdef16_sse.asm new file mode 100644 index 0000000000..1bd67ace64 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef16_sse.asm @@ -0,0 +1,1033 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright (c) 2017-2021, The rav1e contributors +; Copyright (c) 2021, Nathan Egge +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; DUP8 a, b, ...: emit every argument as 8 consecutive words, i.e. one full
; XMM register worth of identical 16-bit lanes per argument.
%macro DUP8 1-*
    %rep %0
        times 8 dw %1
        %rotate 1
    %endrep
%endmacro

; Primary tap pairs (k0, k1). CDEF_FILTER selects the {4, 2} or the {3, 3}
; pair by reducing the primary strength to 0 or 16 (and prid, 16) and
; addressing pri_taps+priq*2, i.e. byte offset 0 or 32.
pri_taps:  DUP8 4, 2, 3, 3

; Signed byte offsets (k0, k1) into the px scratch buffer, whose rows are
; 32 bytes apart and whose pixels are 2 bytes wide. The filter points dirq at
; dir_table+dir*2 and then reads
;   [dirq+4], [dirq+5] for the primary taps,
;   [dirq+8], [dirq+9] and [dirq+0], [dirq+1] for the secondary taps,
; which is why the table carries 12 rows (the last 4 repeat the first 4).
dir_table: db  1 * 32 + 0,  2 * 32 + 0
           db  1 * 32 + 0,  2 * 32 - 2
           db -1 * 32 + 2, -2 * 32 + 4
           db  0 * 32 + 2, -1 * 32 + 4
           db  0 * 32 + 2,  0 * 32 + 4
           db  0 * 32 + 2,  1 * 32 + 4
           db  1 * 32 + 2,  2 * 32 + 4
           db  1 * 32 + 0,  2 * 32 + 2
           db  1 * 32 + 0,  2 * 32 + 0
           db  1 * 32 + 0,  2 * 32 - 2
           db -1 * 32 + 2, -2 * 32 + 4
           db  0 * 32 + 2, -1 * 32 + 4

; pmulhuw multipliers used by cdef_dir_16bpc, indexed by bitdepth_max >> 11:
; 0x4000 is a right shift by 2 (10 bpc), 0x1000 a right shift by 4 (12 bpc).
dir_shift: times 4 dw 0x4000
           times 4 dw 0x1000

pw_128:    times 4 dw 128
; pmulhrsw by 2048 computes (x + 8) >> 4.
pw_2048:   times 8 dw 2048
; -16384 (0xC000) serves two purposes:
;  * it is the value written into px for pixels outside the available edges;
;  * as a pshufb mask (bytes 0x00, 0xC0) it copies source byte 0 into the low
;    byte of every word and zeroes the high byte, broadcasting a strength.
pw_m16384: times 8 dw -16384

cextern cdef_dir_8bpc_ssse3.main
cextern cdef_dir_8bpc_sse4.main
cextern shufw_6543210x

SECTION .text

; t0 holds the RODATA base pointer (see "base"), t1 is a scratch row pointer.
%if ARCH_X86_32
DECLARE_REG_TMP 5, 3
%elif WIN64
DECLARE_REG_TMP 8, 4
%else
DECLARE_REG_TMP 8, 6
%endif

;-----------------------------------------------------------------------------
; CDEF_FILTER w, h
;
; Tail of every cdef_filter_WxH_16bpc function. It is expanded directly after
; the code that filled the px scratch buffer and expects:
;   m7   = pw_m16384 (used here as the pshufb broadcast mask)
;   t0   = dir_table address (through the "base" define)
;   px, pri_shift, sec_shift defined relative to rsp
;   r5m = pri strength, r6m = sec strength, r7m = dir, r8m = damping,
;   r10m = bitdepth_max
;
; It derives the shift amounts
;   pri_shift = max(0, damping - bsr(pri))
;   sec_shift = damping - tzcnt(sec)
; stores them as 64-bit shift counts, then calls one of the .pri / .sec /
; .pri_sec subroutines %1*%2/8 times; every call filters 8 pixels (two rows
; of 4 or one row of 8) and advances dstq/tmpq itself.
;
; The subroutines are only emitted when %1 == %2. The 4x8 instantiation
; calls the ones emitted by 4x4 (note the %1x%1 in the call targets).
;
; Inside the subroutines rsp is lower by gprsize because of the return
; address, hence the "+gprsize" on every stack reference there.
;-----------------------------------------------------------------------------
%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
    mova            m8, [base+pw_2048]
%else
    ; Only 8 XMM registers exist: m8 becomes a memory operand and the running
    ; max/min (m9/m10) live on the stack.
    DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
    %define         m8  [base+pw_2048]
    %define         m9  [rsp+16*1+gprsize]
    %define        m10  [rsp+16*2+gprsize]
%endif
    movifnidn     prid, r5m
    movifnidn     secd, r6m
    test          prid, prid
    jz .sec_only
    movd            m6, r5m
%if ARCH_X86_32
    ; pridmp shares its register with stride (r1) here; park it on the stack.
    mov       [rsp+24], pridmpd
%endif
    bsr        pridmpd, prid
    lea           tmpd, [priq*4]
    cmp     dword r10m, 0x3ff ; if (bpc == 10)
    cmove         prid, tmpd  ;     pri <<= 2
    mov           tmpd, r8m   ; damping
    mov           dird, r7m
    and           prid, 16    ; 0 or 16 -> tap set selector
    pshufb          m6, m7    ; splat
    lea           dirq, [base+dir_table+dirq*2]
    lea           priq, [base+pri_taps+priq*2]
    test          secd, secd
    jz .pri_only
    ; Both strengths present: the primary strength moves to [rsp], m6 then
    ; carries the secondary strength.
    mova         [rsp], m6
    movd            m6, secd
    tzcnt         secd, secd
    sub        pridmpd, tmpd  ; bsr(pri) - damping
    sub           tmpd, secd  ; sec_shift = damping - tzcnt(sec)
    pshufb          m6, m7
    xor           secd, secd
    neg        pridmpd        ; damping - bsr(pri)
    cmovs      pridmpd, secd  ; clamp a negative pri_shift to 0
%if ARCH_X86_32
    mov  [pri_shift+4], secd  ; zero the upper half of the 64-bit counts
    mov  [sec_shift+4], secd
%endif
    mov  [pri_shift+0], pridmpq
    mov  [sec_shift+0], tmpq
    lea           tmpq, [px]
%if WIN64
    ; .pri_sec uses xmm9/xmm10, but cglobal only declared 9 XMM registers,
    ; so keep the caller's values in two stack argument slots.
    movaps         r4m, m9
    movaps         r6m, m10
%elif ARCH_X86_32
    mov        pridmpd, [rsp+24] ; restore stride
%endif
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
    movaps          m9, r4m
    movaps         m10, r6m
%endif
    jmp .end
.pri_only:
    ; secd is known to be 0 here, so it doubles as the zero constant.
    sub           tmpd, pridmpd ; damping - bsr(pri)
    cmovs         tmpd, secd    ; clamp to 0
%if ARCH_X86_32
    mov        pridmpd, [rsp+24] ; restore stride
    mov  [pri_shift+4], secd
%endif
    mov  [pri_shift+0], tmpq
    lea           tmpq, [px]
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.end:
    RET
.sec_only:
    ; prid is known to be 0 here.
    mov           tmpd, r8m ; damping
    movd            m6, r6m
    tzcnt         secd, secd
    mov           dird, r7m
    pshufb          m6, m7
    sub           tmpd, secd ; sec_shift = damping - tzcnt(sec)
    lea           dirq, [base+dir_table+dirq*2]
%if ARCH_X86_32
    mov  [sec_shift+4], prid
%endif
    mov  [sec_shift+0], tmpq
    lea           tmpq, [px]
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
    jmp .end
%if %1 == %2
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
 %else
    DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
 %endif
;-----------------------------------------------------------------------------
; Shared register roles in .pri / .sec / .pri_sec:
;   m1 = the 8 pixels being filtered (loaded from dst)
;   m6 = broadcast strength (.pri: primary, .sec and .pri_sec: secondary;
;        .pri_sec reads the primary strength from [rsp+gprsize])
;   m8 = pw_2048
;   m0 = running weighted sum, m2-m5/m7 scratch
; constrain(diff) is built as
;   sign(diff) * min(|diff|, max(0, strength - (|diff| >> shift)))
; using psrlw / psubusw / pminsw / psignw.
; Final rounding: sum += (sum >> 15) adds -1 for negative sums, then
; pmulhrsw with 2048 yields (8 + sum - (sum < 0)) >> 4.
;-----------------------------------------------------------------------------
ALIGN function_align
.pri:
    movsx         offq, byte [dirq+4]    ; off_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0p0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0p1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+5]    ; off_k1
    psubw           m2, m1               ; diff_k0p0
    psubw           m3, m1               ; diff_k0p1
    pabsw           m4, m2               ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3               ; adiff_k0p1
    pminsw          m0, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0p0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1p0
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1p1
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    psubw           m4, m1               ; diff_k1p0
    psubw           m5, m1               ; diff_k1p1
    psignw          m2, m3               ; constrain(diff_k0p1)
    pabsw           m3, m4               ; adiff_k1p0
    paddw           m0, m2               ; constrain(diff_k0)
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1p0)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    psignw          m4, m5               ; constrain(diff_k1p1)
    paddw           m7, m4               ; constrain(diff_k1)
    pmullw          m0, [priq+16*0]      ; pri_tap_k0
    pmullw          m7, [priq+16*1]      ; pri_tap_k1
    paddw           m0, m7               ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
ALIGN function_align
.sec:
    movsx         offq, byte [dirq+8]    ; off1_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0s1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+0]    ; off2_k0
    psubw           m2, m1               ; diff_k0s0
    psubw           m3, m1               ; diff_k0s1
    pabsw           m4, m2               ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3               ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k0s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k0s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+9]    ; off1_k1
    psubw           m4, m1               ; diff_k0s2
    psubw           m5, m1               ; diff_k0s3
    psignw          m2, m3               ; constrain(diff_k0s1)
    pabsw           m3, m4               ; adiff_k0s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k1s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k1s1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+1]    ; off2_k1
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k0s3)
    paddw           m0, m4               ; constrain(diff_k0)
    psubw           m2, m1               ; diff_k1s0
    psubw           m3, m1               ; diff_k1s1
    paddw           m0, m0               ; sec_tap_k0
    pabsw           m4, m2               ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3               ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    paddw           m0, m7
    psubw           m4, m1               ; diff_k1s2
    psubw           m5, m1               ; diff_k1s3
    psignw          m2, m3               ; constrain(diff_k1s1)
    pabsw           m3, m4               ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k1s3)
    paddw           m0, m4               ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
; .pri_sec additionally tracks, over every tap it reads plus the centre pixel,
;   m9  = signed maximum (pmaxsw)
;   m10 = minimum of the absolute values (pabsw + pminsw)
; and clamps the filtered result to [m10, m9]. Unavailable pixels hold
; -16384: as a signed value that never wins the maximum, and its absolute
; value 16384 never wins the minimum.
ALIGN function_align
.pri_sec:
    movsx         offq, byte [dirq+8]    ; off2_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0s1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+0]    ; off3_k0
    pabsw           m4, m2
%if ARCH_X86_64
    pabsw          m10, m3
    pmaxsw          m9, m2, m3
    pminsw         m10, m4
%else
    pabsw           m7, m3
    pmaxsw          m5, m2, m3
    pminsw          m4, m7
    mova            m9, m5
    mova           m10, m4
%endif
    psubw           m2, m1               ; diff_k0s0
    psubw           m3, m1               ; diff_k0s1
    pabsw           m4, m2               ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3               ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k0s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k0s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+9]    ; off2_k1
    pabsw           m7, m4
    psignw          m2, m3               ; constrain(diff_k0s1)
    pabsw           m3, m5
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m7
    pmaxsw          m9, m5
    pminsw         m10, m3
%else
    pminsw          m7, m10
    pminsw          m7, m3
    pmaxsw          m3, m9, m4
    pmaxsw          m3, m5
    mova           m10, m7
    mova            m9, m3
%endif
    psubw           m4, m1               ; diff_k0s2
    psubw           m5, m1               ; diff_k0s3
    paddw           m0, m2
    pabsw           m3, m4               ; adiff_k0s2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k1s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k1s1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+1]    ; off3_k1
    paddw           m0, m7
    pabsw           m7, m2
    psignw          m4, m5               ; constrain(diff_k0s3)
    pabsw           m5, m3
%if ARCH_X86_64
    pmaxsw          m9, m2
    pminsw         m10, m7
    pmaxsw          m9, m3
    pminsw         m10, m5
%else
    pminsw          m7, m10
    pminsw          m7, m5
    pmaxsw          m5, m9, m2
    pmaxsw          m5, m3
    mova           m10, m7
    mova            m9, m5
%endif
    paddw           m0, m4               ; constrain(diff_k0)
    psubw           m2, m1               ; diff_k1s0
    psubw           m3, m1               ; diff_k1s1
    paddw           m0, m0               ; sec_tap_k0
    pabsw           m4, m2               ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3               ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+4]    ; off1_k0
    paddw           m0, m7
    pabsw           m7, m4
    psignw          m2, m3               ; constrain(diff_k1s1)
    pabsw           m3, m5
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m7
    pmaxsw          m9, m5
    pminsw         m10, m3
%else
    pminsw          m7, m10
    pminsw          m7, m3
    pmaxsw          m3, m9, m4
    pmaxsw          m3, m5
    mova           m10, m7
    mova            m9, m3
%endif
    psubw           m4, m1               ; diff_k1s2
    psubw           m5, m1               ; diff_k1s3
    pabsw           m3, m4               ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k0p0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0p1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+5]    ; off1_k1
    pabsw           m7, m2
    psignw          m4, m5               ; constrain(diff_k1s3)
    pabsw           m5, m3
%if ARCH_X86_64
    pmaxsw          m9, m2
    pminsw         m10, m7
    pmaxsw          m9, m3
    pminsw         m10, m5
%else
    pminsw          m7, m10
    pminsw          m7, m5
    pmaxsw          m5, m9, m2
    pmaxsw          m5, m3
    mova           m10, m7
    mova            m9, m5
%endif
    psubw           m2, m1               ; diff_k0p0
    psubw           m3, m1               ; diff_k0p1
    paddw           m0, m4
    pabsw           m4, m2               ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m7, [rsp+gprsize], m5 ; [rsp+gprsize] = primary strength
    pabsw           m5, m3               ; adiff_k0p1
    pminsw          m7, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k0p0)
    psubusw         m2, [rsp+gprsize], m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1p0
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1p1
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    psignw          m2, m3               ; constrain(diff_k0p1)
    pabsw           m3, m4
    paddw           m7, m2               ; constrain(diff_k0)
    pabsw           m2, m5
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m3
    pmaxsw          m9, m5
    pminsw         m10, m2
%else
    pminsw          m3, m10
    pminsw          m3, m2
    pmaxsw          m2, m9, m4
    pmaxsw          m2, m5
    mova           m10, m3
    mova            m9, m2
%endif
    psubw           m4, m1               ; diff_k1p0
    psubw           m5, m1               ; diff_k1p1
    pabsw           m3, m4               ; adiff_k1p0
    pmullw          m7, [priq+16*0]      ; pri_tap_k0
    paddw           m0, m7
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, [rsp+16*0+gprsize], m2
    pabsw           m2, m5               ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1p0)
    psubusw         m4, [rsp+16*0+gprsize], m3
    pminsw          m4, m2
    psignw          m4, m5               ; constrain(diff_k1p1)
    paddw           m7, m4               ; constrain(diff_k1)
    pmullw          m7, [priq+16*1]      ; pri_tap_k1
    paddw           m0, m7               ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
    ; Clamp to [min, max] of the taps and the centre pixel.
%if ARCH_X86_64
    pmaxsw          m9, m1
    pminsw          m0, m9
%else
    pmaxsw          m2, m9, m1
    pminsw          m0, m2
%endif
    pminsw          m1, m10
    pmaxsw          m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
%endif
%endmacro

;-----------------------------------------------------------------------------
; cdef_filter_4x4_16bpc(dst, stride, left, top, bot, pri, sec, dir, damping,
;                       edge, bitdepth_max)
; (argument order taken from the cglobal name list and the rNm slots read
;  below and in CDEF_FILTER)
;
; Copies the block plus a 2-pixel border on every side into the px scratch
; buffer (32 bytes per row, rows -2..h+1, columns -2..w+1). Border pixels the
; edge flags mark as unavailable are written as -16384. The edge bits tested
; are 1 = HAVE_LEFT, 2 = HAVE_RIGHT, 4 = HAVE_TOP, 8 = HAVE_BOTTOM.
; left holds 2 pixels (4 bytes) per row.
;
; The px / base / pri_shift / sec_shift (and x86-32 botq) defines below stay
; in effect for the 4x8 and 8x8 functions that follow.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
                                               pri, sec, edge
    %define         px  rsp+32*4
%else
cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
    %define       botq  topq
    %define         px  rsp+32*5
%endif
    %define       base  t0-dir_table
    %define  pri_shift  px-16*6
    %define  sec_shift  px-16*5
    mov          edged, r9m
    LEA             t0, dir_table
    ; Block rows; each 16-byte load also picks up the pixels to the right.
    movu            m0, [dstq+strideq*0]
    movu            m1, [dstq+strideq*1]
    lea             t1, [dstq+strideq*2]
    movu            m2, [t1  +strideq*0]
    movu            m3, [t1  +strideq*1]
    movddup         m7, [base+pw_m16384]
    mova   [px+32*0+0], m0
    mova   [px+32*1+0], m1
    mova   [px+32*2+0], m2
    mova   [px+32*3+0], m3
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    mova   [px-32*2+0], m0
    mova   [px-32*1+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd   [px-32*2-4], m0
    movd   [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova   [px-32*2+0], m7
    mova   [px-32*1+0], m7
.top_no_left:
    movd   [px-32*2-4], m7
    movd   [px-32*1-4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    mova   [px+32*4+0], m0
    mova   [px+32*5+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd   [px+32*4-4], m0
    movd   [px+32*5-4], m1
    jmp .bottom_done
.no_bottom:
    mova   [px+32*4+0], m7
    mova   [px+32*5+0], m7
.bottom_no_left:
    movd   [px+32*4-4], m7
    movd   [px+32*5-4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd   [px+32*0-4], m0
    movd   [px+32*1-4], m1
    movd   [px+32*2-4], m2
    movd   [px+32*3-4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    ; No right neighbour: overwrite columns w..w+1 of every row.
    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
.padding_done:
    CDEF_FILTER       4, 4

;-----------------------------------------------------------------------------
; cdef_filter_4x8_16bpc: same contract and px layout as the 4x4 version, with
; 8 block rows (bottom border at rows 8 and 9). Reuses the px / base /
; pri_shift / sec_shift defines set up by cdef_filter_4x4_16bpc and filters
; through the 4x4 subroutines.
;-----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
                                               pri, sec, edge
%else
cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
    mov          edged, r9m
    LEA             t0, dir_table
    movu            m0, [dstq+strideq*0]
    movu            m1, [dstq+strideq*1]
    lea             t1, [dstq+strideq*2]
    movu            m2, [t1  +strideq*0]
    movu            m3, [t1  +strideq*1]
    lea             t1, [t1  +strideq*2]
    movu            m4, [t1  +strideq*0]
    movu            m5, [t1  +strideq*1]
    lea             t1, [t1  +strideq*2]
    movu            m6, [t1  +strideq*0]
    movu            m7, [t1  +strideq*1]
    mova   [px+32*0+0], m0
    mova   [px+32*1+0], m1
    mova   [px+32*2+0], m2
    mova   [px+32*3+0], m3
    mova   [px+32*4+0], m4
    mova   [px+32*5+0], m5
    mova   [px+32*6+0], m6
    mova   [px+32*7+0], m7
    movddup         m7, [base+pw_m16384]
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    mova   [px-32*2+0], m0
    mova   [px-32*1+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd   [px-32*2-4], m0
    movd   [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova   [px-32*2+0], m7
    mova   [px-32*1+0], m7
.top_no_left:
    movd   [px-32*2-4], m7
    movd   [px-32*1-4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    mova   [px+32*8+0], m0
    mova   [px+32*9+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd   [px+32*8-4], m0
    movd   [px+32*9-4], m1
    jmp .bottom_done
.no_bottom:
    mova   [px+32*8+0], m7
    mova   [px+32*9+0], m7
.bottom_no_left:
    movd   [px+32*8-4], m7
    movd   [px+32*9-4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd   [px+32*0-4], m0
    movd   [px+32*1-4], m1
    movd   [px+32*2-4], m2
    movd   [px+32*3-4], m3
    movd            m0, [leftq+4*4]
    movd            m1, [leftq+4*5]
    movd            m2, [leftq+4*6]
    movd            m3, [leftq+4*7]
    movd   [px+32*4-4], m0
    movd   [px+32*5-4], m1
    movd   [px+32*6-4], m2
    movd   [px+32*7-4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER       4, 8

;-----------------------------------------------------------------------------
; cdef_filter_8x8_16bpc: same contract and px layout as the 4x4 version for
; an 8x8 block. Every row is copied as 16 bytes of block pixels (aligned
; load) plus a 4-byte load/store at +16 for the two pixels to the right.
; Emits its own .pri / .sec / .pri_sec subroutines.
;-----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
                                               pri, sec, edge
%else
cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
    mov          edged, r9m
    LEA             t0, dir_table
    mova            m0, [dstq+strideq*0+ 0]
    movd            m1, [dstq+strideq*0+16]
    mova            m2, [dstq+strideq*1+ 0]
    movd            m3, [dstq+strideq*1+16]
    lea             t1, [dstq+strideq*2]
    mova            m4, [t1  +strideq*0+ 0]
    movd            m5, [t1  +strideq*0+16]
    mova            m6, [t1  +strideq*1+ 0]
    movd            m7, [t1  +strideq*1+16]
    lea             t1, [t1  +strideq*2]
    mova  [px+32*0+ 0], m0
    movd  [px+32*0+16], m1
    mova  [px+32*1+ 0], m2
    movd  [px+32*1+16], m3
    mova  [px+32*2+ 0], m4
    movd  [px+32*2+16], m5
    mova  [px+32*3+ 0], m6
    movd  [px+32*3+16], m7
    mova            m0, [t1  +strideq*0+ 0]
    movd            m1, [t1  +strideq*0+16]
    mova            m2, [t1  +strideq*1+ 0]
    movd            m3, [t1  +strideq*1+16]
    lea             t1, [t1  +strideq*2]
    mova            m4, [t1  +strideq*0+ 0]
    movd            m5, [t1  +strideq*0+16]
    mova            m6, [t1  +strideq*1+ 0]
    movd            m7, [t1  +strideq*1+16]
    mova  [px+32*4+ 0], m0
    movd  [px+32*4+16], m1
    mova  [px+32*5+ 0], m2
    movd  [px+32*5+16], m3
    mova  [px+32*6+ 0], m4
    movd  [px+32*6+16], m5
    mova  [px+32*7+ 0], m6
    movd  [px+32*7+16], m7
    movddup         m7, [base+pw_m16384]
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    ; Only the low 4 bytes of m1/m3 are stored below, so load just those
    ; 4 bytes (movd), matching the dst and bottom rows, rather than reading
    ; a full 16 bytes at +16.
    mova            m0, [topq+strideq*0+ 0]
    movd            m1, [topq+strideq*0+16]
    mova            m2, [topq+strideq*1+ 0]
    movd            m3, [topq+strideq*1+16]
    mova  [px-32*2+ 0], m0
    movd  [px-32*2+16], m1
    mova  [px-32*1+ 0], m2
    movd  [px-32*1+16], m3
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd   [px-32*2-4], m0
    movd   [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova  [px-32*2+ 0], m7
    movd  [px-32*2+16], m7
    mova  [px-32*1+ 0], m7
    movd  [px-32*1+16], m7
.top_no_left:
    movd  [px-32*2- 4], m7
    movd  [px-32*1- 4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    mova            m0, [botq+strideq*0+ 0]
    movd            m1, [botq+strideq*0+16]
    mova            m2, [botq+strideq*1+ 0]
    movd            m3, [botq+strideq*1+16]
    mova  [px+32*8+ 0], m0
    movd  [px+32*8+16], m1
    mova  [px+32*9+ 0], m2
    movd  [px+32*9+16], m3
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd  [px+32*8- 4], m0
    movd  [px+32*9- 4], m1
    jmp .bottom_done
.no_bottom:
    mova  [px+32*8+ 0], m7
    movd  [px+32*8+16], m7
    mova  [px+32*9+ 0], m7
    movd  [px+32*9+16], m7
.bottom_no_left:
    movd  [px+32*8- 4], m7
    movd  [px+32*9- 4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd  [px+32*0- 4], m0
    movd  [px+32*1- 4], m1
    movd  [px+32*2- 4], m2
    movd  [px+32*3- 4], m3
    movd            m0, [leftq+4*4]
    movd            m1, [leftq+4*5]
    movd            m2, [leftq+4*6]
    movd            m3, [leftq+4*7]
    movd  [px+32*4- 4], m0
    movd  [px+32*5- 4], m1
    movd  [px+32*6- 4], m2
    movd  [px+32*7- 4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER       8, 8

;-----------------------------------------------------------------------------
; CDEF_DIR: emits cdef_dir_16bpc(src, stride, var, bitdepth_max) for the
; current INIT_XMM instruction set.
;
; The 8 rows of 8 pixels are scaled down with pmulhuw by dir_shift[bdmax >> 11]
; (>> 2 for 10 bpc, >> 4 for 12 bpc), row sums are formed with
; packuswb + psadbw, and control then jumps into the .main entry point of the
; matching cdef_dir_8bpc function, which finishes the job.
; NOTE(review): the register / stack layout handed over to .main (including
; the [esp+0x50] slot and the pw_128 bias on x86-32) is dictated by that
; external code and cannot be verified from this file alone.
;-----------------------------------------------------------------------------
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
    lea             r6, [dir_shift]
    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    movddup         m7, [r6+bdmaxq*8]
    lea             r6, [strideq*3]
    mova            m0, [srcq+strideq*0]
    mova            m1, [srcq+strideq*1]
    mova            m2, [srcq+strideq*2]
    mova            m3, [srcq+r6       ]
    lea           srcq, [srcq+strideq*4]
    mova            m4, [srcq+strideq*0]
    mova            m5, [srcq+strideq*1]
    mova            m6, [srcq+strideq*2]
    REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhuw         m7, [srcq+r6       ]
    pxor            m8, m8
    packuswb        m9, m0, m1
    packuswb       m10, m2, m3
    packuswb       m11, m4, m5
    packuswb       m12, m6, m7
    REPX {psadbw x, m8}, m9, m10, m11, m12
    packssdw        m9, m10
    packssdw       m11, m12
    packssdw        m9, m11
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%else
cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
    mov         bdmaxd, bdmaxm
    LEA             r2, dir_shift
    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    movddup         m7, [r2+bdmaxq*8]
    lea             r3, [strideq*3]
    pmulhuw         m3, m7, [srcq+strideq*0]
    pmulhuw         m4, m7, [srcq+strideq*1]
    pmulhuw         m5, m7, [srcq+strideq*2]
    pmulhuw         m6, m7, [srcq+r3       ]
    movddup         m1, [r2-dir_shift+pw_128]
    lea           srcq, [srcq+strideq*4]
    pxor            m0, m0
    packuswb        m2, m3, m4
    psubw           m3, m1
    psubw           m4, m1
    mova    [esp+0x00], m3
    mova    [esp+0x10], m4
    packuswb        m3, m5, m6
    psadbw          m2, m0
    psadbw          m3, m0
    psubw           m5, m1
    psubw           m6, m1
    packssdw        m2, m3
    mova    [esp+0x20], m5
    mova    [esp+0x50], m6
    pmulhuw         m4, m7, [srcq+strideq*0]
    pmulhuw         m5, m7, [srcq+strideq*1]
    pmulhuw         m6, m7, [srcq+strideq*2]
    pmulhuw         m7, [srcq+r3       ]
    packuswb        m3, m4, m5
    packuswb        m1, m6, m7
    psadbw          m3, m0
    psadbw          m1, m0
    packssdw        m3, m1
    movddup         m1, [r2-dir_shift+pw_128]
    LEA             r2, shufw_6543210x
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%endif
%endmacro

INIT_XMM ssse3
CDEF_DIR

INIT_XMM sse4
CDEF_DIR