From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 21:33:14 +0200
Subject: Adding upstream version 115.7.0esr.

Signed-off-by: Daniel Baumann
---
 third_party/dav1d/src/x86/cdef16_avx2.asm | 877 ++++++++++++++++++++++++++++++
 1 file changed, 877 insertions(+)
 create mode 100644 third_party/dav1d/src/x86/cdef16_avx2.asm

diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm
new file mode 100644
index 0000000000..4c8d3bca43
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx2.asm
@@ -0,0 +1,877 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
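+
+; CDEF (constrained directional enhancement filter), 10/12 bpc, AVX2.
+; Each pixel is adjusted by a tap-weighted sum of constrained differences
+; against primary neighbors along the detected direction and secondary
+; neighbors along the two directions 45 degrees off it:
+;   constrain(diff, s, d) = sign(diff) *
+;       min(|diff|, max(0, s - (|diff| >> max(0, d - log2(s)))))
+; where s is the pri/sec strength and d the damping. The kernels below
+; realize this branchlessly: psubusw's unsigned saturation supplies the
+; max(0, ...), pminsw the min, and psignw the sign transfer.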
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA + +%macro DIR_TABLE 1 ; stride + db 1 * %1 + 0, 2 * %1 + 0 + db 1 * %1 + 0, 2 * %1 - 2 + db -1 * %1 + 2, -2 * %1 + 4 + db 0 * %1 + 2, -1 * %1 + 4 + db 0 * %1 + 2, 0 * %1 + 4 + db 0 * %1 + 2, 1 * %1 + 4 + db 1 * %1 + 2, 2 * %1 + 4 + db 1 * %1 + 0, 2 * %1 + 2 + db 1 * %1 + 0, 2 * %1 + 0 + db 1 * %1 + 0, 2 * %1 - 2 + db -1 * %1 + 2, -2 * %1 + 4 + db 0 * %1 + 2, -1 * %1 + 4 +%endmacro + +dir_table4: DIR_TABLE 16 +dir_table8: DIR_TABLE 32 +pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3 + +dir_shift: times 2 dw 0x4000 + times 2 dw 0x1000 + +pw_2048: times 2 dw 2048 +pw_m16384: times 2 dw -16384 + +cextern cdef_dir_8bpc_avx2.main + +SECTION .text + +%macro CDEF_FILTER 2 ; w, h + DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp + movifnidn prid, r5m + movifnidn secd, r6m + mov dird, r7m + vpbroadcastd m8, [base+pw_2048] + lea dirq, [base+dir_table%1+dirq*2] + test prid, prid + jz .sec_only +%if WIN64 + vpbroadcastw m6, prim + movaps [rsp+16*0], xmm9 + movaps [rsp+16*1], xmm10 +%else + movd xm6, prid + vpbroadcastw m6, xm6 +%endif + lzcnt pridmpd, prid + rorx tmpd, prid, 2 + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, tmpd ; pri >>= 2 + mov tmpd, r8m ; damping + and prid, 4 + sub tmpd, 31 + vpbroadcastd m9, [base+pri_taps+priq+8*0] + vpbroadcastd m10, [base+pri_taps+priq+8*1] + test secd, secd + jz .pri_only +%if WIN64 + movaps r8m, xmm13 + vpbroadcastw m13, secm + movaps r4m, xmm11 + movaps r6m, xmm12 +%else + movd xm0, secd + vpbroadcastw m13, xm0 +%endif + lzcnt secd, secd + xor prid, prid + add pridmpd, tmpd + cmovs pridmpd, prid + add secd, tmpd + lea tmpq, [px] + mov [pri_shift], pridmpq + mov [sec_shift], secq +%rep %1*%2/16 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec +%endrep +%if WIN64 + movaps xmm11, r4m + movaps xmm12, r6m + movaps xmm13, r8m +%endif + jmp .pri_end +.pri_only: + add pridmpd, tmpd + cmovs pridmpd, secd + lea tmpq, [px] + mov [pri_shift], pridmpq +%rep %1*%2/16 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri +%endrep +.pri_end: +%if WIN64 + movaps xmm9, [rsp+16*0] + movaps xmm10, [rsp+16*1] +%endif +.end: + RET +.sec_only: + mov tmpd, r8m ; damping +%if WIN64 + vpbroadcastw m6, secm +%else + movd xm6, secd + vpbroadcastw m6, xm6 +%endif + tzcnt secd, secd + sub tmpd, secd + mov [sec_shift], tmpq + lea tmpq, [px] +%rep %1*%2/16 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec +%endrep + jmp .end +%if %1 == %2 +ALIGN function_align +.pri: + movsx offq, byte [dirq+4] ; off_k0 +%if %1 == 4 + mova m1, [tmpq+32*0] + punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3 + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k0p0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k0p1 +%else + mova xm1, [tmpq+32*0] + vinserti128 m1, [tmpq+32*1], 1 + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+5] ; off_k1 + psubw m2, m1 ; diff_k0p0 + psubw m3, m1 ; diff_k0p1 + pabsw m4, m2 ; adiff_k0p0 + psrlw m5, m4, [pri_shift+gprsize] + psubusw m0, m6, m5 + pabsw m5, m3 ; adiff_k0p1 + pminsw m0, m4 + psrlw m4, m5, [pri_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0p0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k1p0 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k1p1 +%else + 
movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + psubw m4, m1 ; diff_k1p0 + psubw m5, m1 ; diff_k1p1 + psignw m2, m3 ; constrain(diff_k0p1) + pabsw m3, m4 ; adiff_k1p0 + paddw m0, m2 ; constrain(diff_k0) + psrlw m2, m3, [pri_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1p1 + pminsw m7, m3 + psrlw m3, m2, [pri_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1p0) + psubusw m4, m6, m3 + pminsw m4, m2 + psignw m4, m5 ; constrain(diff_k1p1) + paddw m7, m4 ; constrain(diff_k1) + pmullw m0, m9 ; pri_tap_k0 + pmullw m7, m10 ; pri_tap_k1 + paddw m0, m7 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + add tmpq, 32*2 + paddw m0, m1 +%if %1 == 4 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] +%else + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] +%endif + ret +ALIGN function_align +.sec: + movsx offq, byte [dirq+8] ; off1_k0 +%if %1 == 4 + mova m1, [tmpq+32*0] + punpcklqdq m1, [tmpq+32*1] + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k0s0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k0s1 +%else + mova xm1, [tmpq+32*0] + vinserti128 m1, [tmpq+32*1], 1 + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+0] ; off2_k0 + psubw m2, m1 ; diff_k0s0 + psubw m3, m1 ; diff_k0s1 + pabsw m4, m2 ; adiff_k0s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m0, m6, m5 + pabsw m5, m3 ; adiff_k0s1 + pminsw m0, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0s0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k0s2 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k0s3 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+9] ; off1_k1 + psubw m4, m1 ; diff_k0s2 + psubw m5, m1 ; diff_k0s3 + psignw m2, m3 ; constrain(diff_k0s1) + pabsw m3, m4 ; adiff_k0s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k0s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k0s2) + psubusw m4, m6, m3 + pminsw m4, m2 +%if %1 == 4 + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k1s0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k1s1 +%else + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+1] ; off2_k1 + paddw m0, m7 + psignw m4, m5 ; constrain(diff_k0s3) + paddw m0, m4 ; constrain(diff_k0) + psubw m2, m1 ; diff_k1s0 + psubw m3, m1 ; diff_k1s1 + paddw m0, m0 ; sec_tap_k0 + pabsw m4, m2 ; adiff_k1s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m7, m6, m5 + pabsw m5, m3 ; adiff_k1s1 + pminsw m7, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m7, m2 ; constrain(diff_k1s0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k1s2 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k1s3 +%else + movu xm4, 
[tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + paddw m0, m7 + psubw m4, m1 ; diff_k1s2 + psubw m5, m1 ; diff_k1s3 + psignw m2, m3 ; constrain(diff_k1s1) + pabsw m3, m4 ; adiff_k1s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1s2) + psubusw m4, m6, m3 + pminsw m4, m2 + paddw m0, m7 + psignw m4, m5 ; constrain(diff_k1s3) + paddw m0, m4 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + add tmpq, 32*2 + paddw m0, m1 +%if %1 == 4 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] +%else + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] +%endif + ret +ALIGN function_align +.pri_sec: + movsx offq, byte [dirq+8] ; off2_k0 +%if %1 == 4 + mova m1, [tmpq+32*0] + punpcklqdq m1, [tmpq+32*1] + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k0s0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k0s1 +%else + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+0] ; off3_k0 + pmaxsw m11, m2, m3 + pminuw m12, m2, m3 + psubw m2, m1 ; diff_k0s0 + psubw m3, m1 ; diff_k0s1 + pabsw m4, m2 ; adiff_k0s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m0, m13, m5 + pabsw m5, m3 ; adiff_k0s1 + pminsw m0, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0s0) + psubusw m2, m13, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k0s2 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k0s3 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+9] ; off2_k1 + psignw m2, m3 ; constrain(diff_k0s1) + pmaxsw m11, m4 + pminuw m12, m4 + pmaxsw m11, m5 + pminuw m12, m5 + psubw m4, m1 ; diff_k0s2 + psubw m5, m1 ; diff_k0s3 + paddw m0, m2 + pabsw m3, m4 ; adiff_k0s2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m13, m2 + pabsw m2, m5 ; adiff_k0s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k0s2) + psubusw m4, m13, m3 + pminsw m4, m2 +%if %1 == 4 + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k1s0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k1s1 +%else + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+1] ; off3_k1 + paddw m0, m7 + psignw m4, m5 ; constrain(diff_k0s3) + pmaxsw m11, m2 + pminuw m12, m2 + pmaxsw m11, m3 + pminuw m12, m3 + paddw m0, m4 ; constrain(diff_k0) + psubw m2, m1 ; diff_k1s0 + psubw m3, m1 ; diff_k1s1 + paddw m0, m0 ; sec_tap_k0 + pabsw m4, m2 ; adiff_k1s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m7, m13, m5 + pabsw m5, m3 ; adiff_k1s1 + pminsw m7, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m7, m2 ; constrain(diff_k1s0) + psubusw m2, m13, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; 
k1s2 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k1s3 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+4] ; off1_k0 + paddw m0, m7 + psignw m2, m3 ; constrain(diff_k1s1) + pmaxsw m11, m4 + pminuw m12, m4 + pmaxsw m11, m5 + pminuw m12, m5 + psubw m4, m1 ; diff_k1s2 + psubw m5, m1 ; diff_k1s3 + pabsw m3, m4 ; adiff_k1s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m13, m2 + pabsw m2, m5 ; adiff_k1s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1s2) + psubusw m4, m13, m3 + pminsw m4, m2 + paddw m0, m7 +%if %1 == 4 + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k0p0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k0p1 +%else + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+5] ; off1_k1 + psignw m4, m5 ; constrain(diff_k1s3) + pmaxsw m11, m2 + pminuw m12, m2 + pmaxsw m11, m3 + pminuw m12, m3 + psubw m2, m1 ; diff_k0p0 + psubw m3, m1 ; diff_k0p1 + paddw m0, m4 + pabsw m4, m2 ; adiff_k0p0 + psrlw m5, m4, [pri_shift+gprsize] + psubusw m7, m6, m5 + pabsw m5, m3 ; adiff_k0p1 + pminsw m7, m4 + psrlw m4, m5, [pri_shift+gprsize] + psignw m7, m2 ; constrain(diff_k0p0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k1p0 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k1p1 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + psignw m2, m3 ; constrain(diff_k0p1) + paddw m7, m2 ; constrain(diff_k0) + pmaxsw m11, m4 + pminuw m12, m4 + pmaxsw m11, m5 + pminuw m12, m5 + psubw m4, m1 ; diff_k1p0 + psubw m5, m1 ; diff_k1p1 + pabsw m3, m4 ; adiff_k1p0 + pmullw m7, m9 ; pri_tap_k0 + paddw m0, m7 + psrlw m2, m3, [pri_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1p1 + pminsw m7, m3 + psrlw m3, m2, [pri_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1p0) + psubusw m4, m6, m3 + pminsw m4, m2 + psignw m4, m5 ; constrain(diff_k1p1) + paddw m7, m4 ; constrain(diff_k1) + pmullw m7, m10 ; pri_tap_k1 + paddw m0, m7 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + add tmpq, 32*2 + pmaxsw m11, m1 + pminuw m12, m1 + paddw m0, m1 + pminsw m0, m11 + pmaxsw m0, m12 +%if %1 == 4 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] +%else + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] +%endif + ret +%endif +%endmacro + +INIT_YMM avx2 +cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \ + pri, sec, edge +%if WIN64 + %define px rsp+16*6 + %define offq r8 + %define pri_shift rsp+16*2 + %define sec_shift rsp+16*3 +%else + %define px rsp+16*4 + %define offq r4 + %define pri_shift rsp+16*0 + %define sec_shift rsp+16*1 +%endif + %define base r8-dir_table4 + mov edged, r9m + lea r8, [dir_table4] + movu xm0, [dstq+strideq*0] + movu xm1, [dstq+strideq*1] + lea r9, [strideq*3] + movu xm2, [dstq+strideq*2] + movu xm3, [dstq+r9 ] + vpbroadcastd m7, [base+pw_m16384] + mova [px+16*0+0], xm0 + mova [px+16*1+0], xm1 + mova [px+16*2+0], xm2 
+ mova [px+16*3+0], xm3 + test edgeb, 4 ; HAVE_TOP + jz .no_top + movu xm0, [topq+strideq*0] + movu xm1, [topq+strideq*1] + mova [px-16*2+0], xm0 + mova [px-16*1+0], xm1 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd xm0, [topq+strideq*0-4] + movd xm1, [topq+strideq*1-4] + movd [px-16*2-4], xm0 + movd [px-16*1-4], xm1 + jmp .top_done +.no_top: + mova [px-16*2+0], m7 +.top_no_left: + movd [px-16*2-4], xm7 + movd [px-16*1-4], xm7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movu xm0, [botq+strideq*0] + movu xm1, [botq+strideq*1] + mova [px+16*4+0], xm0 + mova [px+16*5+0], xm1 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] + movd [px+16*4-4], xm0 + movd [px+16*5-4], xm1 + jmp .bottom_done +.no_bottom: + mova [px+16*4+0], m7 +.bottom_no_left: + movd [px+16*4-4], xm7 + movd [px+16*5-4], xm7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movd xm0, [leftq+4*0] + movd xm1, [leftq+4*1] + movd xm2, [leftq+4*2] + movd xm3, [leftq+4*3] + movd [px+16*0-4], xm0 + movd [px+16*1-4], xm1 + movd [px+16*2-4], xm2 + movd [px+16*3-4], xm3 + jmp .left_done +.no_left: + REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3 +.left_done: + test edgeb, 2 ; HAVE_RIGHT + jnz .padding_done + REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5 +.padding_done: + CDEF_FILTER 4, 4 + +cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \ + pri, sec, edge + mov edged, r9m + movu xm0, [dstq+strideq*0] + movu xm1, [dstq+strideq*1] + lea r9, [strideq*3] + movu xm2, [dstq+strideq*2] + movu xm3, [dstq+r9 ] + lea r6, [dstq+strideq*4] + movu xm4, [r6 +strideq*0] + movu xm5, [r6 +strideq*1] + movu xm6, [r6 +strideq*2] + movu xm7, [r6 +r9 ] + lea r8, [dir_table4] + mova [px+16*0+0], xm0 + mova [px+16*1+0], xm1 + mova [px+16*2+0], xm2 + mova [px+16*3+0], xm3 + mova [px+16*4+0], xm4 + mova [px+16*5+0], xm5 + mova [px+16*6+0], xm6 + mova [px+16*7+0], xm7 + vpbroadcastd m7, [base+pw_m16384] + test edgeb, 4 ; HAVE_TOP + jz .no_top + movu xm0, [topq+strideq*0] + movu xm1, [topq+strideq*1] + mova [px-16*2+0], xm0 + mova [px-16*1+0], xm1 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd xm0, [topq+strideq*0-4] + movd xm1, [topq+strideq*1-4] + movd [px-16*2-4], xm0 + movd [px-16*1-4], xm1 + jmp .top_done +.no_top: + mova [px-16*2+0], m7 +.top_no_left: + movd [px-16*2-4], xm7 + movd [px-16*1-4], xm7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movu xm0, [botq+strideq*0] + movu xm1, [botq+strideq*1] + mova [px+16*8+0], xm0 + mova [px+16*9+0], xm1 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] + movd [px+16*8-4], xm0 + movd [px+16*9-4], xm1 + jmp .bottom_done +.no_bottom: + mova [px+16*8+0], m7 +.bottom_no_left: + movd [px+16*8-4], xm7 + movd [px+16*9-4], xm7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movd xm0, [leftq+4*0] + movd xm1, [leftq+4*1] + movd xm2, [leftq+4*2] + movd xm3, [leftq+4*3] + movd [px+16*0-4], xm0 + movd [px+16*1-4], xm1 + movd [px+16*2-4], xm2 + movd [px+16*3-4], xm3 + movd xm0, [leftq+4*4] + movd xm1, [leftq+4*5] + movd xm2, [leftq+4*6] + movd xm3, [leftq+4*7] + movd [px+16*4-4], xm0 + movd [px+16*5-4], xm1 + movd [px+16*6-4], xm2 + movd [px+16*7-4], xm3 + jmp .left_done +.no_left: + REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7 +.left_done: + test edgeb, 2 ; HAVE_RIGHT + jnz .padding_done + REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +.padding_done: + CDEF_FILTER 4, 8 + +cglobal 
cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \ + pri, sec, edge +%if WIN64 + %define px rsp+32*4 +%else + %define px rsp+32*3 +%endif + %define base r8-dir_table8 + mov edged, r9m + movu m0, [dstq+strideq*0] + movu m1, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movu m2, [r6 +strideq*0] + movu m3, [r6 +strideq*1] + lea r6, [r6 +strideq*2] + movu m4, [r6 +strideq*0] + movu m5, [r6 +strideq*1] + lea r6, [r6 +strideq*2] + movu m6, [r6 +strideq*0] + movu m7, [r6 +strideq*1] + lea r8, [dir_table8] + mova [px+32*0+0], m0 + mova [px+32*1+0], m1 + mova [px+32*2+0], m2 + mova [px+32*3+0], m3 + mova [px+32*4+0], m4 + mova [px+32*5+0], m5 + mova [px+32*6+0], m6 + mova [px+32*7+0], m7 + vpbroadcastd m7, [base+pw_m16384] + test edgeb, 4 ; HAVE_TOP + jz .no_top + movu m0, [topq+strideq*0] + movu m1, [topq+strideq*1] + mova [px-32*2+0], m0 + mova [px-32*1+0], m1 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd xm0, [topq+strideq*0-4] + movd xm1, [topq+strideq*1-4] + movd [px-32*2-4], xm0 + movd [px-32*1-4], xm1 + jmp .top_done +.no_top: + mova [px-32*2+0], m7 + mova [px-32*1+0], m7 +.top_no_left: + movd [px-32*2-4], xm7 + movd [px-32*1-4], xm7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] + mova [px+32*8+0], m0 + mova [px+32*9+0], m1 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] + movd [px+32*8-4], xm0 + movd [px+32*9-4], xm1 + jmp .bottom_done +.no_bottom: + mova [px+32*8+0], m7 + mova [px+32*9+0], m7 +.bottom_no_left: + movd [px+32*8-4], xm7 + movd [px+32*9-4], xm7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movd xm0, [leftq+4*0] + movd xm1, [leftq+4*1] + movd xm2, [leftq+4*2] + movd xm3, [leftq+4*3] + movd [px+32*0-4], xm0 + movd [px+32*1-4], xm1 + movd [px+32*2-4], xm2 + movd [px+32*3-4], xm3 + movd xm0, [leftq+4*4] + movd xm1, [leftq+4*5] + movd xm2, [leftq+4*6] + movd xm3, [leftq+4*7] + movd [px+32*4-4], xm0 + movd [px+32*5-4], xm1 + movd [px+32*6-4], xm2 + movd [px+32*7-4], xm3 + jmp .left_done +.no_left: + REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7 +.left_done: + test edgeb, 2 ; HAVE_RIGHT + jnz .padding_done + REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +.padding_done: + CDEF_FILTER 8, 8 + +cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax + lea r6, [dir_shift] + shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc + vpbroadcastd m4, [r6+bdmaxq*4] + lea r6, [strideq*3] + mova xm0, [srcq+strideq*0] + mova xm1, [srcq+strideq*1] + mova xm2, [srcq+strideq*2] + mova xm3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vinserti128 m0, [srcq+r6 ], 1 + vinserti128 m1, [srcq+strideq*2], 1 + vinserti128 m2, [srcq+strideq*1], 1 + vinserti128 m3, [srcq+strideq*0], 1 + REPX {pmulhuw x, m4}, m0, m1, m2, m3 + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main + +%endif ; ARCH_X86_64 -- cgit v1.2.3
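
For reference, here is a minimal scalar sketch in C of the constrain()
primitive and the final rounding step that the kernels above vectorize.
It models the psubusw/pminsw/psignw and psraw/paddw/pmulhrsw sequences;
the function names and the sample values in main() are this sketch's own,
not part of dav1d's API.

#include <stdio.h>
#include <stdlib.h>

/* floor(log2(x)) for x > 0 */
static int ulog2(unsigned x) { return 31 - __builtin_clz(x); }

/* sign(diff) * min(|diff|, max(0, s - (|diff| >> shift))), where
 * shift = damping - floor(log2(s)), clamped at 0. The assembly keeps
 * the precomputed shift in [pri_shift]/[sec_shift] and gets the
 * max(0, ...) for free from psubusw's unsigned saturation. */
static int constrain(int diff, int strength, int damping)
{
    if (!strength) return 0;
    int shift = damping - ulog2(strength);
    if (shift < 0) shift = 0;
    const int adiff = abs(diff);
    int v = strength - (adiff >> shift); /* psubusw */
    if (v < 0) v = 0;
    if (v > adiff) v = adiff;            /* pminsw  */
    return diff < 0 ? -v : v;            /* psignw  */
}

/* Rounding applied to the tap sum before adding it back to the pixel.
 * pmulhrsw against pw_2048 computes (x*2048*2 + 0x8000) >> 16, i.e.
 * (x + 8) >> 4 with rounding, and the preceding psraw/paddw pair folds
 * in the "- (sum < 0)" bias from the AV1 spec. */
static int apply_sum(int px, int sum)
{
    return px + ((8 + sum - (sum < 0)) >> 4);
}

int main(void)
{
    /* hypothetical 10-bit pixel, primary tap weight 4, strength 4,
     * damping 5 */
    const int px = 512, diff = 40;
    printf("%d\n", apply_sum(px, 4 * constrain(diff, 4, 5)));
    return 0;
}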