Diffstat (limited to 'third_party/dav1d/src/x86/cdef16_avx2.asm')
-rw-r--r--  third_party/dav1d/src/x86/cdef16_avx2.asm  877
1 file changed, 877 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm
new file mode 100644
index 0000000000..4c8d3bca43
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx2.asm
@@ -0,0 +1,877 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+%macro DIR_TABLE 1 ; stride
+ db 1 * %1 + 0, 2 * %1 + 0
+ db 1 * %1 + 0, 2 * %1 - 2
+ db -1 * %1 + 2, -2 * %1 + 4
+ db 0 * %1 + 2, -1 * %1 + 4
+ db 0 * %1 + 2, 0 * %1 + 4
+ db 0 * %1 + 2, 1 * %1 + 4
+ db 1 * %1 + 2, 2 * %1 + 4
+ db 1 * %1 + 0, 2 * %1 + 2
+ db 1 * %1 + 0, 2 * %1 + 0
+ db 1 * %1 + 0, 2 * %1 - 2
+ db -1 * %1 + 2, -2 * %1 + 4
+ db 0 * %1 + 2, -1 * %1 + 4
+%endmacro
+
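+; Tap offset table (cf. the cdef_directions table in the C reference):
+; 12 entries of {off_k0, off_k1} byte offsets into the padded px buffer,
+; where %1 is the px row stride in bytes. dirq is pointed at entry `dir`,
+; so the primary taps are read at dirq+4/+5 (direction dir) and the
+; secondary taps at dirq+8/+9 and dirq+0/+1 (directions dir+2 and dir-2)
+; without any index masking.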
+dir_table4: DIR_TABLE 16
+dir_table8: DIR_TABLE 32
+pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3
+
+dir_shift: times 2 dw 0x4000
+ times 2 dw 0x1000
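+; pmulhuw multipliers used by cdef_dir_16bpc: 0x4000 (1 << 14) shifts
+; 10bpc samples right by 2 and 0x1000 (1 << 12) shifts 12bpc samples
+; right by 4, bringing both into 8-bit range so the 8bpc direction
+; search can be reused unchanged.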
+
+pw_2048: times 2 dw 2048
+pw_m16384: times 2 dw -16384
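+; pw_2048 is the pmulhrsw multiplier for the final rounding step:
+; pmulhrsw(x, 2048) = (x*2048*2 + 0x8000) >> 16 = (x + 8) >> 4.
+; pw_m16384 is the sentinel for missing edge pixels: -16384 never wins
+; a signed max (pmaxsw), and read as unsigned (0xc000) it never wins
+; pminuw, so padding cannot influence the clamp range in .pri_sec.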
+
+cextern cdef_dir_8bpc_avx2.main
+
+SECTION .text
+
+%macro CDEF_FILTER 2 ; w, h
+ DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ mov dird, r7m
+ vpbroadcastd m8, [base+pw_2048]
+ lea dirq, [base+dir_table%1+dirq*2]
+ test prid, prid
+ jz .sec_only
+%if WIN64
+ vpbroadcastw m6, prim
+ movaps [rsp+16*0], xmm9
+ movaps [rsp+16*1], xmm10
+%else
+ movd xm6, prid
+ vpbroadcastw m6, xm6
+%endif
+ lzcnt pridmpd, prid
+ rorx tmpd, prid, 2
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, tmpd ; pri >>= 2
+ mov tmpd, r8m ; damping
+ and prid, 4
+ sub tmpd, 31
+ vpbroadcastd m9, [base+pri_taps+priq+8*0]
+ vpbroadcastd m10, [base+pri_taps+priq+8*1]
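+ ; The sequence above assembles pri_shift = max(0, damping - ulog2(pri)):
+ ; lzcnt yields 31 - ulog2(pri) and tmpd holds damping - 31 (a later
+ ; cmovs clamps negative sums to zero). The tap weights are selected by
+ ; (pri >> (bpc - 8)) & 1, which after the conditional >>2 for 12bpc
+ ; reduces to `and prid, 4`: offset 0/8 picks taps {4,2}, offset 4/12
+ ; picks {3,3} from pri_taps.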
+ test secd, secd
+ jz .pri_only
+%if WIN64
+ movaps r8m, xmm13
+ vpbroadcastw m13, secm
+ movaps r4m, xmm11
+ movaps r6m, xmm12
+%else
+ movd xm0, secd
+ vpbroadcastw m13, xm0
+%endif
+ lzcnt secd, secd
+ xor prid, prid
+ add pridmpd, tmpd
+ cmovs pridmpd, prid
+ add secd, tmpd
+ lea tmpq, [px]
+ mov [pri_shift], pridmpq
+ mov [sec_shift], secq
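+ ; The shifts are stored as qwords on the stack; the helpers below run
+ ; under a `call`, so they read them back at [pri_shift+gprsize] and
+ ; [sec_shift+gprsize] to compensate for the pushed return address.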
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+ movaps xmm11, r4m
+ movaps xmm12, r6m
+ movaps xmm13, r8m
+%endif
+ jmp .pri_end
+.pri_only:
+ add pridmpd, tmpd
+ cmovs pridmpd, secd
+ lea tmpq, [px]
+ mov [pri_shift], pridmpq
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.pri_end:
+%if WIN64
+ movaps xmm9, [rsp+16*0]
+ movaps xmm10, [rsp+16*1]
+%endif
+.end:
+ RET
+.sec_only:
+ mov tmpd, r8m ; damping
+%if WIN64
+ vpbroadcastw m6, secm
+%else
+ movd xm6, secd
+ vpbroadcastw m6, xm6
+%endif
+ tzcnt secd, secd
+ sub tmpd, secd
+ mov [sec_shift], tmpq
+ lea tmpq, [px]
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+ jmp .end
+%if %1 == %2
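+; The filter kernels are only assembled for the square sizes; non-square
+; sizes reuse them via the mangle() calls above (4x8 calls the 4x4
+; kernels).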
+ALIGN function_align
+.pri:
+ movsx offq, byte [dirq+4] ; off_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+5] ; off_k1
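+ ; Branchless constrain(), as in the C reference:
+ ;   constrain(d) = apply_sign(min(|d|, max(0, pri - (|d| >> pri_shift))), d)
+ ; pabsw/psrlw/psubusw form the threshold term (the unsigned saturation
+ ; of psubusw provides the max(0, ...) for free), pminsw takes the
+ ; minimum, and psignw reapplies the sign of the original difference.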
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m0, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; adiff_k1p0
+ paddw m0, m2 ; constrain(diff_k0)
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m0, m9 ; pri_tap_k0
+ pmullw m7, m10 ; pri_tap_k1
+ paddw m0, m7 ; sum
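+ ; Final rounding as in the C code: y = px + ((8 + sum - (sum < 0)) >> 4).
+ ; psraw/paddw subtract 1 from negative sums, then pmulhrsw with 2048
+ ; implements the (x + 8) >> 4.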
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ paddw m0, m1
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+ALIGN function_align
+.sec:
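+ ; Secondary-only filter: the same constrain() dataflow as .pri, but
+ ; with four taps per k (from directions dir+2 and dir-2) and the fixed
+ ; secondary tap weights 2 (k=0, applied via `paddw m0, m0` below)
+ ; and 1 (k=1).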
+ movsx offq, byte [dirq+8] ; off1_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1]
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+0] ; off2_k0
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+9] ; off1_k1
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m4 ; adiff_k0s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+1] ; off2_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ paddw m0, m7
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k1s3)
+ paddw m0, m4 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ paddw m0, m1
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+ALIGN function_align
+.pri_sec:
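+ ; Combined primary+secondary filter. Besides the two constrain()
+ ; passes, this variant tracks the signed maximum (m11) and unsigned
+ ; minimum (m12) of all sampled taps so the filtered result can be
+ ; clamped to the range of the pixels actually read; the -16384 padding
+ ; loses both comparisons and is thereby ignored.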
+ movsx offq, byte [dirq+8] ; off2_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1]
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+0] ; off3_k0
+ pmaxsw m11, m2, m3
+ pminuw m12, m2, m3
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m13, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m13, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+9] ; off2_k1
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ paddw m0, m2
+ pabsw m3, m4 ; adiff_k0s2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m13, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m13, m3
+ pminsw m4, m2
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+1] ; off3_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ pmaxsw m11, m2
+ pminuw m12, m2
+ pmaxsw m11, m3
+ pminuw m12, m3
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m13, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m13, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+4] ; off1_k0
+ paddw m0, m7
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m13, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m13, m3
+ pminsw m4, m2
+ paddw m0, m7
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+5] ; off1_k1
+ psignw m4, m5 ; constrain(diff_k1s3)
+ pmaxsw m11, m2
+ pminuw m12, m2
+ pmaxsw m11, m3
+ pminuw m12, m3
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ paddw m0, m4
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m7, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ psignw m2, m3 ; constrain(diff_k0p1)
+ paddw m7, m2 ; constrain(diff_k0)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ pabsw m3, m4 ; adiff_k1p0
+ pmullw m7, m9 ; pri_tap_k0
+ paddw m0, m7
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m7, m10 ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ pmaxsw m11, m1
+ pminuw m12, m1
+ paddw m0, m1
+ pminsw m0, m11
+ pmaxsw m0, m12
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+%endif
+%endmacro
+
+INIT_YMM avx2
+cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
+ pri, sec, edge
+%if WIN64
+ %define px rsp+16*6
+ %define offq r8
+ %define pri_shift rsp+16*2
+ %define sec_shift rsp+16*3
+%else
+ %define px rsp+16*4
+ %define offq r4
+ %define pri_shift rsp+16*0
+ %define sec_shift rsp+16*1
+%endif
+ %define base r8-dir_table4
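+; px is the intermediate buffer: 16-byte rows holding two rows of top
+; padding, the block itself, and two rows of bottom padding, with two
+; pixels of left padding at offset -4 and two of right padding at
+; offset +8 per row. Unavailable neighbours are overwritten with the
+; pw_m16384 sentinel according to the edge flags below.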
+ mov edged, r9m
+ lea r8, [dir_table4]
+ movu xm0, [dstq+strideq*0]
+ movu xm1, [dstq+strideq*1]
+ lea r9, [strideq*3]
+ movu xm2, [dstq+strideq*2]
+ movu xm3, [dstq+r9 ]
+ vpbroadcastd m7, [base+pw_m16384]
+ mova [px+16*0+0], xm0
+ mova [px+16*1+0], xm1
+ mova [px+16*2+0], xm2
+ mova [px+16*3+0], xm3
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu xm0, [topq+strideq*0]
+ movu xm1, [topq+strideq*1]
+ mova [px-16*2+0], xm0
+ mova [px-16*1+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-16*2-4], xm0
+ movd [px-16*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-16*2+0], m7
+.top_no_left:
+ movd [px-16*2-4], xm7
+ movd [px-16*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu xm0, [botq+strideq*0]
+ movu xm1, [botq+strideq*1]
+ mova [px+16*4+0], xm0
+ mova [px+16*5+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+16*4-4], xm0
+ movd [px+16*5-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+16*4+0], m7
+.bottom_no_left:
+ movd [px+16*4-4], xm7
+ movd [px+16*5-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+16*0-4], xm0
+ movd [px+16*1-4], xm1
+ movd [px+16*2-4], xm2
+ movd [px+16*3-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
+.padding_done:
+ CDEF_FILTER 4, 4
+
+cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+ mov edged, r9m
+ movu xm0, [dstq+strideq*0]
+ movu xm1, [dstq+strideq*1]
+ lea r9, [strideq*3]
+ movu xm2, [dstq+strideq*2]
+ movu xm3, [dstq+r9 ]
+ lea r6, [dstq+strideq*4]
+ movu xm4, [r6 +strideq*0]
+ movu xm5, [r6 +strideq*1]
+ movu xm6, [r6 +strideq*2]
+ movu xm7, [r6 +r9 ]
+ lea r8, [dir_table4]
+ mova [px+16*0+0], xm0
+ mova [px+16*1+0], xm1
+ mova [px+16*2+0], xm2
+ mova [px+16*3+0], xm3
+ mova [px+16*4+0], xm4
+ mova [px+16*5+0], xm5
+ mova [px+16*6+0], xm6
+ mova [px+16*7+0], xm7
+ vpbroadcastd m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu xm0, [topq+strideq*0]
+ movu xm1, [topq+strideq*1]
+ mova [px-16*2+0], xm0
+ mova [px-16*1+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-16*2-4], xm0
+ movd [px-16*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-16*2+0], m7
+.top_no_left:
+ movd [px-16*2-4], xm7
+ movd [px-16*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu xm0, [botq+strideq*0]
+ movu xm1, [botq+strideq*1]
+ mova [px+16*8+0], xm0
+ mova [px+16*9+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+16*8-4], xm0
+ movd [px+16*9-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+16*8+0], m7
+.bottom_no_left:
+ movd [px+16*8-4], xm7
+ movd [px+16*9-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+16*0-4], xm0
+ movd [px+16*1-4], xm1
+ movd [px+16*2-4], xm2
+ movd [px+16*3-4], xm3
+ movd xm0, [leftq+4*4]
+ movd xm1, [leftq+4*5]
+ movd xm2, [leftq+4*6]
+ movd xm3, [leftq+4*7]
+ movd [px+16*4-4], xm0
+ movd [px+16*5-4], xm1
+ movd [px+16*6-4], xm2
+ movd [px+16*7-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 4, 8
+
+cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
+ pri, sec, edge
+%if WIN64
+ %define px rsp+32*4
+%else
+ %define px rsp+32*3
+%endif
+ %define base r8-dir_table8
+ mov edged, r9m
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movu m2, [r6 +strideq*0]
+ movu m3, [r6 +strideq*1]
+ lea r6, [r6 +strideq*2]
+ movu m4, [r6 +strideq*0]
+ movu m5, [r6 +strideq*1]
+ lea r6, [r6 +strideq*2]
+ movu m6, [r6 +strideq*0]
+ movu m7, [r6 +strideq*1]
+ lea r8, [dir_table8]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ mova [px+32*4+0], m4
+ mova [px+32*5+0], m5
+ mova [px+32*6+0], m6
+ mova [px+32*7+0], m7
+ vpbroadcastd m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-32*2-4], xm0
+ movd [px-32*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], xm7
+ movd [px-32*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*8+0], m0
+ mova [px+32*9+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+32*8-4], xm0
+ movd [px+32*9-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+0], m7
+ mova [px+32*9+0], m7
+.bottom_no_left:
+ movd [px+32*8-4], xm7
+ movd [px+32*9-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+32*0-4], xm0
+ movd [px+32*1-4], xm1
+ movd [px+32*2-4], xm2
+ movd [px+32*3-4], xm3
+ movd xm0, [leftq+4*4]
+ movd xm1, [leftq+4*5]
+ movd xm2, [leftq+4*6]
+ movd xm3, [leftq+4*7]
+ movd [px+32*4-4], xm0
+ movd [px+32*5-4], xm1
+ movd [px+32*6-4], xm2
+ movd [px+32*7-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 8, 8
+
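+; High-bitdepth cdef_dir scales the input into 8-bit range with pmulhuw
+; (see dir_shift) and tail-calls the 8bpc implementation. Rows are
+; paired (y, 7-y) within each register, matching the layout expected by
+; the shared 8bpc .main entry point.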
+cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
+ lea r6, [dir_shift]
+ shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
+ vpbroadcastd m4, [r6+bdmaxq*4]
+ lea r6, [strideq*3]
+ mova xm0, [srcq+strideq*0]
+ mova xm1, [srcq+strideq*1]
+ mova xm2, [srcq+strideq*2]
+ mova xm3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, [srcq+r6 ], 1
+ vinserti128 m1, [srcq+strideq*2], 1
+ vinserti128 m2, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*0], 1
+ REPX {pmulhuw x, m4}, m0, m1, m2, m3
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+
+%endif ; ARCH_X86_64