Diffstat
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_avx512.asm | 1534
1 file changed, 1534 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/loopfilter_avx512.asm b/third_party/dav1d/src/x86/loopfilter_avx512.asm
new file mode 100644
index 0000000000..0218b624d3
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_avx512.asm
@@ -0,0 +1,1534 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+
+pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080
+         dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000
+
+hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60
+hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
+hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49
+hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+pb_1: times 4 db 1
+pb_2: times 4 db 2
+pb_3: times 4 db 3
+pb_4: times 4 db 4
+pb_16: times 4 db 16
+pb_63: times 4 db 63
+pb_64: times 4 db 64
+pb_128: times 4 db 0x80
+pb_240: times 4 db 0xf0
+pb_248: times 4 db 0xf8
+pb_254: times 4 db 0xfe
+pb_2_1: times 2 db 2, 1
+pb_3_1: times 2 db 3, 1
+pb_7_1: times 2 db 7, 1
+pb_m1_0: times 2 db -1, 0
+pb_m1_1: times 2 db -1, 1
+pb_m1_2: times 2 db -1, 2
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+    psubusb %1, %2, %3
+    psubusb %4, %3, %2
+    por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
+    punpcklbw m%5, m%1, m%2
+    punpckhbw m%1, m%2
+    punpcklbw m%2, m%3, m%4
+    punpckhbw m%3, m%4
+    punpcklwd m%4, m%5, m%2
+    punpckhwd m%5, m%2
+    punpcklwd m%2, m%1, m%3
+    punpckhwd m%1, m%3
+    kmovw k1, k6
+    lea t0, [dstq+strideq*4]
+    vpscatterdd [dstq+m19-2]{k1}, m%4
+    kmovw k1, k6
+    lea t1, [dstq+strideq*8]
+    vpscatterdd [t0 +m19-2]{k1}, m%5
+    kmovw k1, k6
+    lea t2, [t0 +strideq*8]
+    vpscatterdd [t1 +m19-2]{k1}, m%2
+    kmovw k1, k6
+    vpscatterdd [t2 +m19-2]{k1}, m%1
+%endmacro
+
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+    SWAP m16, m22
+%endif
+    punpcklbw m22, m24, m26
+    punpckhbw m24, m26
+    punpcklbw m26, m2, m3
+    punpckhbw m2, m3
+    punpcklbw m3, m4, m5
+    punpckhbw m4, m5
+    punpcklbw m5, m6, m7
+    punpckhbw m6, m7
+    punpcklbw m7, m8, m9
+    punpckhbw m8, m9
+    punpcklbw m9, m10, m11
+    punpckhbw m10, m11
+    punpcklbw m11, m25, m13
+    punpckhbw m25, m13
+%if %1 == 0
+    SWAP m13, m16
+%else
+    mova m13, %3
+%endif
+    SWAP m16, m25
+    punpcklbw m25, m14, m13
+    punpckhbw m13, m14, m13
+    ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+    punpcklwd m14, m22, m26
+    punpckhwd m22, m26
+    punpcklwd m26, m24, m2
+    punpckhwd m24, m2
+    punpcklwd m2, m3, m5
+    punpckhwd m3, m5
+    punpcklwd m5, m4, m6
+    punpckhwd m4, m6
+    punpcklwd m6, m7, m9
+    punpckhwd m7, m9
+    punpcklwd m9, m8, m10
+    punpckhwd m8, m10
+    punpcklwd m10, m11, m25
+    punpckhwd m11, m25
+    SWAP m25, m16, m11
+    punpcklwd m11, m25, m13
+    punpckhwd m25, m13
+    ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
+    punpckldq m13, m14, m2
+    punpckhdq m14, m2
+    punpckldq m2, m22, m3
+    punpckhdq m22, m3
+    punpckldq m3, m26, m5
+    punpckhdq m26, m5
+    punpckldq m5, m24, m4
+    punpckhdq m24, m4
+    punpckldq m4, m6, m10
+    punpckhdq m6, m10
+    punpckldq m10, m9, m11
+    punpckhdq m9, m11
+    punpckldq m11, m8, m25
+    punpckhdq m8, m25
+    SWAP m25, m16, m8
+    punpckldq m8, m7, m25
+    punpckhdq m7, m25
+    ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+    punpcklqdq m25, m13, m4
+    punpckhqdq m13, m4
+    punpcklqdq m4, m14, m6
+    punpckhqdq m14, m6
+    punpcklqdq m6, m2, m8
+    punpckhqdq m2, m8
+    punpcklqdq m8, m22, m7
+    punpckhqdq m22, m7
+    punpcklqdq m7, m3, m10
+    punpckhqdq m3, m10
+    punpcklqdq m10, m26, m9
+    punpckhqdq m26, m9
+    punpcklqdq m9, m5, m11
+    punpckhqdq m5, m11
+    SWAP m11, m16
+%if %2 == 0
+    SWAP m16, m25
+%else
+    mova %3, m25
+%endif
+    punpcklqdq m25, m24, m11
+    punpckhqdq m24, m11
+%if %2 == 0
+    SWAP m11, m16
+%endif
+    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
+    SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+    SWAP 3, 14, 25, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+    ; load data
+%ifidn %2, v
+%define is_h 0
+%if %1 == 4
+    lea t0, [dstq+mstrideq*2]
+    mova m3, [t0 +strideq*0] ; p1
+    mova m4, [t0 +strideq*1] ; p0
+    mova m5, [t0 +strideq*2] ; q0
+    mova m6, [t0 +stride3q ] ; q1
+%else
+    ; load 6-8 pixels, remainder (for wd=16) will be read inline
+%if %1 == 16
+    lea t0, [dstq+mstrideq*8]
+    mova m16, [t0 +strideq*1]
+    mova m17, [t0 +strideq*2]
+    mova m18, [t0 +stride3q ]
+%endif
+    lea t0, [dstq+mstrideq*4]
+%if %1 != 6
+    mova m25, [t0 +strideq*0]
+%endif
+    mova m13, [t0 +strideq*1]
+    mova m3, [t0 +strideq*2]
+    mova m4, [t0 +stride3q ]
+    mova m5, [dstq+strideq*0]
+    mova m6, [dstq+strideq*1]
+    mova m14, [dstq+strideq*2]
+%if %1 != 6
+    mova m22, [dstq+stride3q ]
+%endif
+%if %1 == 16
+    lea t0, [dstq+strideq*4]
+    mova m29, [t0 +strideq*0]
+    mova m30, [t0 +strideq*1]
+    mova m31, [t0 +strideq*2]
+%endif
+%endif
+%else ; h
+%define is_h 1
+    ; load lines
+%if %1 == 4
+    vbroadcasti32x4 m0, [hshuf4]
+    kmovw k1, k6
+    lea t0, [dstq+strideq*4]
+    vpgatherdd m3{k1}, [dstq+m19-2]
+    kmovw k1, k6
+    lea t1, [dstq+strideq*8]
+    vpgatherdd m4{k1}, [t0 +m19-2]
+    kmovw k1, k6
+    lea t2, [t0 +strideq*8]
+    vpgatherdd m5{k1}, [t1 +m19-2]
+    kmovw k1, k6
+    vpgatherdd m6{k1}, [t2 +m19-2]
+    pshufb m3, m0
+    pshufb m4, m0
+    pshufb m5, m0
+    pshufb m6, m0
+    punpckldq m7, m3, m4
+    punpckhdq m3, m4
+    punpckldq m4, m5, m6
+    punpckhdq m5, m6
+    punpcklqdq m6, m7, m4
+    punpckhqdq m7, m4
+    punpcklqdq m4, m3, m5
+    punpckhqdq m3, m5
+    SWAP 3, 6
+    SWAP 5, 4, 7
+    ; 6,7,4,3 -> 3,4,5,6
+%elif %1 == 6 || %1 == 8
+    kmovb k1, k7
+    lea t0, [dstq+strideq*1]
+    vpgatherdq m3{k1}, [dstq+ym21-%1/2]
+    kmovb k1, k7
+    lea t1, [dstq+strideq*2]
+    vpgatherdq m4{k1}, [t0 +ym21-%1/2]
+    kmovb k1, k7
+    lea t2, [dstq+stride3q ]
+    vpgatherdq m5{k1}, [t1 +ym21-%1/2]
+    kmovb k1, k7
+    vextracti32x8 ym0, m21, 1
+    vpgatherdq m6{k1}, [t2 +ym21-%1/2]
+    kmovb k1, k7
+    vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
+    kmovb k1, k7
+    vpgatherdq m13{k1}, [t0 +ym0 -%1/2]
+    kmovb k1, k7
+    vpgatherdq m14{k1}, [t1 +ym0 -%1/2]
+    kmovb k1, k7
+    vpgatherdq m15{k1}, [t2 +ym0 -%1/2]
+    ; transpose 8x16
+    ; xm3: A-H0,A-H8
+    ; xm4: A-H1,A-H9
+    ; xm5: A-H2,A-H10
+    ; xm6: A-H3,A-H11
+    ; xm12: A-H4,A-H12
+    ; xm13: A-H5,A-H13
+    ; xm14: A-H6,A-H14
+    ; xm15: A-H7,A-H15
+    punpcklbw m7, m3, m4
+    punpckhbw m3, m4
+    punpcklbw m4, m5, m6
+    punpckhbw m5, m6
+    punpcklbw m6, m12, m13
+    punpckhbw m12, m13
+    punpcklbw m13, m14, m15
+    punpckhbw m14, m15
+    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+    punpcklwd m15, m7, m4
+    punpckhwd m7, m4
+    punpcklwd m4, m3, m5
+    punpckhwd m3, m5
+    punpcklwd m5, m6, m13
+    punpckhwd m6, m13
+    punpcklwd m13, m12, m14
+    punpckhwd m12, m14
+    ; xm15: A0-3,B0-3,C0-3,D0-3
+    ; xm7: E0-3,F0-3,G0-3,H0-3
+    ; xm4: A8-11,B8-11,C8-11,D8-11
+    ; xm3: E8-11,F8-11,G8-11,H8-11
+    ; xm5: A4-7,B4-7,C4-7,D4-7
+    ; xm6: E4-7,F4-7,G4-7,H4-7
+    ; xm13: A12-15,B12-15,C12-15,D12-15
+    ; xm12: E12-15,F12-15,G12-15,H12-15
+    punpckldq m14, m15, m5
+    punpckhdq m15, m5
+    punpckldq m5, m7, m6
+    %if %1 != 6
+    punpckhdq m7, m6
+    %endif
+    punpckldq m6, m4, m13
+    punpckhdq m4, m13
+    punpckldq m13, m3, m12
+    %if %1 != 6
+    punpckhdq m12, m3, m12
+    %endif
+    ; xm14: A0-7,B0-7
+    ; xm15: C0-7,D0-7
+    ; xm5: E0-7,F0-7
+    ; xm7: G0-7,H0-7
+    ; xm6: A8-15,B8-15
+    ; xm4: C8-15,D8-15
+    ; xm13: E8-15,F8-15
+    ; xm12: G8-15,H8-15
+    punpcklqdq m3, m14, m6
+    punpckhqdq m14, m6
+    punpckhqdq m6, m15, m4
+    punpcklqdq m15, m4
+    punpcklqdq m4, m5, m13
+    punpckhqdq m13, m5, m13
+    %if %1 == 8
+    punpcklqdq m5, m7, m12
+    punpckhqdq m25, m7, m12
+    ; xm3: A0-15
+    ; xm14: B0-15
+    ; xm15: C0-15
+    ; xm6: D0-15
+    ; xm4: E0-15
+    ; xm13: F0-15
+    ; xm5: G0-15
+    ; xm25: H0-15
+    SWAP 25, 3, 15
+    SWAP 13, 14, 5, 4, 6
+    SWAP 15, 22
+    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
+    %else
+    SWAP 13, 3, 14
+    SWAP 6, 4, 15, 5
+    ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+    %endif
+%else ; 16, h
+    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+    ; remainder at the end for the second transpose
+    movu xm24, [dstq+strideq*0-8]
+    movu xm26, [dstq+strideq*1-8]
+    movu xm2, [dstq+strideq*2-8]
+    movu xm3, [dstq+stride3q -8]
+    lea t0, [dstq+strideq*4]
+    movu xm4, [t0 +strideq*0-8]
+    movu xm5, [t0 +strideq*1-8]
+    movu xm6, [t0 +strideq*2-8]
+    movu xm7, [t0 +stride3q -8]
+    lea t0, [t0 +strideq*4]
+    movu xm8, [t0 +strideq*0-8]
+    movu xm9, [t0 +strideq*1-8]
+    movu xm10, [t0 +strideq*2-8]
+    movu xm11, [t0 +stride3q -8]
+    lea t0, [t0 +strideq*4]
+    movu xm25, [t0 +strideq*0-8]
+    movu xm13, [t0 +strideq*1-8]
+    movu xm14, [t0 +strideq*2-8]
+    movu xm22, [t0 +stride3q -8]
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 ym24, [t0 +strideq*0-8], 1
+    vinserti32x4 ym26, [t0 +strideq*1-8], 1
+    vinserti32x4 ym2, [t0 +strideq*2-8], 1
+    vinserti32x4 ym3, [t0 +stride3q -8], 1
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 ym4, [t0 +strideq*0-8], 1
+    vinserti32x4 ym5, [t0 +strideq*1-8], 1
+    vinserti32x4 ym6, [t0 +strideq*2-8], 1
+    vinserti32x4 ym7, [t0 +stride3q -8], 1
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 ym8, [t0 +strideq*0-8], 1
+    vinserti32x4 ym9, [t0 +strideq*1-8], 1
+    vinserti32x4 ym10, [t0 +strideq*2-8], 1
+    vinserti32x4 ym11, [t0 +stride3q -8], 1
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 ym25, [t0 +strideq*0-8], 1
+    vinserti32x4 ym13, [t0 +strideq*1-8], 1
+    vinserti32x4 ym14, [t0 +strideq*2-8], 1
+    vinserti32x4 ym22, [t0 +stride3q -8], 1
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 m24, [t0 +strideq*0-8], 2
+    vinserti32x4 m26, [t0 +strideq*1-8], 2
+    vinserti32x4 m2, [t0 +strideq*2-8], 2
+    vinserti32x4 m3, [t0 +stride3q -8], 2
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 m4, [t0 +strideq*0-8], 2
+    vinserti32x4 m5, [t0 +strideq*1-8], 2
+    vinserti32x4 m6, [t0 +strideq*2-8], 2
+    vinserti32x4 m7, [t0 +stride3q -8], 2
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 m8, [t0 +strideq*0-8], 2
+    vinserti32x4 m9, [t0 +strideq*1-8], 2
+    vinserti32x4 m10, [t0 +strideq*2-8], 2
+    vinserti32x4 m11, [t0 +stride3q -8], 2
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 m25, [t0 +strideq*0-8], 2
+    vinserti32x4 m13, [t0 +strideq*1-8], 2
+    vinserti32x4 m14, [t0 +strideq*2-8], 2
+    vinserti32x4 m22, [t0 +stride3q -8], 2
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 m24, [t0 +strideq*0-8], 3
+    vinserti32x4 m26, [t0 +strideq*1-8], 3
+    vinserti32x4 m2, [t0 +strideq*2-8], 3
+    vinserti32x4 m3, [t0 +stride3q -8], 3
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 m4, [t0 +strideq*0-8], 3
+    vinserti32x4 m5, [t0 +strideq*1-8], 3
+    vinserti32x4 m6, [t0 +strideq*2-8], 3
+    vinserti32x4 m7, [t0 +stride3q -8], 3
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 m8, [t0 +strideq*0-8], 3
+    vinserti32x4 m9, [t0 +strideq*1-8], 3
+    vinserti32x4 m10, [t0 +strideq*2-8], 3
+    vinserti32x4 m11, [t0 +stride3q -8], 3
+    lea t0, [t0 +strideq*4]
+    vinserti32x4 m25, [t0 +strideq*0-8], 3
+    vinserti32x4 m13, [t0 +strideq*1-8], 3
+    vinserti32x4 m14, [t0 +strideq*2-8], 3
+    vinserti32x4 m22, [t0 +stride3q -8], 3
+    ;
+    TRANSPOSE_16X16B 0, 1, [rsp+0*64]
+    SWAP m16, m26
+    SWAP m17, m2
+    SWAP m18, m3
+    SWAP m29, m25
+    SWAP m30, m13
+    SWAP m31, m14
+    mova [rsp+4*64], m22
+    ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+    SWAP 25, 4, 7
+    SWAP 13, 5, 8
+    SWAP 3, 6, 9
+    SWAP 10, 14
+    SWAP 11, 22
+%endif
+%endif
+
+    ; load L/E/I/H
+    vpbroadcastd m15, [pb_1]
+%ifidn %2, v
+    movu m1, [lq]
+    movu m0, [lq+l_strideq]
+%else
+    kmovw k1, k6
+    vpgatherdd m0{k1}, [lq+m20+4]
+    kmovw k1, k6
+    vpgatherdd m1{k1}, [lq+m20+0]
+%endif
+    pxor m2, m2
+    pcmpeqb k1, m0, m2
+    vmovdqu8 m0{k1}, m1 ; l[x][] ? l[x][] : l[x-stride][]
+    pshufb m0, pbshuf ; l[x][0]
+    vpcmpub k3, m0, m2, 4 ; neq ; L
+    psrlq m2, m0, [lutq+128]
+    pand m2, [pb_63]{bcstd}
+    vpbroadcastb m1, [lutq+136]
+    pminub m2, m1
+    pmaxub m2, m15 ; I
+    pand m1, m0, [pb_240]{bcstd}
+    psrlq m1, 4 ; H
+    paddd m0, [pb_2]{bcstd}
+    paddb m0, m0
+    paddb m0, m2 ; E
+
+    ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
+    ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
+    pmaxub m8, m9
+    vpcmpub k1, m8, m1, 6 ; gt ; hev
+%if %1 != 4
+    %if %1 == 6
+    ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
+    pmaxub m9, m8
+    %else
+    ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
+    pmaxub m9, m8
+    ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
+    pmaxub m9, m10
+    %endif
+    ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
+    pmaxub m9, m10
+    %if %1 != 6
+    ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
+    pmaxub m9, m10
+    %endif
+    vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
+    %if %1 == 6
+    ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
+    %else
+    ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
+    ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
+    pmaxub m10, m11
+    ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
+    pmaxub m10, m11
+    %endif
+    ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
+    pmaxub m10, m11
+    %if %1 == 16
+    vpbroadcastd m11, [maskq+8]
+    por m11, [maskq+4]{bcstd}
+    %else
+    vpbroadcastd m11, [maskq+4]
+    %endif
+    vptestmd k4, m11, pbmask
+    vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
+    pmaxub m8, m10
+%endif
+    vpcmpub k3{k3}, m8, m2, 2 ; le
+    ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
+    ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
+    paddusb m11, m11
+    pand m10, [pb_254]{bcstd}
+    psrlq m10, 1
+    paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+    vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+
+%if %1 == 16
+    ABSSUB m1, m16, m4, m2
+    ABSSUB m2, m17, m4, m10
+    pmaxub m1, m2
+    ABSSUB m2, m18, m4, m10
+    pmaxub m1, m2
+    ABSSUB m2, m29, m5, m10
+    pmaxub m1, m2
+    ABSSUB m2, m30, m5, m10
+    pmaxub m1, m2
+    ABSSUB m2, m31, m5, m10
+    pmaxub m1, m2
+    kandq k2, k2, k3
+    vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
+    vpbroadcastd m2, [maskq+8]
+    vptestmd k5, m2, pbmask
+    vpmovm2d m7, k5
+    vptestmb k4{k4}, m7, m7 ; flat16 & fm
+    por m10, m2, [maskq+4]{bcstd}
+    vptestmd k5, m10, pbmask
+    vpmovm2d m7, k5
+    vptestmb k2{k2}, m7, m7 ; flat8in
+    por m2, m10, [maskq+0]{bcstd}
+    vptestmd k5, m2, pbmask
+    vpmovm2d m7, k5
+    vptestmb k3{k3}, m7, m7
+    kandnq k3, k2, k3 ; fm & !flat8 & !flat16
+    kandnq k2, k4, k2 ; flat8 & !flat16
+%elif %1 != 4
+    vpbroadcastd m0, [maskq+4]
+    vptestmd k4, m0, pbmask
+    vpmovm2d m7, k4
+    vptestmb k2{k2}, m7, m7
+    kandq k2, k2, k3 ; flat8 & fm
+    por m0, [maskq+0]{bcstd}
+    vptestmd k4, m0, pbmask
+    vpmovm2d m7, k4
+    vptestmb k3{k3}, m7, m7
+    kandnq k3, k2, k3 ; fm & !flat8
+%else
+    %ifidn %2, v
+    vptestmd k4, pbmask, [maskq+0]{bcstd}
+    %else
+    vpbroadcastd m0, [maskq+0]
+    vptestmd k4, m0, pbmask
+    %endif
+    vpmovm2d m7, k4
+    vptestmb k3{k3}, m7, m7 ; fm
+%endif
+
+    ; short filter
+%if %1 >= 8
+    SWAP m23, m15
+%endif
+    vpbroadcastd m15, [pb_3]
+    vpbroadcastd m0, [pb_4]
+    vpbroadcastd m12, [pb_16]
+    vpbroadcastd m1, [pb_64]
+    pxor m3, pb128
+    pxor m6, pb128
+    psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
+    pxor m4, pb128
+    pxor m5, pb128
+    psubsb m11, m5, m4
+    paddsb m10, m11
+    paddsb m10, m11
+    paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
+    paddsb m8, m10, m15
+    paddsb m10, m0
+    pand m8, [pb_248]{bcstd}
+    pand m10, [pb_248]{bcstd}
+    psrlq m8, 3
+    psrlq m10, 3
+    pxor m8, m12
+    pxor m10, m12
+    psubb m8, m12 ; f2
+    psubb m10, m12 ; f1
+    paddsb m4, m8
+    psubsb m5, m10
+    pxor m4, pb128
+    pxor m5, pb128
+    ;
+    pxor m10, pb128
+    pxor m8, m8
+    pavgb m8, m10 ; f=(f1+1)>>1
+    psubb m8, m1
+    knotq k1, k1
+    paddsb m3{k1}, m3, m8
+    psubsb m6{k1}, m6, m8
+    pxor m3, pb128
+    pxor m6, pb128
+
+%if %1 == 16
+    ; flat16 filter
+%ifidn %2, v
+    lea t0, [dstq+mstrideq*8]
+%endif
+    SWAP m24, m16, m14
+    SWAP m2, m17, m22
+    SWAP m7, m18
+
+    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+    ; write -6
+    vpbroadcastd m1, [pb_7_1]
+    vpbroadcastd m12, [pb_2]
+    punpcklbw m14, m24, m25
+    punpckhbw m22, m24, m25
+    pmaddubsw m10, m14, m1
+    pmaddubsw m11, m22, m1 ; p6*7+p3
+    punpcklbw m8, m2, m7
+    punpckhbw m9, m2, m7
+    pmaddubsw m8, m12
+    pmaddubsw m9, m12
+    paddw m10, m8
+    paddw m11, m9 ; p6*7+p5*2+p4*2+p3
+%ifidn %2, h
+    vpbroadcastd m27, [pw_2048]
+    vpbroadcastd m1, [pb_m1_1]
+    %define pw2048 m27
+    %define pbm1_1 m1
+%endif
+    punpcklbw m8, m13, m3
+    punpckhbw m9, m13, m3
+    pmaddubsw m8, m23
+    pmaddubsw m9, m23
+    paddw m10, m8
+    paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
+    punpcklbw m8, m4, m5
+    punpckhbw m9, m4, m5
+    pmaddubsw m8, m23
+    pmaddubsw m9, m23
+    paddw m10, m8
+    paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+    pmulhrsw m8, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m8, m9
+%ifidn %2, v
+    vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5
+%else
+    vpblendmb m8{k4}, m2, m8
+    mova [rsp+1*64], m8
+%endif
+
+    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+    ; write -5
+    pmaddubsw m14, pbm1_1
+    pmaddubsw m22, pbm1_1
+    paddw m10, m14
+    paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+    punpcklbw m8, m24, m6
+    punpckhbw m9, m24, m6
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+    SWAP m18, m8
+    SWAP m23, m9
+    pmulhrsw m8, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m8, m9
+%ifidn %2, v
+    vmovdqu8 [t0+stride3q]{k4}, m8 ; p4
+%else
+    vpblendmb m8{k4}, m7, m8
+    mova [rsp+2*64], m8
+%endif
+
+    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+    ; write -4
+    SWAP m14, m16
+    punpcklbw m8, m24, m13
+    punpckhbw m9, m24, m13
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+    punpcklbw m8, m2, m14
+    punpckhbw m2, m14
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m2, pbm1_1
+    paddw m10, m8
+    paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+    SWAP m16, m8
+    pmulhrsw m8, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m8, m9
+%ifidn %2, v
+    vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
+%else
+    vpblendmb m8{k4}, m25, m8
+    mova [rsp+3*64], m8
+%endif
+
+    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+    ; write -3
+    SWAP m22, m17
+    punpcklbw m8, m24, m3
+    punpckhbw m9, m24, m3
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+    punpcklbw m8, m7, m22
+    punpckhbw m7, m22
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m7, pbm1_1
+    paddw m10, m8
+    paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+    SWAP m17, m8
+    pmulhrsw m8, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m8, m9
+    vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+
+    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+    ; write -2
+%ifidn %2, v
+    lea t0, [dstq+strideq*4]
+%endif
+    punpcklbw m8, m24, m4
+    punpckhbw m9, m24, m4
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+    punpcklbw m8, m25, m29
+    punpckhbw m9, m25, m29
+    SWAP m26, m29
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+    SWAP m29, m8
+    SWAP m0, m9
+    pmulhrsw m8, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m8, m9
+    vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+
+    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+    ; write -1
+%ifidn %2, h
+    SWAP m28, m24
+    punpcklbw m8, m28, m5
+    punpckhbw m24, m28, m5
+%else
+    punpcklbw m8, m24, m5
+    punpckhbw m24, m5
+%endif
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m24, pbm1_1
+    paddw m10, m8
+    paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+    punpcklbw m24, m13, m30
+    punpckhbw m9, m13, m30
+%ifidn %2, h
+    SWAP m27, m30
+%endif
+    SWAP m13, m15
+    pmaddubsw m24, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m24
+    paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+    SWAP m30, m24
+    SWAP m15, m9
+%ifidn %2, h
+    SWAP m9, m24
+    %define pw2048 m9
+%endif
+    pmulhrsw m24, m10, pw2048
+    pmulhrsw m8, m11, pw2048
+    paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+    paddw m11, m23
+    packuswb m24, m8
+    punpcklbw m8, m3, m31
+    pmaddubsw m8, pbm1_1
+    paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+    SWAP m18, m8
+    pmulhrsw m8, m10, pw2048
+    paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+    SWAP m16, m9
+    %define pw2048 m16
+%endif
+    punpckhbw m9, m3, m31
+    SWAP m3, m12
+    pmaddubsw m9, pbm1_1
+    paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+    SWAP m23, m9
+    pmulhrsw m9, m11, pw2048
+    paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+    SWAP m2, m1
+    %define pbm1_1 m2
+%endif
+    vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H
+
+    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+    ; write +0
+    SWAP m24, m31 ; q6
+    packuswb m8, m9
+%ifidn %2, h
+    SWAP m31, m2
+    %define pbm1_1 m31
+%endif
+    vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+
+    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+    ; write +1
+    punpcklbw m8, m4, m24
+    punpckhbw m2, m4, m24
+    SWAP m4, m1
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m2, pbm1_1
+    paddw m10, m8
+    paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+    pmulhrsw m2, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m2, m9
+    vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K
+
+    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+    ; write +2
+    paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+    paddw m11, m7
+    punpcklbw m8, m5, m24
+    punpckhbw m9, m5, m24
+    SWAP m5, m12
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+    pmulhrsw m7, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m7, m9
+    vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K
+
+    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+    ; write +3
+    paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+    paddw m11, m0
+    punpcklbw m8, m6, m24
+    punpckhbw m9, m6, m24
+    SWAP 2, 6
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+    pmulhrsw m8, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m8, m9
+%ifidn %2, v
+    vmovdqu8 [t0+mstrideq]{k4}, m8
+%else
+    SWAP m29, m16
+    %define pw2048 m29
+    vpblendmb m16{k4}, m22, m8
+%endif
+
+    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+    ; write +4
+    paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+    paddw m11, m15
+%ifidn %2, h
+    SWAP m15, m8
+%endif
+    punpcklbw m8, m14, m24
+    punpckhbw m9, m14, m24
+    SWAP 14, 7
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+    pmulhrsw m8, m10, pw2048
+    pmulhrsw m9, m11, pw2048
+    packuswb m8, m9
+%ifidn %2, v
+    vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
+%else
+    vpblendmb m17{k4}, m26, m8
+%endif
+
+    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+    ; write +5
+    paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+    paddw m11, m23
+    punpcklbw m8, m22, m24
+    punpckhbw m9, m22, m24
+    SWAP m30, m24
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m9, pbm1_1
+    paddw m10, m8
+    paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+    pmulhrsw m10, pw2048
+    pmulhrsw m11, pw2048
+    packuswb m10, m11
+%ifidn %2, v
+    vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5
+%else
+    vmovdqu8 m27{k4}, m10
+%endif
+
+%ifidn %2, v
+    lea t0, [dstq+mstrideq*4]
+%endif
+%endif
+
+%if %1 >= 8
+    ; flat8 filter
+    vpbroadcastd m9, [pb_3_1]
+    vpbroadcastd m10, [pb_2_1]
+%if %1 == 16
+    vpbroadcastd m23, [pb_1]
+    vpbroadcastd m0, [pb_4]
+%elifidn %2, h
+    vpbroadcastd m31, [pb_m1_1]
+    %define pbm1_1 m31
+%endif
+    punpcklbw m24, m25, m3
+    punpckhbw m26, m25, m3
+    pmaddubsw m2, m24, m9
+    pmaddubsw m7, m26, m9 ; 3 * p3 + p1
+    punpcklbw m8, m13, m4
+    punpckhbw m11, m13, m4
+    pmaddubsw m8, m10
+    pmaddubsw m11, m10
+    paddw m2, m8
+    paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
+    punpcklbw m8, m5, m0
+    punpckhbw m11, m5, m0
+    pmaddubsw m8, m23
+    pmaddubsw m11, m23
+    paddw m2, m8
+    paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+    psrlw m8, m2, 3
+    psrlw m11, m7, 3
+    packuswb m8, m11
+%if is_h || %1 == 16
+    vpblendmb m10{k2}, m13, m8 ; p2
+%endif
+%ifidn %2, v
+    %if %1 == 8
+    vmovdqu8 [t0+strideq*1]{k2}, m8
+    %else
+    mova [t0+strideq*1], m10
+    %endif
+%endif
+
+    pmaddubsw m8, m24, pbm1_1
+    pmaddubsw m11, m26, pbm1_1
+    paddw m2, m8
+    paddw m7, m11
+    punpcklbw m8, m13, m6
+    punpckhbw m11, m13, m6
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m11, pbm1_1
+    paddw m2, m8
+    paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+    psrlw m8, m2, 3
+    psrlw m11, m7, 3
+    packuswb m8, m11
+    vpblendmb m8{k2}, m3, m8 ; p1
+%ifidn %2, v
+    mova [t0+strideq*2], m8
+%else
+    SWAP m18, m8
+%endif
+
+    pmaddubsw m24, m23
+    pmaddubsw m26, m23
+    psubw m2, m24
+    psubw m7, m26
+    punpcklbw m8, m4, m14
+    punpckhbw m11, m4, m14
+    pmaddubsw m8, m23
+    pmaddubsw m11, m23
+    paddw m2, m8
+    paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+    psrlw m8, m2, 3
+    psrlw m11, m7, 3
+    packuswb m8, m11
+    vpblendmb m8{k2}, m4, m8 ; p0
+%ifidn %2, v
+    mova [t0+stride3q], m8
+%else
+    SWAP m29, m8
+%endif
+
+    punpcklbw m24, m5, m22
+    punpckhbw m26, m5, m22
+    pmaddubsw m8, m24, m23
+    pmaddubsw m11, m26, m23
+    paddw m2, m8
+    paddw m7, m11
+    punpcklbw m8, m4, m25
+    punpckhbw m11, m4, m25
+    pmaddubsw m8, m23
+    pmaddubsw m11, m23
+    psubw m2, m8
+    psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+    psrlw m8, m2, 3
+    psrlw m11, m7, 3
+    packuswb m8, m11
+    vpblendmb m11{k2}, m5, m8 ; q0
+%ifidn %2, v
+    mova [dstq+strideq*0], m11
+%endif
+
+    pmaddubsw m24, pbm1_1
+    pmaddubsw m26, pbm1_1
+    paddw m2, m24
+    paddw m7, m26
+    punpcklbw m8, m13, m6
+    punpckhbw m13, m6
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m13, pbm1_1
+    paddw m2, m8
+    paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+    psrlw m8, m2, 3
+    psrlw m13, m7, 3
+    packuswb m8, m13
+    vpblendmb m13{k2}, m6, m8 ; q1
+%ifidn %2, v
+    mova [dstq+strideq*1], m13
+%endif
+
+    punpcklbw m24, m3, m6
+    punpckhbw m26, m3, m6
+    pmaddubsw m24, m23
+    pmaddubsw m26, m23
+    psubw m2, m24
+    psubw m7, m26
+    punpcklbw m24, m14, m22
+    punpckhbw m26, m14, m22
+    pmaddubsw m24, m23
+    pmaddubsw m26, m23
+    paddw m2, m24
+    paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
+    psrlw m2, 3
+    psrlw m7, 3
+    packuswb m2, m7
+%if is_h || %1 == 16
+    vpblendmb m2{k2}, m14, m2 ; q2
+%endif
+%ifidn %2, v
+    %if %1 == 8
+    vmovdqu8 [dstq+strideq*2]{k2}, m2
+    %else
+    mova [dstq+strideq*2], m2
+    %endif
+%endif
+
+%ifidn %2, h
+    SWAP m24, m18
+    SWAP m26, m29
+%if %1 == 8
+    ; 16x8 transpose
+    punpcklbw m3, m25, m10
+    punpckhbw m25, m10
+    punpcklbw m10, m24, m26
+    punpckhbw m24, m26
+    punpcklbw m26, m11, m13
+    punpckhbw m11, m13
+    punpcklbw m13, m2, m22
+    punpckhbw m2, m22
+    ;
+    punpcklwd m22, m3, m10
+    punpckhwd m3, m10
+    punpcklwd m10, m25, m24
+    punpckhwd m25, m24
+    punpcklwd m24, m26, m13
+    punpckhwd m26, m13
+    punpcklwd m13, m11, m2
+    punpckhwd m11, m2
+    ;
+    punpckldq m2, m22, m24
+    punpckhdq m22, m24
+    punpckldq m24, m3, m26
+    punpckhdq m3, m26
+    punpckldq m26, m10, m13
+    punpckhdq m10, m13
+    punpckldq m13, m25, m11
+    punpckhdq m25, m11
+    ; write 8x32
+    vpbroadcastd ym16, strided
+    pmulld ym16, [hmulD]
+    lea t1, [dstq+strideq*2]
+    lea t2, [dstq+strideq*4]
+    lea t3, [t1 +strideq*4]
+    lea t0, [dstq+strideq*8]
+    kmovb k1, k6
+    kmovb k2, k6
+    kmovb k3, k6
+    kmovb k4, k6
+    vpscatterdq [dstq+ym16-4]{k1}, m2
+    vpscatterdq [t1 +ym16-4]{k2}, m22
+    vpscatterdq [t2 +ym16-4]{k3}, m24
+    vpscatterdq [t3 +ym16-4]{k4}, m3
+    lea t1, [t0+strideq*2]
+    lea t2, [t0+strideq*4]
+    lea t3, [t1+strideq*4]
+    kmovb k1, k6
+    kmovb k2, k6
+    kmovb k3, k6
+    kmovb k4, k6
+    vpscatterdq [t0+ym16-4]{k1}, m26
+    vpscatterdq [t1+ym16-4]{k2}, m10
+    vpscatterdq [t2+ym16-4]{k3}, m13
+    vpscatterdq [t3+ym16-4]{k4}, m25
+%else
+    ; 16x16 transpose and store
+    SWAP 5, 10, 2
+    SWAP 6, 24
+    SWAP 7, 26
+    SWAP 8, 11
+    SWAP 9, 13
+    mova m24, [rsp+0*64]
+    SWAP m26, m28
+    mova m2, [rsp+1*64]
+    mova m3, [rsp+2*64]
+    mova m4, [rsp+3*64]
+    SWAP m11, m16
+    SWAP m25, m17
+    SWAP m13, m27
+    SWAP m14, m30
+    TRANSPOSE_16X16B 1, 0, [rsp+4*64]
+    movu [dstq+strideq*0-8], xm24
+    movu [dstq+strideq*1-8], xm26
+    movu [dstq+strideq*2-8], xm2
+    movu [dstq+stride3q -8], xm3
+    lea t0, [dstq+strideq*4]
+    movu [t0+strideq*0-8], xm4
+    movu [t0+strideq*1-8], xm5
+    movu [t0+strideq*2-8], xm6
+    movu [t0+stride3q -8], xm7
+    lea t0, [t0+strideq*4]
+    movu [t0+strideq*0-8], xm8
+    movu [t0+strideq*1-8], xm9
+    movu [t0+strideq*2-8], xm10
+    movu [t0+stride3q -8], xm11
+    lea t0, [t0+strideq*4]
+    movu [t0+strideq*0-8], xm25
+    movu [t0+strideq*1-8], xm13
+    movu [t0+strideq*2-8], xm14
+    movu [t0+stride3q -8], xm22
+    lea t0, [t0+strideq*4]
+    vextracti128 [t0+strideq*0-8], ym24, 1
+    vextracti128 [t0+strideq*1-8], ym26, 1
+    vextracti128 [t0+strideq*2-8], ym2, 1
+    vextracti128 [t0+stride3q -8], ym3, 1
+    lea t0, [t0+strideq*4]
+    vextracti128 [t0+strideq*0-8], ym4, 1
+    vextracti128 [t0+strideq*1-8], ym5, 1
+    vextracti128 [t0+strideq*2-8], ym6, 1
+    vextracti128 [t0+stride3q -8], ym7, 1
+    lea t0, [t0+strideq*4]
+    vextracti128 [t0+strideq*0-8], ym8, 1
+    vextracti128 [t0+strideq*1-8], ym9, 1
+    vextracti128 [t0+strideq*2-8], ym10, 1
+    vextracti128 [t0+stride3q -8], ym11, 1
+    lea t0, [t0+strideq*4]
+    vextracti128 [t0+strideq*0-8], ym25, 1
+    vextracti128 [t0+strideq*1-8], ym13, 1
+    vextracti128 [t0+strideq*2-8], ym14, 1
+    vextracti128 [t0+stride3q -8], ym22, 1
+    lea t0, [t0+strideq*4]
+    vextracti32x4 [t0+strideq*0-8], m24, 2
+    vextracti32x4 [t0+strideq*1-8], m26, 2
+    vextracti32x4 [t0+strideq*2-8], m2, 2
+    vextracti32x4 [t0+stride3q -8], m3, 2
+    lea t0, [t0+strideq*4]
+    vextracti32x4 [t0+strideq*0-8], m4, 2
+    vextracti32x4 [t0+strideq*1-8], m5, 2
+    vextracti32x4 [t0+strideq*2-8], m6, 2
+    vextracti32x4 [t0+stride3q -8], m7, 2
+    lea t0, [t0+strideq*4]
+    vextracti32x4 [t0+strideq*0-8], m8, 2
+    vextracti32x4 [t0+strideq*1-8], m9, 2
+    vextracti32x4 [t0+strideq*2-8], m10, 2
+    vextracti32x4 [t0+stride3q -8], m11, 2
+    lea t0, [t0+strideq*4]
+    vextracti32x4 [t0+strideq*0-8], m25, 2
+    vextracti32x4 [t0+strideq*1-8], m13, 2
+    vextracti32x4 [t0+strideq*2-8], m14, 2
+    vextracti32x4 [t0+stride3q -8], m22, 2
+    lea t0, [t0+strideq*4]
+    vextracti32x4 [t0+strideq*0-8], m24, 3
+    vextracti32x4 [t0+strideq*1-8], m26, 3
+    vextracti32x4 [t0+strideq*2-8], m2, 3
+    vextracti32x4 [t0+stride3q -8], m3, 3
+    lea t0, [t0+strideq*4]
+    vextracti32x4 [t0+strideq*0-8], m4, 3
+    vextracti32x4 [t0+strideq*1-8], m5, 3
+    vextracti32x4 [t0+strideq*2-8], m6, 3
+    vextracti32x4 [t0+stride3q -8], m7, 3
+    lea t0, [t0+strideq*4]
+    vextracti32x4 [t0+strideq*0-8], m8, 3
+    vextracti32x4 [t0+strideq*1-8], m9, 3
+    vextracti32x4 [t0+strideq*2-8], m10, 3
+    vextracti32x4 [t0+stride3q -8], m11, 3
+    lea t0, [t0+strideq*4]
+    vextracti32x4 [t0+strideq*0-8], m25, 3
+    vextracti32x4 [t0+strideq*1-8], m13, 3
+    vextracti32x4 [t0+strideq*2-8], m14, 3
+    vextracti32x4 [t0+stride3q -8], m22, 3
+%endif
+%endif
+
+%elif %1 == 6
+    ; flat6 filter
+    vpbroadcastd m15, [pb_3_1]
+    vpbroadcastd m12, [pb_2]
+    punpcklbw m8, m13, m5
+    punpckhbw m11, m13, m5
+    pmaddubsw m0, m8, m15
+    pmaddubsw m1, m11, m15
+    punpcklbw m7, m4, m3
+    punpckhbw m10, m4, m3
+    pmaddubsw m2, m7, m12
+    pmaddubsw m12, m10, m12
+%ifidn %2, h
+    vpbroadcastd m15, [pb_m1_1]
+    %define pbm1_1 m15
+%endif
+    paddw m0, m2
+    paddw m1, m12
+    pmulhrsw m2, m0, m16
+    pmulhrsw m12, m1, m16
+    packuswb m2, m12
+    vpblendmb m2{k2}, m3, m2 ; p1
+%ifidn %2, v
+    mova [t0+strideq*2], m2
+%endif
+
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m11, pbm1_1
+    paddw m0, m8
+    paddw m1, m11
+    punpcklbw m8, m13, m6
+    punpckhbw m11, m13, m6
+    pmaddubsw m8, pbm1_1
+    pmaddubsw m11, pbm1_1
+    paddw m0, m8
+    paddw m1, m11
+    pmulhrsw m12, m0, m16
+    pmulhrsw m13, m1, m16
+    packuswb m12, m13
+    vpblendmb m12{k2}, m4, m12 ; p0
+%ifidn %2, v
+    mova [t0+stride3q], m12
+%endif
+
+    vpbroadcastd m9, [pb_m1_2]
+    vpbroadcastd m4, [pb_m1_0]
+    paddw m0, m8
+    paddw m1, m11
+    punpcklbw m8, m3, m14
+    punpckhbw m11, m3, m14
+    pmaddubsw m14, m8, pbm1_1
+    pmaddubsw m13, m11, pbm1_1
+    paddw m0, m14
+    paddw m1, m13
+    pmulhrsw m14, m0, m16
+    pmulhrsw m13, m1, m16
+    packuswb m14, m13
+    vpblendmb m14{k2}, m5, m14 ; q0
+%ifidn %2, v
+    mova [dstq+strideq*0], m14
+%endif
+
+    pmaddubsw m8, m9
+    pmaddubsw m11, m9
+    paddw m0, m8
+    paddw m1, m11
+    pmaddubsw m7, m4
+    pmaddubsw m10, m4
+    paddw m0, m7
+    paddw m1, m10
+    pmulhrsw m0, m16
+    pmulhrsw m1, m16
+    packuswb m0, m1
+    vpblendmb m0{k2}, m6, m0 ; q1
+%ifidn %2, v
+    mova [dstq+strideq*1], m0
+%else
+    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else ; %1 == 4
+%ifidn %2, v
+    mova [t0+strideq*0], m3 ; p1
+    mova [t0+strideq*1], m4 ; p0
+    mova [t0+strideq*2], m5 ; q0
+    mova [t0+stride3q ], m6 ; q1
+%else
+    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
+%define k7 k6
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
+                                    lut, w, stride3, mstride
+    DECLARE_REG_TMP 9
+    shl l_strideq, 2
+    sub lq, l_strideq
+    mov mstrideq, strideq
+    neg mstrideq
+    lea stride3q, [strideq*3]
+    mova m21, [pb_4x0_4x4_4x8_4x12]
+    mova m20, [pb_mask]
+    vpbroadcastd m19, [pb_128]
+    vpbroadcastd m28, [pb_m1_1]
+    vpbroadcastd m27, [pw_2048]
+    %define pbshuf m21
+    %define pbmask m20
+    %define pb128 m19
+    %define pbm1_1 m28
+    %define pw2048 m27
+
+.loop:
+    cmp word [maskq+8], 0 ; vmask[2]
+    je .no_flat16
+
+    FILTER 16, v
+    jmp .end
+
+.no_flat16:
+    cmp word [maskq+4], 0 ; vmask[1]
+    je .no_flat
+
+    FILTER 8, v
+    jmp .end
+
+.no_flat:
+    cmp word [maskq+0], 0 ; vmask[0]
+    je .end
+
+    call .v4
+
+.end:
+    add lq, 64
+    add dstq, 64
+    add maskq, 2
+    sub wd, 16
+    jg .loop
+    RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+    FILTER 4, v
+    ret
+
+cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
+                                          lut, h, stride3, stride8
+    DECLARE_REG_TMP 9, 10, 11, 12
+    shl l_strideq, 2
+    sub lq, 4
+    lea stride3q, [strideq*3]
+    lea stride8q, [strideq*8]
+    kxnorw k6, k6, k6
+    vpbroadcastd m19, strided
+    vpbroadcastd m20, l_strided
+    pmulld m21, m19, [hmulA]
+    pmulld m20, [hmulB]
+    pmulld m19, [hmulC]
+    %define pbshuf [pb_4x0_4x4_4x8_4x12]
+    %define pbmask [pb_mask]
+    %define pb128 [pb_128]{bcstd}
+    shl l_strideq, 1
+
+.loop:
+    cmp word [maskq+8], 0 ; vmask[2]
+    je .no_flat16
+
+    FILTER 16, h
+    jmp .end
+
+.no_flat16:
+    cmp word [maskq+4], 0 ; vmask[1]
+    je .no_flat
+
+    FILTER 8, h
+    jmp .end
+
+.no_flat:
+    cmp word [maskq+0], 0 ; vmask[0]
+    je .end
+
+    call .h4
+
+.end:
+    lea lq, [lq+l_strideq*8]
+    lea dstq, [dstq+stride8q*8]
+    add maskq, 2
+    sub hd, 16
+    jg .loop
+    RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.h4:
+    FILTER 4, h
+    ret
+
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
+                                     lut, w, stride3, mstride
+    DECLARE_REG_TMP 9
+    shl l_strideq, 2
+    sub lq, l_strideq
+    mov mstrideq, strideq
+    neg mstrideq
+    lea stride3q, [strideq*3]
+    mova m21, [pb_4x0_4x4_4x8_4x12]
+    mova m20, [pb_mask]
+    vpbroadcastd m19, [pb_128]
+    vpbroadcastd m17, [pb_m1_1]
+    vpbroadcastd m16, [pw_4096]
+    %define pbshuf m21
+    %define pbmask m20
+    %define pb128 m19
+    %define pbm1_1 m17
+
+.loop:
+    cmp word [maskq+4], 0 ; vmask[1]
+    je .no_flat
+
+    FILTER 6, v
+    jmp .end
+
+.no_flat:
+    cmp word [maskq+0], 0 ; vmask[0]
+    je .end
+
+    call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4
+
+.end:
+    add lq, 64
+    add dstq, 64
+    add maskq, 2
+    sub wd, 16
+    jg .loop
+    RET
+
+%undef k7
+cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
+                                     lut, h, stride3, stride8
+    DECLARE_REG_TMP 9, 10, 11
+    mov r7d, 0xffff
+    movzx r8d, r7b
+    cmp hd, 9
+    cmovb r7d, r8d
+    kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff
+    shl l_strideq, 2
+    sub lq, 4
+    kshiftrw k7, k6, 4 ; h > 8 ? 0xff : 0xf0
+    lea stride3q, [strideq*3]
+    lea stride8q, [strideq*8]
+    vpbroadcastd m19, strided
+    vpbroadcastd m20, l_strided
+    pmulld m21, m19, [hmulA]
+    pmulld m20, [hmulB]
+    pmulld m19, [hmulC]
+    mova m18, [pb_mask]
+    vpbroadcastd m17, [pb_128]
+    vpbroadcastd m16, [pw_4096]
+    %define pbshuf [pb_4x0_4x4_4x8_4x12]
+    %define pbmask m18
+    %define pb128 m17
+    add l_strideq, l_strideq
+
+.loop:
+    cmp word [maskq+4], 0 ; vmask[1]
+    je .no_flat
+
+    FILTER 6, h
+    jmp .end
+
+.no_flat:
+    cmp word [maskq+0], 0 ; vmask[0]
+    je .end
+
+    call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
+
+.end:
+    lea lq, [lq+l_strideq*8]
+    lea dstq, [dstq+stride8q*8]
+    add maskq, 2
+    sub hd, 16
+    jg .loop
+    RET
+
+%endif ; ARCH_X86_64