author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree       f435a8308119effd964b339f76abb83a57c29483 /third_party/dav1d/src/x86/loopfilter_avx512.asm
parent     Initial commit. (diff)
download   firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
           firefox-26a029d407be480d791972afb5975cf62c9360a6.zip

Adding upstream version 124.0.1. (tag: upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/loopfilter_avx512.asm')
 -rw-r--r--  third_party/dav1d/src/x86/loopfilter_avx512.asm | 1529
 1 file changed, 1529 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/loopfilter_avx512.asm b/third_party/dav1d/src/x86/loopfilter_avx512.asm
new file mode 100644
index 0000000000..202a612aac
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_avx512.asm
@@ -0,0 +1,1529 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+
+pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080
+ dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000
+
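+; editorial note: the hmul tables below hold row indices; at runtime they
+; are multiplied by the pixel stride (or l_stride) to build the per-lane
+; address offsets used by the gathers/scatters in the h (vertical-edge)
+; filter paths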
+hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60
+hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
+hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49
+hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
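+; editorial note: the shiftN constants below are bit matrices for
+; gf2p8affineqb; shifting the identity matrix 0x0102040810204080 left by
+; 8*N yields a per-byte logical right shift by N (there is no
+; byte-granularity psrl instruction). shift3 is used with an imm8 of 16
+; plus a psubb by 16, i.e. ((x >> 3) ^ 16) - 16, which sign-extends the
+; 5-bit result of the >>3 in the short filter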
+shift1: dq 0x0204081020408000
+shift3: dq 0x0810204080000000
+shift4: dq 0x1020408000000000
+
+pb_1: times 4 db 1
+pb_2: times 4 db 2
+pb_3: times 4 db 3
+pb_4: times 4 db 4
+pb_16: times 4 db 16
+pb_63: times 4 db 63
+pb_64: times 4 db 64
+pb_128: times 4 db 0x80
+pb_2_1: times 2 db 2, 1
+pb_3_1: times 2 db 3, 1
+pb_7_1: times 2 db 7, 1
+pb_m1_0: times 2 db -1, 0
+pb_m1_1: times 2 db -1, 1
+pb_m1_2: times 2 db -1, 2
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
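+ ; per-byte |a - b|: each unsigned saturating subtract yields the
+ ; difference in one direction and zero in the other, so OR-ing the two
+ ; results gives the absolute difference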
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
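+ ; note: k6 is an all-ones mask; it must be recopied into k1 before every
+ ; scatter, since vpscatterdd consumes (zeroes) the bits of its write mask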
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+ kmovw k1, k6
+ lea t0, [dstq+strideq*4]
+ vpscatterdd [dstq+m19-2]{k1}, m%4
+ kmovw k1, k6
+ lea t1, [dstq+strideq*8]
+ vpscatterdd [t0 +m19-2]{k1}, m%5
+ kmovw k1, k6
+ lea t2, [t0 +strideq*8]
+ vpscatterdd [t1 +m19-2]{k1}, m%2
+ kmovw k1, k6
+ vpscatterdd [t2 +m19-2]{k1}, m%1
+%endmacro
+
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
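+ ; editorial note: this performs four independent 16x16 byte transposes,
+ ; one per 128-bit lane (the punpck* instructions never cross lanes);
+ ; with %1/%2 set, input row 15 is loaded from / output row 0 is stored
+ ; to the stack slot %3, which frees one register for the caller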
+%if %1 == 0
+ SWAP m16, m22
+%endif
+ punpcklbw m22, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m25, m13
+ punpckhbw m25, m13
+%if %1 == 0
+ SWAP m13, m16
+%else
+ mova m13, %3
+%endif
+ SWAP m16, m25
+ punpcklbw m25, m14, m13
+ punpckhbw m13, m14, m13
+ ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+ punpcklwd m14, m22, m26
+ punpckhwd m22, m26
+ punpcklwd m26, m24, m2
+ punpckhwd m24, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m25
+ punpckhwd m11, m25
+ SWAP m25, m16, m11
+ punpcklwd m11, m25, m13
+ punpckhwd m25, m13
+ ; interleaved in m14,22,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
+ punpckldq m13, m14, m2
+ punpckhdq m14, m2
+ punpckldq m2, m22, m3
+ punpckhdq m22, m3
+ punpckldq m3, m26, m5
+ punpckhdq m26, m5
+ punpckldq m5, m24, m4
+ punpckhdq m24, m4
+ punpckldq m4, m6, m10
+ punpckhdq m6, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m25
+ punpckhdq m8, m25
+ SWAP m25, m16, m8
+ punpckldq m8, m7, m25
+ punpckhdq m7, m25
+ ; interleaved in m13,14,2,22,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m25, m13, m4
+ punpckhqdq m13, m4
+ punpcklqdq m4, m14, m6
+ punpckhqdq m14, m6
+ punpcklqdq m6, m2, m8
+ punpckhqdq m2, m8
+ punpcklqdq m8, m22, m7
+ punpckhqdq m22, m7
+ punpcklqdq m7, m3, m10
+ punpckhqdq m3, m10
+ punpcklqdq m10, m26, m9
+ punpckhqdq m26, m9
+ punpcklqdq m9, m5, m11
+ punpckhqdq m5, m11
+ SWAP m11, m16
+%if %2 == 0
+ SWAP m16, m25
+%else
+ mova %3, m25
+%endif
+ punpcklqdq m25, m24, m11
+ punpckhqdq m24, m11
+%if %2 == 0
+ SWAP m11, m16
+%endif
+ ; interleaved m11,13,4,14,6,2,8,22,7,3,10,26,9,5,25,24
+ SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+ SWAP 3, 14, 25, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%define is_h 0
+%if %1 == 4
+ lea t0, [dstq+mstrideq*2]
+ mova m3, [t0 +strideq*0] ; p1
+ mova m4, [t0 +strideq*1] ; p0
+ mova m5, [t0 +strideq*2] ; q0
+ mova m6, [t0 +stride3q ] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+%if %1 == 16
+ lea t0, [dstq+mstrideq*8]
+ mova m16, [t0 +strideq*1]
+ mova m17, [t0 +strideq*2]
+ mova m18, [t0 +stride3q ]
+%endif
+ lea t0, [dstq+mstrideq*4]
+%if %1 != 6
+ mova m25, [t0 +strideq*0]
+%endif
+ mova m13, [t0 +strideq*1]
+ mova m3, [t0 +strideq*2]
+ mova m4, [t0 +stride3q ]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m22, [dstq+stride3q ]
+%endif
+%if %1 == 16
+ lea t0, [dstq+strideq*4]
+ mova m29, [t0 +strideq*0]
+ mova m30, [t0 +strideq*1]
+ mova m31, [t0 +strideq*2]
+%endif
+%endif
+%else ; h
+%define is_h 1
+ ; load lines
+%if %1 == 4
+ vbroadcasti32x4 m0, [hshuf4]
+ kmovw k1, k6
+ lea t0, [dstq+strideq*4]
+ vpgatherdd m3{k1}, [dstq+m19-2]
+ kmovw k1, k6
+ lea t1, [dstq+strideq*8]
+ vpgatherdd m4{k1}, [t0 +m19-2]
+ kmovw k1, k6
+ lea t2, [t0 +strideq*8]
+ vpgatherdd m5{k1}, [t1 +m19-2]
+ kmovw k1, k6
+ vpgatherdd m6{k1}, [t2 +m19-2]
+ pshufb m3, m0
+ pshufb m4, m0
+ pshufb m5, m0
+ pshufb m6, m0
+ punpckldq m7, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpcklqdq m6, m7, m4
+ punpckhqdq m7, m4
+ punpcklqdq m4, m3, m5
+ punpckhqdq m3, m5
+ SWAP 3, 6
+ SWAP 5, 4, 7
+ ; 6,7,4,3 -> 3,4,5,6
+%elif %1 == 6 || %1 == 8
+ kmovb k1, k7
+ lea t0, [dstq+strideq*1]
+ vpgatherdq m3{k1}, [dstq+ym21-%1/2]
+ kmovb k1, k7
+ lea t1, [dstq+strideq*2]
+ vpgatherdq m4{k1}, [t0 +ym21-%1/2]
+ kmovb k1, k7
+ lea t2, [dstq+stride3q ]
+ vpgatherdq m5{k1}, [t1 +ym21-%1/2]
+ kmovb k1, k7
+ vextracti32x8 ym0, m21, 1
+ vpgatherdq m6{k1}, [t2 +ym21-%1/2]
+ kmovb k1, k7
+ vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m13{k1}, [t0 +ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m14{k1}, [t1 +ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m15{k1}, [t2 +ym0 -%1/2]
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm12: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ punpcklbw m6, m12, m13
+ punpckhbw m12, m13
+ punpcklbw m13, m14, m15
+ punpckhbw m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m15, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m13
+ punpckhwd m6, m13
+ punpcklwd m13, m12, m14
+ punpckhwd m12, m14
+ ; xm15: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm13: A12-15,B12-15,C12-15,D12-15
+ ; xm12: E12-15,F12-15,G12-15,H12-15
+ punpckldq m14, m15, m5
+ punpckhdq m15, m5
+ punpckldq m5, m7, m6
+ %if %1 != 6
+ punpckhdq m7, m6
+ %endif
+ punpckldq m6, m4, m13
+ punpckhdq m4, m13
+ punpckldq m13, m3, m12
+ %if %1 != 6
+ punpckhdq m12, m3, m12
+ %endif
+ ; xm14: A0-7,B0-7
+ ; xm15: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm13: E8-15,F8-15
+ ; xm12: G8-15,H8-15
+ punpcklqdq m3, m14, m6
+ punpckhqdq m14, m6
+ punpckhqdq m6, m15, m4
+ punpcklqdq m15, m4
+ punpcklqdq m4, m5, m13
+ punpckhqdq m13, m5, m13
+ %if %1 == 8
+ punpcklqdq m5, m7, m12
+ punpckhqdq m25, m7, m12
+ ; xm3: A0-15
+ ; xm14: B0-15
+ ; xm15: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm13: F0-15
+ ; xm5: G0-15
+ ; xm25: H0-15
+ SWAP 25, 3, 15
+ SWAP 13, 14, 5, 4, 6
+ SWAP 15, 22
+ ; 3,14,15,6,4,13,5,25 -> 25,13,3,4,5,6,14,22
+ %else
+ SWAP 13, 3, 14
+ SWAP 6, 4, 15, 5
+ ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+ %endif
+%else ; 16, h
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+ movu xm24, [dstq+strideq*0-8]
+ movu xm26, [dstq+strideq*1-8]
+ movu xm2, [dstq+strideq*2-8]
+ movu xm3, [dstq+stride3q -8]
+ lea t0, [dstq+strideq*4]
+ movu xm4, [t0 +strideq*0-8]
+ movu xm5, [t0 +strideq*1-8]
+ movu xm6, [t0 +strideq*2-8]
+ movu xm7, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ movu xm8, [t0 +strideq*0-8]
+ movu xm9, [t0 +strideq*1-8]
+ movu xm10, [t0 +strideq*2-8]
+ movu xm11, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ movu xm25, [t0 +strideq*0-8]
+ movu xm13, [t0 +strideq*1-8]
+ movu xm14, [t0 +strideq*2-8]
+ movu xm22, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym24, [t0 +strideq*0-8], 1
+ vinserti32x4 ym26, [t0 +strideq*1-8], 1
+ vinserti32x4 ym2, [t0 +strideq*2-8], 1
+ vinserti32x4 ym3, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym4, [t0 +strideq*0-8], 1
+ vinserti32x4 ym5, [t0 +strideq*1-8], 1
+ vinserti32x4 ym6, [t0 +strideq*2-8], 1
+ vinserti32x4 ym7, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym8, [t0 +strideq*0-8], 1
+ vinserti32x4 ym9, [t0 +strideq*1-8], 1
+ vinserti32x4 ym10, [t0 +strideq*2-8], 1
+ vinserti32x4 ym11, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym25, [t0 +strideq*0-8], 1
+ vinserti32x4 ym13, [t0 +strideq*1-8], 1
+ vinserti32x4 ym14, [t0 +strideq*2-8], 1
+ vinserti32x4 ym22, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m24, [t0 +strideq*0-8], 2
+ vinserti32x4 m26, [t0 +strideq*1-8], 2
+ vinserti32x4 m2, [t0 +strideq*2-8], 2
+ vinserti32x4 m3, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m4, [t0 +strideq*0-8], 2
+ vinserti32x4 m5, [t0 +strideq*1-8], 2
+ vinserti32x4 m6, [t0 +strideq*2-8], 2
+ vinserti32x4 m7, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m8, [t0 +strideq*0-8], 2
+ vinserti32x4 m9, [t0 +strideq*1-8], 2
+ vinserti32x4 m10, [t0 +strideq*2-8], 2
+ vinserti32x4 m11, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m25, [t0 +strideq*0-8], 2
+ vinserti32x4 m13, [t0 +strideq*1-8], 2
+ vinserti32x4 m14, [t0 +strideq*2-8], 2
+ vinserti32x4 m22, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m24, [t0 +strideq*0-8], 3
+ vinserti32x4 m26, [t0 +strideq*1-8], 3
+ vinserti32x4 m2, [t0 +strideq*2-8], 3
+ vinserti32x4 m3, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m4, [t0 +strideq*0-8], 3
+ vinserti32x4 m5, [t0 +strideq*1-8], 3
+ vinserti32x4 m6, [t0 +strideq*2-8], 3
+ vinserti32x4 m7, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m8, [t0 +strideq*0-8], 3
+ vinserti32x4 m9, [t0 +strideq*1-8], 3
+ vinserti32x4 m10, [t0 +strideq*2-8], 3
+ vinserti32x4 m11, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m25, [t0 +strideq*0-8], 3
+ vinserti32x4 m13, [t0 +strideq*1-8], 3
+ vinserti32x4 m14, [t0 +strideq*2-8], 3
+ vinserti32x4 m22, [t0 +stride3q -8], 3
+ ;
+ TRANSPOSE_16X16B 0, 1, [rsp+0*64]
+ SWAP m16, m26
+ SWAP m17, m2
+ SWAP m18, m3
+ SWAP m29, m25
+ SWAP m30, m13
+ SWAP m31, m14
+ mova [rsp+4*64], m22
+ ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+ SWAP 25, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 22
+%endif
+%endif
+
+ ; load L/E/I/H
+ vpbroadcastd m15, [pb_1]
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ kmovw k1, k6
+ vpgatherdd m0{k1}, [lq+m20+4]
+ kmovw k1, k6
+ vpgatherdd m1{k1}, [lq+m20+0]
+%endif
+ pxor m2, m2
+ pcmpeqb k1, m0, m2
+ vmovdqu8 m0{k1}, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, pbshuf ; l[x][0]
+ vpcmpub k3, m0, m2, 4 ; neq ; L
+ psrlq m2, m0, [lutq+128]
+ pand m2, [pb_63]{bcstd}
+ vpbroadcastb m1, [lutq+136]
+ pminub m2, m1
+ pmaxub m2, m15 ; I
+ gf2p8affineqb m1, m0, [shift4]{bcstq}, 0 ; H
+ paddd m0, [pb_2]{bcstd}
+ paddb m0, m0
+ paddb m0, m2 ; E
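+ ; thresholds per the AV1 spec, from filter level L and the sharpness
+ ; lut: I = clip(L >> sharp0, 1, sharp1), H = L >> 4, E = 2 * (L + 2) + I;
+ ; the pand with 63 masks off bits dragged in from the neighboring byte
+ ; by the qword-granular psrlq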
+
+ ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
+ ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
+ pmaxub m8, m9
+ vpcmpub k1, m8, m1, 6 ; gt ; hev
+%if %1 != 4
+ %if %1 == 6
+ ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
+ pmaxub m9, m8
+ %else
+ ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
+ pmaxub m9, m8
+ ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
+ pmaxub m9, m10
+ %endif
+ ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
+ pmaxub m9, m10
+ %if %1 != 6
+ ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
+ pmaxub m9, m10
+ %endif
+ vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
+ %if %1 == 6
+ ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
+ %else
+ ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
+ ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m10, m11
+ ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
+ pmaxub m10, m11
+ %endif
+ ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m10, m11
+ %if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ por m11, [maskq+4]{bcstd}
+ %else
+ vpbroadcastd m11, [maskq+4]
+ %endif
+ vptestmd k4, m11, pbmask
+ vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
+ pmaxub m8, m10
+%endif
+ vpcmpub k3{k3}, m8, m2, 2 ; le
+ ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
+ ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
+ paddusb m11, m11
+ gf2p8affineqb m10, m10, [shift1]{bcstq}, 0
+ paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+
+%if %1 == 16
+ ABSSUB m1, m16, m4, m2
+ ABSSUB m2, m17, m4, m10
+ pmaxub m1, m2
+ ABSSUB m2, m18, m4, m10
+ pmaxub m1, m2
+ ABSSUB m2, m29, m5, m10
+ pmaxub m1, m2
+ ABSSUB m2, m30, m5, m10
+ pmaxub m1, m2
+ ABSSUB m2, m31, m5, m10
+ pmaxub m1, m2
+ kandq k2, k2, k3
+ vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
+ vpbroadcastd m2, [maskq+8]
+ vptestmd k5, m2, pbmask
+ vpmovm2d m7, k5
+ vptestmb k4{k4}, m7, m7 ; flat16 & fm
+ por m10, m2, [maskq+4]{bcstd}
+ vptestmd k5, m10, pbmask
+ vpmovm2d m7, k5
+ vptestmb k2{k2}, m7, m7 ; flat8in
+ por m2, m10, [maskq+0]{bcstd}
+ vptestmd k5, m2, pbmask
+ vpmovm2d m7, k5
+ vptestmb k3{k3}, m7, m7
+ kandnq k3, k2, k3 ; fm & !flat8 & !flat16
+ kandnq k2, k4, k2 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ vptestmd k4, m0, pbmask
+ vpmovm2d m7, k4
+ vptestmb k2{k2}, m7, m7
+ kandq k2, k2, k3 ; flat8 & fm
+ por m0, [maskq+0]{bcstd}
+ vptestmd k4, m0, pbmask
+ vpmovm2d m7, k4
+ vptestmb k3{k3}, m7, m7
+ kandnq k3, k2, k3 ; fm & !flat8
+%else
+ %ifidn %2, v
+ vptestmd k4, pbmask, [maskq+0]{bcstd}
+ %else
+ vpbroadcastd m0, [maskq+0]
+ vptestmd k4, m0, pbmask
+ %endif
+ vpmovm2d m7, k4
+ vptestmb k3{k3}, m7, m7 ; fm
+%endif
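+ ; in the above, vptestmd against pb_mask turns bit n of the vmask words
+ ; into a dword lane mask (one 4-pixel unit per dword); vpmovm2d and
+ ; vptestmb then widen it to byte granularity before it is ANDed into
+ ; the filter decision masks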
+
+ ; short filter
+%if %1 >= 8
+ SWAP m23, m15
+%endif
+ vpbroadcastd m15, [pb_3]
+ vpbroadcastd m0, [pb_4]
+ vpbroadcastd m12, [pb_16]
+ vpbroadcastd m1, [pb_64]
+ pxor m3, pb128
+ pxor m6, pb128
+ psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
+ pxor m4, pb128
+ pxor m5, pb128
+ psubsb m11, m5, m4
+ paddsb m10, m11
+ paddsb m10, m11
+ paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
+ paddsb m8, m10, m15
+ paddsb m10, m0
+ gf2p8affineqb m8, m8, [shift3]{bcstq}, 16
+ gf2p8affineqb m10, m10, [shift3]{bcstq}, 16
+ psubb m8, m12 ; f2
+ psubb m10, m12 ; f1
+ paddsb m4, m8
+ psubsb m5, m10
+ pxor m4, pb128
+ pxor m5, pb128
+ ;
+ pxor m10, pb128
+ pxor m8, m8
+ pavgb m8, m10 ; f=(f1+1)>>1
+ psubb m8, m1
+ knotq k1, k1
+ paddsb m3{k1}, m3, m8
+ psubsb m6{k1}, m6, m8
+ pxor m3, pb128
+ pxor m6, pb128
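+ ; the short filter above operates on sign-biased bytes: XORing pixels
+ ; with 0x80 maps 0..255 to -128..127 so psubsb/paddsb provide
+ ; iclip_diff's saturation, and the final XORs map the results back;
+ ; pavgb against zero computes the (f1 + 1) >> 1 of the outer taps on
+ ; the biased value, with the psubb by 64 removing the halved bias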
+
+%if %1 == 16
+ ; flat16 filter
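+ ; the wd=16 outputs are maintained as a running word sum (one low/high
+ ; pair per 32 bytes): each step adds and removes taps via pmaddubsw on
+ ; interleaved byte pairs (a {-1,1} constant subtracts one pixel and
+ ; adds another in a single instruction), and pmulhrsw with 2048
+ ; performs the rounding (sum + 8) >> 4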
+%ifidn %2, v
+ lea t0, [dstq+mstrideq*8]
+%endif
+ SWAP m24, m16, m14
+ SWAP m2, m17, m22
+ SWAP m7, m18
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ vpbroadcastd m1, [pb_7_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m14, m24, m25
+ punpckhbw m22, m24, m25
+ pmaddubsw m10, m14, m1
+ pmaddubsw m11, m22, m1 ; p6*7+p3
+ punpcklbw m8, m2, m7
+ punpckhbw m9, m2, m7
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3
+%ifidn %2, h
+ vpbroadcastd m27, [pw_2048]
+ vpbroadcastd m1, [pb_m1_1]
+ %define pw2048 m27
+ %define pbm1_1 m1
+%endif
+ punpcklbw m8, m13, m3
+ punpckhbw m9, m13, m3
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m8, m4, m5
+ punpckhbw m9, m4, m5
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5
+%else
+ vpblendmb m8{k4}, m2, m8
+ mova [rsp+1*64], m8
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, pbm1_1
+ pmaddubsw m22, pbm1_1
+ paddw m10, m14
+ paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m24, m6
+ punpckhbw m9, m24, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ SWAP m18, m8
+ SWAP m23, m9
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+stride3q]{k4}, m8 ; p4
+%else
+ vpblendmb m8{k4}, m7, m8
+ mova [rsp+2*64], m8
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ SWAP m14, m16
+ punpcklbw m8, m24, m13
+ punpckhbw m9, m24, m13
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m8, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m2, pbm1_1
+ paddw m10, m8
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ SWAP m16, m8
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
+%else
+ vpblendmb m8{k4}, m25, m8
+ mova [rsp+3*64], m8
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ SWAP m22, m17
+ punpcklbw m8, m24, m3
+ punpckhbw m9, m24, m3
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m8, m7, m22
+ punpckhbw m7, m22
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m7, pbm1_1
+ paddw m10, m8
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ SWAP m17, m8
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+ vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+%ifidn %2, v
+ lea t0, [dstq+strideq*4]
+%endif
+ punpcklbw m8, m24, m4
+ punpckhbw m9, m24, m4
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+ punpcklbw m8, m25, m29
+ punpckhbw m9, m25, m29
+ SWAP m26, m29
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ SWAP m29, m8
+ SWAP m0, m9
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+ vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, h
+ SWAP m28, m24
+ punpcklbw m8, m28, m5
+ punpckhbw m24, m28, m5
+%else
+ punpcklbw m8, m24, m5
+ punpckhbw m24, m5
+%endif
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m24, pbm1_1
+ paddw m10, m8
+ paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m24, m13, m30
+ punpckhbw m9, m13, m30
+%ifidn %2, h
+ SWAP m27, m30
+%endif
+ SWAP m13, m15
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m24
+ paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ SWAP m30, m24
+ SWAP m15, m9
+%ifidn %2, h
+ SWAP m9, m24
+ %define pw2048 m9
+%endif
+ pmulhrsw m24, m10, pw2048
+ pmulhrsw m8, m11, pw2048
+ paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ paddw m11, m23
+ packuswb m24, m8
+ punpcklbw m8, m3, m31
+ pmaddubsw m8, pbm1_1
+ paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ SWAP m18, m8
+ pmulhrsw m8, m10, pw2048
+ paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+ SWAP m16, m9
+ %define pw2048 m16
+%endif
+ punpckhbw m9, m3, m31
+ SWAP m3, m12
+ pmaddubsw m9, pbm1_1
+ paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ SWAP m23, m9
+ pmulhrsw m9, m11, pw2048
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+ SWAP m2, m1
+ %define pbm1_1 m2
+%endif
+ vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+ SWAP m24, m31 ; q6
+ packuswb m8, m9
+%ifidn %2, h
+ SWAP m31, m2
+ %define pbm1_1 m31
+%endif
+ vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ punpcklbw m8, m4, m24
+ punpckhbw m2, m4, m24
+ SWAP m4, m1
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m2, pbm1_1
+ paddw m10, m8
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+ pmulhrsw m2, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m2, m9
+ vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ paddw m11, m7
+ punpcklbw m8, m5, m24
+ punpckhbw m9, m5, m24
+ SWAP m5, m12
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m7, m9
+ vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ paddw m11, m0
+ punpcklbw m8, m6, m24
+ punpckhbw m9, m6, m24
+ SWAP 2, 6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+mstrideq]{k4}, m8
+%else
+ SWAP m29, m16
+ %define pw2048 m29
+ vpblendmb m16{k4}, m22, m8
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ paddw m11, m15
+%ifidn %2, h
+ SWAP m15, m8
+%endif
+ punpcklbw m8, m14, m24
+ punpckhbw m9, m14, m24
+ SWAP 14, 7
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
+%else
+ vpblendmb m17{k4}, m26, m8
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, m18 ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ paddw m11, m23
+ punpcklbw m8, m22, m24
+ punpckhbw m9, m22, m24
+ SWAP m30, m24
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, pw2048
+ pmulhrsw m11, pw2048
+ packuswb m10, m11
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5
+%else
+ vmovdqu8 m27{k4}, m10
+%endif
+
+%ifidn %2, v
+ lea t0, [dstq+mstrideq*4]
+%endif
+%endif
+
+%if %1 >= 8
+ ; flat8 filter
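+ ; flat8 outputs are weighted sums over p3..q3 with weights adding up
+ ; to 8, kept as a running word sum like flat16; the +4 rounding bias
+ ; enters via the pb_4 term in m0 and psrlw by 3 finishes each output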
+ vpbroadcastd m9, [pb_3_1]
+ vpbroadcastd m10, [pb_2_1]
+%if %1 == 16
+ vpbroadcastd m23, [pb_1]
+ vpbroadcastd m0, [pb_4]
+%elifidn %2, h
+ vpbroadcastd m31, [pb_m1_1]
+ %define pbm1_1 m31
+%endif
+ punpcklbw m24, m25, m3
+ punpckhbw m26, m25, m3
+ pmaddubsw m2, m24, m9
+ pmaddubsw m7, m26, m9 ; 3 * p3 + p1
+ punpcklbw m8, m13, m4
+ punpckhbw m11, m13, m4
+ pmaddubsw m8, m10
+ pmaddubsw m11, m10
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m8, m5, m0
+ punpckhbw m11, m5, m0
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+%if is_h || %1 == 16
+ vpblendmb m10{k2}, m13, m8 ; p2
+%endif
+%ifidn %2, v
+ %if %1 == 8
+ vmovdqu8 [t0+strideq*1]{k2}, m8
+ %else
+ mova [t0+strideq*1], m10
+ %endif
+%endif
+
+ pmaddubsw m8, m24, pbm1_1
+ pmaddubsw m11, m26, pbm1_1
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m2, m8
+ paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m8{k2}, m3, m8 ; p1
+%ifidn %2, v
+ mova [t0+strideq*2], m8
+%else
+ SWAP m18, m8
+%endif
+
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m8, m4, m14
+ punpckhbw m11, m4, m14
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ paddw m2, m8
+ paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m8{k2}, m4, m8 ; p0
+%ifidn %2, v
+ mova [t0+stride3q], m8
+%else
+ SWAP m29, m8
+%endif
+
+ punpcklbw m24, m5, m22
+ punpckhbw m26, m5, m22
+ pmaddubsw m8, m24, m23
+ pmaddubsw m11, m26, m23
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m4, m25
+ punpckhbw m11, m4, m25
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ psubw m2, m8
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m11{k2}, m5, m8 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11
+%endif
+
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m26, pbm1_1
+ paddw m2, m24
+ paddw m7, m26
+ punpcklbw m8, m13, m6
+ punpckhbw m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m13, pbm1_1
+ paddw m2, m8
+ paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m8, m2, 3
+ psrlw m13, m7, 3
+ packuswb m8, m13
+ vpblendmb m13{k2}, m6, m8 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m13
+%endif
+
+ punpcklbw m24, m3, m6
+ punpckhbw m26, m3, m6
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m24, m14, m22
+ punpckhbw m26, m14, m22
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ paddw m2, m24
+ paddw m7, m26 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+%if is_h || %1 == 16
+ vpblendmb m2{k2}, m14, m2 ; q2
+%endif
+%ifidn %2, v
+ %if %1 == 8
+ vmovdqu8 [dstq+strideq*2]{k2}, m2
+ %else
+ mova [dstq+strideq*2], m2
+ %endif
+%endif
+
+%ifidn %2, h
+ SWAP m24, m18
+ SWAP m26, m29
+%if %1 == 8
+ ; 16x8 transpose
+ punpcklbw m3, m25, m10
+ punpckhbw m25, m10
+ punpcklbw m10, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m11, m13
+ punpckhbw m11, m13
+ punpcklbw m13, m2, m22
+ punpckhbw m2, m22
+ ;
+ punpcklwd m22, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m10, m25, m24
+ punpckhwd m25, m24
+ punpcklwd m24, m26, m13
+ punpckhwd m26, m13
+ punpcklwd m13, m11, m2
+ punpckhwd m11, m2
+ ;
+ punpckldq m2, m22, m24
+ punpckhdq m22, m24
+ punpckldq m24, m3, m26
+ punpckhdq m3, m26
+ punpckldq m26, m10, m13
+ punpckhdq m10, m13
+ punpckldq m13, m25, m11
+ punpckhdq m25, m11
+ ; write 8x32
+ vpbroadcastd ym16, strided
+ pmulld ym16, [hmulD]
+ lea t1, [dstq+strideq*2]
+ lea t2, [dstq+strideq*4]
+ lea t3, [t1 +strideq*4]
+ lea t0, [dstq+strideq*8]
+ kmovb k1, k6
+ kmovb k2, k6
+ kmovb k3, k6
+ kmovb k4, k6
+ vpscatterdq [dstq+ym16-4]{k1}, m2
+ vpscatterdq [t1 +ym16-4]{k2}, m22
+ vpscatterdq [t2 +ym16-4]{k3}, m24
+ vpscatterdq [t3 +ym16-4]{k4}, m3
+ lea t1, [t0+strideq*2]
+ lea t2, [t0+strideq*4]
+ lea t3, [t1+strideq*4]
+ kmovb k1, k6
+ kmovb k2, k6
+ kmovb k3, k6
+ kmovb k4, k6
+ vpscatterdq [t0+ym16-4]{k1}, m26
+ vpscatterdq [t1+ym16-4]{k2}, m10
+ vpscatterdq [t2+ym16-4]{k3}, m13
+ vpscatterdq [t3+ym16-4]{k4}, m25
+%else
+ ; 16x16 transpose and store
+ SWAP 5, 10, 2
+ SWAP 6, 24
+ SWAP 7, 26
+ SWAP 8, 11
+ SWAP 9, 13
+ mova m24, [rsp+0*64]
+ SWAP m26, m28
+ mova m2, [rsp+1*64]
+ mova m3, [rsp+2*64]
+ mova m4, [rsp+3*64]
+ SWAP m11, m16
+ SWAP m25, m17
+ SWAP m13, m27
+ SWAP m14, m30
+ TRANSPOSE_16X16B 1, 0, [rsp+4*64]
+ movu [dstq+strideq*0-8], xm24
+ movu [dstq+strideq*1-8], xm26
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea t0, [dstq+strideq*4]
+ movu [t0+strideq*0-8], xm4
+ movu [t0+strideq*1-8], xm5
+ movu [t0+strideq*2-8], xm6
+ movu [t0+stride3q -8], xm7
+ lea t0, [t0+strideq*4]
+ movu [t0+strideq*0-8], xm8
+ movu [t0+strideq*1-8], xm9
+ movu [t0+strideq*2-8], xm10
+ movu [t0+stride3q -8], xm11
+ lea t0, [t0+strideq*4]
+ movu [t0+strideq*0-8], xm25
+ movu [t0+strideq*1-8], xm13
+ movu [t0+strideq*2-8], xm14
+ movu [t0+stride3q -8], xm22
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym24, 1
+ vextracti128 [t0+strideq*1-8], ym26, 1
+ vextracti128 [t0+strideq*2-8], ym2, 1
+ vextracti128 [t0+stride3q -8], ym3, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym4, 1
+ vextracti128 [t0+strideq*1-8], ym5, 1
+ vextracti128 [t0+strideq*2-8], ym6, 1
+ vextracti128 [t0+stride3q -8], ym7, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym8, 1
+ vextracti128 [t0+strideq*1-8], ym9, 1
+ vextracti128 [t0+strideq*2-8], ym10, 1
+ vextracti128 [t0+stride3q -8], ym11, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym25, 1
+ vextracti128 [t0+strideq*1-8], ym13, 1
+ vextracti128 [t0+strideq*2-8], ym14, 1
+ vextracti128 [t0+stride3q -8], ym22, 1
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m24, 2
+ vextracti32x4 [t0+strideq*1-8], m26, 2
+ vextracti32x4 [t0+strideq*2-8], m2, 2
+ vextracti32x4 [t0+stride3q -8], m3, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m4, 2
+ vextracti32x4 [t0+strideq*1-8], m5, 2
+ vextracti32x4 [t0+strideq*2-8], m6, 2
+ vextracti32x4 [t0+stride3q -8], m7, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m8, 2
+ vextracti32x4 [t0+strideq*1-8], m9, 2
+ vextracti32x4 [t0+strideq*2-8], m10, 2
+ vextracti32x4 [t0+stride3q -8], m11, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m25, 2
+ vextracti32x4 [t0+strideq*1-8], m13, 2
+ vextracti32x4 [t0+strideq*2-8], m14, 2
+ vextracti32x4 [t0+stride3q -8], m22, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m24, 3
+ vextracti32x4 [t0+strideq*1-8], m26, 3
+ vextracti32x4 [t0+strideq*2-8], m2, 3
+ vextracti32x4 [t0+stride3q -8], m3, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m4, 3
+ vextracti32x4 [t0+strideq*1-8], m5, 3
+ vextracti32x4 [t0+strideq*2-8], m6, 3
+ vextracti32x4 [t0+stride3q -8], m7, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m8, 3
+ vextracti32x4 [t0+strideq*1-8], m9, 3
+ vextracti32x4 [t0+strideq*2-8], m10, 3
+ vextracti32x4 [t0+stride3q -8], m11, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m25, 3
+ vextracti32x4 [t0+strideq*1-8], m13, 3
+ vextracti32x4 [t0+strideq*2-8], m14, 3
+ vextracti32x4 [t0+stride3q -8], m22, 3
+%endif
+%endif
+
+%elif %1 == 6
+ ; flat6 filter
+ vpbroadcastd m15, [pb_3_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, m15
+ pmaddubsw m1, m11, m15
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, m12
+ pmaddubsw m12, m10, m12
+%ifidn %2, h
+ vpbroadcastd m15, [pb_m1_1]
+ %define pbm1_1 m15
+%endif
+ paddw m0, m2
+ paddw m1, m12
+ pmulhrsw m2, m0, m16
+ pmulhrsw m12, m1, m16
+ packuswb m2, m12
+ vpblendmb m2{k2}, m3, m2 ; p1
+%ifidn %2, v
+ mova [t0+strideq*2], m2
+%endif
+
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m12, m0, m16
+ pmulhrsw m13, m1, m16
+ packuswb m12, m13
+ vpblendmb m12{k2}, m4, m12 ; p0
+%ifidn %2, v
+ mova [t0+stride3q], m12
+%endif
+
+ vpbroadcastd m9, [pb_m1_2]
+ vpbroadcastd m4, [pb_m1_0]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+ pmaddubsw m14, m8, pbm1_1
+ pmaddubsw m13, m11, pbm1_1
+ paddw m0, m14
+ paddw m1, m13
+ pmulhrsw m14, m0, m16
+ pmulhrsw m13, m1, m16
+ packuswb m14, m13
+ vpblendmb m14{k2}, m5, m14 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m14
+%endif
+
+ pmaddubsw m8, m9
+ pmaddubsw m11, m9
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, m4
+ pmaddubsw m10, m4
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, m16
+ pmulhrsw m1, m16
+ packuswb m0, m1
+ vpblendmb m0{k2}, m6, m0 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m0
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else ; %1 == 4
+%ifidn %2, v
+ mova [t0+strideq*0], m3 ; p1
+ mova [t0+strideq*1], m4 ; p0
+ mova [t0+strideq*2], m5 ; q0
+ mova [t0+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
+%define k7 k6
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride
+ DECLARE_REG_TMP 9
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
+ vpbroadcastd m28, [pb_m1_1]
+ vpbroadcastd m27, [pw_2048]
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
+ %define pbm1_1 m28
+ %define pw2048 m27
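+; each loop iteration filters a 64-pixel-wide strip (16 four-pixel units,
+; one vmask bit each); the widest non-zero vmask word selects the FILTER
+; size for the strip, and the k masks inside FILTER narrow it per unit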
+
+.loop:
+ cmp word [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .v4
+
+.end:
+ add lq, 64
+ add dstq, 64
+ add maskq, 2
+ sub wd, 16
+ jg .loop
+ RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11, 12
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ kxnorw k6, k6, k6
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask [pb_mask]
+ %define pb128 [pb_128]{bcstd}
+ shl l_strideq, 1
+
+.loop:
+ cmp word [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.h4:
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride
+ DECLARE_REG_TMP 9
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
+ vpbroadcastd m17, [pb_m1_1]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
+ %define pbm1_1 m17
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4
+
+.end:
+ add lq, 64
+ add dstq, 64
+ add maskq, 2
+ sub wd, 16
+ jg .loop
+ RET
+
+%undef k7
+cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11
+ mov r7d, 0xffff
+ movzx r8d, r7b
+ cmp hd, 9
+ cmovb r7d, r8d
+ kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff
+ shl l_strideq, 2
+ sub lq, 4
+ kshiftrw k7, k6, 4 ; h > 8 ? 0xff : 0xf0
+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
+ mova m18, [pb_mask]
+ vpbroadcastd m17, [pb_128]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask m18
+ %define pb128 m17
+ add l_strideq, l_strideq
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64