Diffstat (limited to 'third_party/dav1d/src/x86/loopfilter16_avx2.asm')
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_avx2.asm | 1161
1 file changed, 1161 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx2.asm b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
new file mode 100644
index 0000000000..ed83000ac2
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
@@ -0,0 +1,1161 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
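+; pb_mask: one bit per 4-pixel unit, duplicated across the two dwords that
+; unit occupies as 16-bit pixels; ANDed/compared against the vmask words to
+; build the per-unit filter masks
+; pb_4x1_4x5_4x9_4x13: pshufb control that broadcasts the selected filter
+; level word across each 4-pixel unit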
+pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
+pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
+ times 4 db 8, 9
+ times 4 db 0, 1
+ times 4 db 8, 9
+
+pw_1: times 16 dw 1
+pw_2: times 16 dw 2
+pw_3: times 16 dw 3
+pw_4096: times 2 dw 4096
+
+; 10bpc/12bpc:
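+; each pair holds the 10bpc value followed by the 12bpc value; r11 is set to
+; this base plus 4*is_12bpc, so [r11], [r11+8*1] and [r11+8*2] read the
+; matching variant of the threshold scale (4 or 16), clip_max and clip_min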
+pw_4: times 2 dw 4
+ times 2 dw 16
+clip_max: times 2 dw 511
+ times 2 dw 2047
+clip_min: times 2 dw -512
+ times 2 dw -2048
+
+SECTION .text
+
+; in: out:
+; mm%1 a b c d a e i m
+; mm%2 e f g h b f j n
+; mm%3 i j k l -> c g k o
+; mm%4 m n o p d h l p
+%macro TRANSPOSE4X4W 5
+ punpcklwd m%5, m%1, m%2
+ punpckhwd m%1, m%2
+ punpcklwd m%2, m%3, m%4
+ punpckhwd m%3, m%4
+ punpckldq m%4, m%5, m%2
+ punpckhdq m%5, m%2
+ punpckldq m%2, m%1, m%3
+ punpckhdq m%1, m%3
+
+ SWAP %1, %4
+ SWAP %2, %5, %3
+%endmacro
+
+; in: out:
+; xmm%1 a b c d e f g h a i q y 6 E M U
+; xmm%2 i j k l m n o p b j r z 7 F N V
+; xmm%3 q r s t u v w x c k s 0 8 G O W
+; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
+; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
+; xmm%6 E F G H I J K L f n v 3 B J R Z
+; xmm%7 M N O P Q R S T g o w 4 C K S +
+; xmm%8 U V W X Y Z + = h p x 5 D L T =
+%macro TRANSPOSE8X8W 9
+ ; xmm%1 a b c d e f g h a i q y b j r z
+ ; xmm%2 i j k l m n o p c k s 0 d l t 1
+ ; xmm%3 q r s t u v w x -> e m u 2 f n v 3
+ ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
+ TRANSPOSE4X4W %1, %2, %3, %4, %9
+
+ ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V
+ ; xmm%6 E F G H I J K L 8 G O W 9 H P X
+ ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z
+ ; xmm%8 U V W X Y Z + = C K S + D L T =
+ TRANSPOSE4X4W %5, %6, %7, %8, %9
+
+ ; xmm%1 a i q y b j r z a i q y 6 E M U
+ ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V
+ ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W
+ ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X
+ ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y
+ ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z
+ ; xmm%7 A I Q Y B J R Z g o w 4 C K S +
+ ; xmm%8 C K S + D L T = h p x 5 D L T =
+ punpckhqdq m%9, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ punpckhqdq m%7, m%4, m%8
+ punpcklqdq m%4, m%8
+
+ SWAP %8, %7, %4, %5, %3, %2, %9
+%endmacro
+
+; transpose and write m3-6, everything else is scratch
+%macro TRANSPOSE_8x4_AND_WRITE_4x16 0
+ ; transpose 8x4
+ punpcklwd m0, m3, m4
+ punpckhwd m3, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckldq m6, m0, m4
+ punpckhdq m0, m4
+ punpckldq m4, m3, m5
+ punpckhdq m3, m5
+
+ ; write out
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm0
+ movhps [dstq+stride3q -4], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm4
+ movhps [dstq+strideq*1-4], xm4
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm6, m6, 1
+ vextracti128 xm0, m0, 1
+ vextracti128 xm4, m4, 1
+ vextracti128 xm3, m3, 1
+
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm0
+ movhps [dstq+stride3q -4], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm4
+ movhps [dstq+strideq*1-4], xm4
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
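+ ; expects m12 = [pb_mask] and r11 pointing at the bitdepth-scaled constants;
+ ; p1/p0/q0/q1 are kept in m3-m6, with m13/m14/m15 holding p2/q2/q3 for the
+ ; wider filters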
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+ ; we load p3 later
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+%endif
+%else
+ ; load lines
+%if %1 == 4
+ movq xm3, [dstq+strideq*0-4]
+ movq xm4, [dstq+strideq*1-4]
+ movq xm5, [dstq+strideq*2-4]
+ movq xm6, [dstq+stride3q -4]
+ lea tmpq, [dstq+strideq*4]
+ movq xm11, [tmpq+strideq*0-4]
+ movq xm13, [tmpq+strideq*1-4]
+ movq xm14, [tmpq+strideq*2-4]
+ movq xm15, [tmpq+stride3q -4]
+ lea tmpq, [tmpq+strideq*4]
+ ; this overreads by 8 bytes but the buffers are padded
+ ; so that should be ok
+ vinserti128 m3, [tmpq+strideq*0-4], 1
+ vinserti128 m4, [tmpq+strideq*1-4], 1
+ vinserti128 m5, [tmpq+strideq*2-4], 1
+ vinserti128 m6, [tmpq+stride3q -4], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m11, [tmpq+strideq*0-4], 1
+ vinserti128 m13, [tmpq+strideq*1-4], 1
+ vinserti128 m14, [tmpq+strideq*2-4], 1
+ vinserti128 m15, [tmpq+stride3q -4], 1
+
+ ; transpose 4x8
+ ; xm3: A-D0,A-D4
+ ; xm4: A-D1,A-D5
+ ; xm5: A-D2,A-D6
+ ; xm6: A-D3,A-D7
+ punpcklwd m7, m3, m4
+ punpcklwd m3, m11, m13
+ punpcklwd m4, m5, m6
+ punpcklwd m5, m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1
+ ; xm3: A4-5,B4-5,C4-5,D4-5
+ ; xm4: A2-3,B2-3,C2-3,D2-3
+ ; xm5: A6-7,B6-7,C6-7,D6-7
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m8, m3, m5
+ punpckhdq m5, m3, m5
+ ; xm6: A0-3,B0-3
+ ; xm7: C0-3,D0-3
+ ; xm8: A4-7,B4-7
+ ; xm5: C4-7,D4-7
+ punpcklqdq m3, m6, m8
+ punpckhqdq m4, m6, m8
+ punpckhqdq m6, m7, m5
+ punpcklqdq m5, m7, m5
+ ; xm3: A0-7
+ ; xm4: B0-7
+ ; xm5: C0-7
+ ; xm6: D0-7
+%elif %1 == 6 || %1 == 8
+ movu xm3, [dstq+strideq*0-8]
+ movu xm4, [dstq+strideq*1-8]
+ movu xm5, [dstq+strideq*2-8]
+ movu xm6, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu xm11, [tmpq+strideq*0-8]
+ movu xm13, [tmpq+strideq*1-8]
+ movu xm14, [tmpq+strideq*2-8]
+ movu xm15, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m3, [tmpq+strideq*0-8], 1
+ vinserti128 m4, [tmpq+strideq*1-8], 1
+ vinserti128 m5, [tmpq+strideq*2-8], 1
+ vinserti128 m6, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m11, [tmpq+strideq*0-8], 1
+ vinserti128 m13, [tmpq+strideq*1-8], 1
+ vinserti128 m14, [tmpq+strideq*2-8], 1
+ vinserti128 m15, [tmpq+stride3q -8], 1
+
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm11: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklwd m7, m3, m4
+ punpckhwd m3, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m11, m13
+ punpckhwd m11, m13
+ punpcklwd m13, m14, m15
+ punpckhwd m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1
+ ; xm3: E0-1,F0-1,G0-1,H0-1
+ ; xm4: A2-3,B2-3,C2-3,D2-3
+ ; xm5: E2-3,F2-3,G2-3,H2-3
+ ; xm6: A4-5,B4-5,C4-5,D4-5
+ ; xm11: E4-5,F4-5,G4-5,H4-5
+ ; xm13: A6-7,B6-7,C6-7,D6-7
+ ; xm14: E6-7,F6-7,G6-7,H6-7
+ punpckldq m15, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m3, m5
+ punpckhdq m8, m3, m5
+ punpckldq m3, m6, m13
+ punpckhdq m6, m13
+ punpckldq m10, m11, m14
+ punpckhdq m11, m14
+ ; xm15: A0-3,B0-3
+ ; xm7: C0-3,D0-3
+ ; xm9: E0-3,F0-3
+ ; xm8: G0-3,H0-3
+ ; xm3: A4-7,B4-7
+ ; xm6: C4-7,D4-7
+ ; xm10: E4-7,F4-7
+ ; xm11: G4-7,H4-7
+%if %1 != 6
+ punpcklqdq m0, m15, m3
+%endif
+ punpckhqdq m13, m15, m3
+ punpcklqdq m3, m7, m6
+ punpckhqdq m4, m7, m6
+ punpcklqdq m5, m9, m10
+ punpckhqdq m6, m9, m10
+ punpcklqdq m14, m8, m11
+%if %1 != 6
+ punpckhqdq m15, m8, m11
+ mova [rsp+5*32], m0
+%endif
+%else
+ ; We only use 14 pixels but we'll need the remainder at the end for
+ ; the second transpose
+ mova xm0, [dstq+strideq*0-16]
+ mova xm1, [dstq+strideq*1-16]
+ mova xm2, [dstq+strideq*2-16]
+ mova xm3, [dstq+stride3q -16]
+ lea tmpq, [dstq+strideq*4]
+ mova xm4, [tmpq+strideq*0-16]
+ mova xm5, [tmpq+strideq*1-16]
+ mova xm6, [tmpq+strideq*2-16]
+ mova xm7, [tmpq+stride3q -16]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, m0, [tmpq+strideq*0-16], 1
+ vinserti128 m1, m1, [tmpq+strideq*1-16], 1
+ vinserti128 m2, m2, [tmpq+strideq*2-16], 1
+ vinserti128 m3, m3, [tmpq+stride3q -16], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m4, m4, [tmpq+strideq*0-16], 1
+ vinserti128 m5, m5, [tmpq+strideq*1-16], 1
+ vinserti128 m6, m6, [tmpq+strideq*2-16], 1
+ vinserti128 m7, m7, [tmpq+stride3q -16], 1
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+
+ mova [rsp+6*32], m0
+ mova [rsp+7*32], m1
+ mova [rsp+8*32], m2
+ mova [rsp+9*32], m3
+ mova [rsp+5*32], m4
+
+ mova xm0, [dstq+strideq*0]
+ mova xm1, [dstq+strideq*1]
+ mova xm2, [dstq+strideq*2]
+ mova xm3, [dstq+stride3q ]
+ lea tmpq, [dstq+strideq*4]
+ mova xm8, [tmpq+strideq*0]
+ mova xm9, [tmpq+strideq*1]
+ mova xm10, [tmpq+strideq*2]
+ mova xm11, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, m0, [tmpq+strideq*0], 1
+ vinserti128 m1, m1, [tmpq+strideq*1], 1
+ vinserti128 m2, m2, [tmpq+strideq*2], 1
+ vinserti128 m3, m3, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m8, m8, [tmpq+strideq*0], 1
+ vinserti128 m9, m9, [tmpq+strideq*1], 1
+ vinserti128 m10, m10, [tmpq+strideq*2], 1
+ vinserti128 m11, m11, [tmpq+stride3q ], 1
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4
+
+ mova [rsp+10*32], m8
+ mova [rsp+11*32], m9
+ mova [rsp+12*32], m10
+ mova [rsp+13*32], m11
+
+ ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15
+ SWAP 13, 5, 0
+ SWAP 3, 6, 1, 15
+ SWAP 4, 7
+ SWAP 2, 14
+%endif
+%endif
+
+ ; load L/E/I/H
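+ ; L is the per-unit filter level (falling back to the previous edge's level
+ ; where the current one is zero); from it the thresholds are derived as
+ ; I = clip(L >> sharp[0], 1, sharp[1]), E = 2*(L+2)+I, H = L>>4,
+ ; all scaled by 4 (10bpc) or 16 (12bpc) via [r11]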
+%ifidn %2, v
+ pmovzxbw m1, [lq]
+ pmovzxbw m0, [lq+l_strideq]
+ pxor m2, m2
+%else
+ vpbroadcastq m0, [lq] ; l0, l1
+ vpbroadcastq m1, [lq+l_strideq] ; l2, l3
+ vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5
+ vpbroadcastq m10, [lq+l_stride3q] ; l6, l7
+ punpckldq m0, m1 ; l0, l2, l1, l3 [2x]
+ punpckldq m2, m10 ; l4, l6, l5, l7 [2x]
+ vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7
+ pxor m2, m2
+ punpcklbw m1, m0, m2 ; l0, l2, l4, l6
+ punpckhbw m0, m2 ; l1, l3, l5, l7
+%endif
+ pcmpeqw m10, m2, m0
+ pand m1, m10
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
+ pcmpeqw m10, m2, m0 ; !L
+ psrlw m10, 1
+ psrlw m2, m0, [lutq+128]
+ vpbroadcastw m1, [lutq+136]
+ pminuw m2, m1
+ pmaxuw m2, [pw_1] ; I
+ psrlw m1, m0, 4 ; H
+ paddw m0, [pw_2]
+ vpbroadcastd m8, [r11]
+ paddw m0, m0
+ paddw m0, m2 ; E
+ REPX {pmullw x, m8}, m0, m1, m2
+
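+ ; build the decision masks: m7 = hev (|p1-p0| or |q1-q0| > H), m9 ends up as
+ ; !flat8in, and m8 accumulates the !fm condition (edge variance > I, or
+ ; 2*|p0-q0| + |p1-q1|/2 > E)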
+ psubw m8, m3, m4 ; p1-p0
+ psubw m9, m5, m6 ; q1-q0
+ REPX {pabsw x, x}, m8, m9
+ pmaxuw m8, m10
+ pmaxuw m8, m9
+ pcmpgtw m7, m8, m1 ; hev
+%if %1 != 4
+ psubw m9, m13, m4 ; p2-p0
+ pabsw m9, m9
+ pmaxuw m9, m8
+%if %1 != 6
+%ifidn %2, v
+ mova m11, [tmpq+strideq*0] ; p3
+%else
+ mova m11, [rsp+5*32] ; p3
+%endif
+ psubw m10, m11, m4 ; p3-p0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%endif
+ psubw m10, m5, m14 ; q2-q0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%if %1 != 6
+ psubw m10, m5, m15 ; q3-q0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%endif
+ vpbroadcastd m10, [r11]
+ pcmpgtw m9, m10 ; !flat8in
+
+ psubw m10, m13, m3 ; p2-p1
+ pabsw m10, m10
+%if %1 != 6
+ psubw m11, m13 ; p3-p2
+ pabsw m11, m11
+ pmaxuw m10, m11
+ psubw m11, m14, m15 ; q3-q2
+ pabsw m11, m11
+ pmaxuw m10, m11
+%endif
+ psubw m11, m14, m6 ; q2-q1
+ pabsw m11, m11
+ pmaxuw m10, m11
+
+%if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ vpbroadcastd m1, [maskq+4]
+ por m11, m1
+ pand m11, m12
+ pcmpeqd m11, m12
+ pand m10, m11
+%else
+ vpbroadcastd m11, [maskq+4]
+ pand m11, m12
+ pcmpeqd m11, m12
+ pand m10, m11 ; only apply fm-wide to wd>4 blocks
+%endif
+ pmaxuw m8, m10
+%endif
+ pcmpgtw m8, m2
+
+ psubw m10, m3, m6 ; p1-q1
+ psubw m11, m4, m5 ; p0-q0
+ REPX {pabsw x, x}, m10, m11
+ paddw m11, m11
+ psrlw m10, 1
+ paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m8, m10
+
+%if %1 == 16
+
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+ mova m1, [tmpq+strideq*2]
+ mova m2, [tmpq+stride3q]
+%else
+ mova m0, [rsp+7*32]
+ mova m1, [rsp+8*32]
+ mova m2, [rsp+9*32]
+%endif
+ REPX {psubw x, m4}, m0, m1, m2
+ REPX {pabsw x, x}, m0, m1, m2
+ pmaxuw m1, m0
+ pmaxuw m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+ mova m2, [tmpq+strideq*1]
+ mova m10, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+10*32]
+ mova m2, [rsp+11*32]
+ mova m10, [rsp+12*32]
+%endif
+ REPX {psubw x, m5}, m0, m2, m10
+ REPX {pabsw x, x}, m0, m2, m10
+ pmaxuw m0, m2
+ pmaxuw m1, m10
+ pmaxuw m1, m0
+ vpbroadcastd m0, [r11]
+ pcmpgtw m1, m0 ; !flat8out
+ por m1, m9 ; !flat8in | !flat8out
+ vpbroadcastd m2, [maskq+8]
+ pand m10, m2, m12
+ pcmpeqd m10, m12
+ pandn m1, m10 ; flat16
+ pandn m1, m8, m1 ; flat16 & fm
+
+ vpbroadcastd m10, [maskq+4]
+ por m10, m2
+ pand m2, m10, m12
+ pcmpeqd m2, m12
+ pandn m9, m2 ; flat8in
+ pandn m9, m8, m9
+ vpbroadcastd m2, [maskq+0]
+ por m2, m10
+ pand m2, m12
+ pcmpeqd m2, m12
+ pandn m8, m2
+ pandn m8, m9, m8 ; fm & !flat8 & !flat16
+ pandn m9, m1, m9 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ pand m2, m0, m12
+ pcmpeqd m2, m12
+ pandn m9, m2
+ pandn m9, m8, m9 ; flat8 & fm
+ vpbroadcastd m2, [maskq+0]
+ por m0, m2
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m8, m0
+ pandn m8, m9, m8 ; fm & !flat8
+%else
+ vpbroadcastd m0, [maskq+0]
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m8, m0 ; fm
+%endif
+
+ ; short filter
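+ ; 4-tap filter: f = iclip_diff(3*(q0-p0) + (iclip_diff(p1-q1) & hev)) & fm;
+ ; p0 += (f+3)>>3, q0 -= (f+4)>>3, and where hev is unset p1/q1 move by
+ ; (((f+4)>>3)+1)>>1; results are clamped to the valid pixel range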
+ vpbroadcastd m0, [r11+8*1] ; 511 or 2047
+ vpbroadcastd m2, [r11+8*2] ; -512 or -2048
+ psubw m10, m5, m4
+ paddw m11, m10, m10
+ paddw m11, m10
+ psubw m10, m3, m6 ; iclip_diff(p1-q1)
+ pminsw m10, m0
+ pmaxsw m10, m2
+ pand m10, m7 ; f=iclip_diff(p1-q1)&hev
+ paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
+ pminsw m10, m0
+ pmaxsw m10, m2
+ pand m8, m10 ; f&=fm
+ vpbroadcastd m10, [pw_4]
+ paddw m10, m8
+ paddw m8, [pw_3]
+ REPX {pminsw x, m0}, m10, m8
+ psraw m10, 3 ; f2
+ psraw m8, 3 ; f1
+ psubw m5, m10
+ paddw m4, m8
+
+ paddw m10, [pw_1]
+ psraw m10, 1 ; f=(f1+1)>>1
+ pandn m8, m7, m10 ; f&=!hev
+ paddw m3, m8
+ psubw m6, m8
+ pxor m8, m8
+ psubw m0, m2 ; 1023 or 4095
+ REPX {pminsw x, m0}, m3, m4, m5, m6
+ REPX {pmaxsw x, m8}, m3, m4, m5, m6
+
+%if %1 == 16
+
+; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16
+; m12=filter bits mask
+; m13-15=p2/q2/q3
+; m0,2,7-8,10-11 = free
+
+ ; flat16 filter
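+ ; the wd=16 outputs are a running sum kept in m8: start from
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 (+8 for rounding), then per output add the
+ ; incoming tap(s), subtract the outgoing one(s), shift right by 4, and
+ ; vpblendvb with the unfiltered pixel on the flat16 mask (m1)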
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+ mova m11, [tmpq+strideq*4] ; p3
+%else
+ mova m0, [rsp+7*32]
+ mova m2, [rsp+8*32]
+ mova m7, [rsp+9*32]
+ mova m11, [rsp+5*32]
+%endif
+
+ mova [rsp+ 0*32], m9
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ paddw m8, m0, [pw_1]
+ psllw m8, 3 ; p6*8+8
+ paddw m10, m2, m7 ; p5+p4
+ psubw m8, m0
+ paddw m10, m10 ; (p5+p4)*2
+ paddw m8, m11 ; p6*7+p3
+ paddw m10, m13 ; (p5+p4)*2+p2
+ paddw m8, m3 ; p6*7+p3+p1
+ paddw m10, m4 ; (p5+p4)*2+p2+p0
+ paddw m8, m5 ; p6*7+p3+p1+q0
+ paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psrlw m10, m8, 4
+ vpblendvb m10, m2, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m10 ; p5
+%else
+ mova [rsp+8*32], m10
+%endif
+
+ ; sub p6*2, add p3/q1
+ paddw m8, m11
+ paddw m10, m0, m0
+ paddw m8, m6
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m7, m10, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m10 ; p4
+%else
+ mova [rsp+9*32], m10
+%endif
+
+ ; sub p6/p5, add p2/q2
+ psubw m8, m0
+ paddw m10, m13, m14
+ psubw m8, m2
+ paddw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m11, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m10 ; p3
+ lea tmpq, [dstq+strideq*4]
+%else
+ mova [rsp+5*32], m10
+%endif
+
+ ; sub p6/p4, add p1/q3
+ paddw m8, m3
+ paddw m10, m0, m7
+ paddw m8, m15
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m13, m10, m1
+ mova [rsp+1*32], m10 ; don't clobber p2/m13
+
+ ; sub p6/p3, add p0/q4
+ paddw m8, m4
+ paddw m10, m0, m11
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*0]
+%else
+ paddw m8, [rsp+10*32]
+%endif
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m3, m10, m1
+ mova [rsp+2*32], m10 ; don't clobber p1/m3
+
+ ; sub p6/p2, add q0/q5
+ paddw m8, m5
+ paddw m10, m0, m13
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*1]
+%else
+ paddw m8, [rsp+11*32]
+%endif
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m4, m10, m1
+ mova [rsp+3*32], m10 ; don't clobber p0/m4
+
+ ; sub p6/p1, add q1/q6
+ paddw m8, m6
+ paddw m10, m0, m3
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else
+ mova m0, [rsp+12*32] ; q6
+%endif
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m5, m10, m1
+ mova [rsp+4*32], m10 ; don't clobber q0/m5
+
+ ; sub p5/p0, add q2/q6
+ paddw m8, m14
+ paddw m10, m2, m4
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m8, m15
+ paddw m10, m7, m5
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*0]
+%else
+ paddw m8, [rsp+10*32]
+%endif
+ paddw m10, m11, m6
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m15, m10, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m10 ; q3
+%else
+ mova [rsp+14*32], m10
+%endif
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*1]
+%else
+ paddw m8, [rsp+11*32]
+%endif
+ paddw m10, m13, m14
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0]
+%else
+ mova m9, [rsp+10*32]
+%endif
+ vpblendvb m10, m9, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m10 ; q4
+%else
+ mova [rsp+10*32], m10
+%endif
+
+ ; sub p1/q3, add q6*2
+ psubw m8, m3
+ paddw m0, m0
+ psubw m8, m15
+ paddw m8, m0
+ psrlw m10, m8, 4
+%ifidn %2, v
+ mova m9, [tmpq+strideq*1]
+%else
+ mova m9, [rsp+11*32]
+%endif
+ vpblendvb m10, m9, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+11*32], m10
+%endif
+
+ mova m9, [rsp+0*32]
+ mova m13, [rsp+1*32]
+ mova m3, [rsp+2*32]
+ mova m4, [rsp+3*32]
+ mova m5, [rsp+4*32]
+ SWAP 2, 6
+ SWAP 7, 14
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%else
+ mova m15, [rsp+14*32]
+%endif
+%endif
+
+%if %1 >= 8
+ ; flat8 filter
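+ ; 8-term sums (3*p3+2*p2+p1+p0+q0 for the first output), rounded via
+ ; pmulhrsw with 4096: (x*4096 + 0x4000) >> 15 == (x+4)>>3; outputs are
+ ; blended on the flat8 mask (m9)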
+ vpbroadcastd m7, [pw_4096]
+%ifidn %2, v
+ mova m0, [tmpq+strideq*0] ; p3
+%else
+ mova m0, [rsp+5*32] ; p3
+%endif
+ paddw m1, m0, m13 ; p3+p2
+ paddw m2, m3, m4 ; p1+p0
+ paddw m8, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m8, m5 ; 2*(p3+p2)+q0
+ paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m10, m2, m7
+
+ paddw m8, m3, m6
+ psubw m2, m1
+ paddw m2, m8
+ pmulhrsw m8, m2, m7
+
+ paddw m11, m0, m3
+ paddw m1, m4, m14
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m1, m2, m7
+
+ paddw m11, m0, m4
+ pblendvb m4, m1, m9
+ paddw m1, m5, m15
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m11, m2, m7
+
+ paddw m2, m6
+ paddw m2, m15
+ paddw m1, m13, m5
+ pblendvb m5, m11, m9
+ pblendvb m13, m10, m9
+ psubw m2, m1
+ pmulhrsw m1, m2, m7
+
+ psubw m2, m3
+ pblendvb m3, m8, m9
+ psubw m2, m6
+ pblendvb m6, m1, m9
+ paddw m1, m15, m14
+ paddw m2, m1
+ pmulhrsw m2, m7
+
+ pblendvb m14, m2, m9
+
+%ifidn %2, v
+ mova [tmpq+strideq*1], m13 ; p2
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m4 ; p0
+ mova [dstq+strideq*0], m5 ; q0
+ mova [dstq+strideq*1], m6 ; q1
+ mova [dstq+strideq*2], m14 ; q2
+%elif %1 == 8
+ TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1
+
+ ; write 8x16
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm3
+ movu [dstq+stride3q -8], xm4
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm5
+ movu [dstq+strideq*1-8], xm6
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m0, 1
+ vextracti128 [dstq+strideq*1-8], m13, 1
+ vextracti128 [dstq+strideq*2-8], m3, 1
+ vextracti128 [dstq+stride3q -8], m4, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m5, 1
+ vextracti128 [dstq+strideq*1-8], m6, 1
+ vextracti128 [dstq+strideq*2-8], m14, 1
+ vextracti128 [dstq+stride3q -8], m15, 1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova m8, [rsp+6*32]
+ mova m1, [rsp+7*32]
+ mova m2, [rsp+8*32]
+ mova m7, [rsp+9*32]
+ TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9
+
+ mova [dstq+strideq*0-16], xm8
+ mova [dstq+strideq*1-16], xm1
+ mova [dstq+strideq*2-16], xm2
+ mova [dstq+stride3q -16], xm7
+ lea tmpq, [dstq+strideq*4]
+ mova [tmpq+strideq*0-16], xm0
+ mova [tmpq+strideq*1-16], xm13
+ mova [tmpq+strideq*2-16], xm3
+ mova [tmpq+stride3q -16], xm4
+ lea tmpq, [tmpq+strideq*4]
+ vextracti128 [tmpq+strideq*0-16], m8, 1
+ vextracti128 [tmpq+strideq*1-16], m1, 1
+ vextracti128 [tmpq+strideq*2-16], m2, 1
+ vextracti128 [tmpq+stride3q -16], m7, 1
+ lea tmpq, [tmpq+strideq*4]
+ vextracti128 [tmpq+strideq*0-16], m0, 1
+ vextracti128 [tmpq+strideq*1-16], m13, 1
+ vextracti128 [tmpq+strideq*2-16], m3, 1
+ vextracti128 [tmpq+stride3q -16], m4, 1
+
+ mova m0, [rsp+10*32]
+ mova m1, [rsp+11*32]
+ mova m2, [rsp+12*32]
+ mova m3, [rsp+13*32]
+ TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4
+ mova [dstq+strideq*0], xm5
+ mova [dstq+strideq*1], xm6
+ mova [dstq+strideq*2], xm14
+ mova [dstq+stride3q ], xm15
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0], m5, 1
+ vextracti128 [dstq+strideq*1], m6, 1
+ vextracti128 [dstq+strideq*2], m14, 1
+ vextracti128 [dstq+stride3q ], m15, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0], m0, 1
+ vextracti128 [dstq+strideq*1], m1, 1
+ vextracti128 [dstq+strideq*2], m2, 1
+ vextracti128 [dstq+stride3q ], m3, 1
+ lea dstq, [dstq+strideq*4]
+%endif
+%elif %1 == 6
+ ; flat6 filter
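+ ; wd=6 (chroma) filter: same running-sum scheme over p2..q2, again rounded
+ ; with pmulhrsw(x, 4096) == (x+4)>>3 and blended on m9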
+ vpbroadcastd m7, [pw_4096]
+ paddw m8, m3, m4
+ paddw m8, m13 ; p2+p1+p0
+ paddw m11, m13, m5
+ paddw m8, m8
+ paddw m8, m11 ; p2+2*(p2+p1+p0)+q0
+ pmulhrsw m2, m8, m7
+
+ paddw m8, m5
+ paddw m11, m13, m13
+ paddw m8, m6
+ psubw m8, m11
+ pmulhrsw m10, m8, m7
+
+ paddw m8, m6
+ paddw m11, m13, m3
+ paddw m8, m14
+ psubw m8, m11
+ pmulhrsw m11, m8, m7
+
+ psubw m8, m3
+ paddw m14, m14
+ psubw m8, m4
+ paddw m8, m14
+ pmulhrsw m8, m7
+
+ pblendvb m3, m2, m9
+ pblendvb m4, m10, m9
+ pblendvb m5, m11, m9
+ pblendvb m6, m8, m9
+
+%ifidn %2, v
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m4 ; p0
+ mova [dstq+strideq*0], m5 ; q0
+ mova [dstq+strideq*1], m6 ; q1
+%else
+ TRANSPOSE_8x4_AND_WRITE_4x16
+%endif
+%else
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_8x4_AND_WRITE_4x16
+%endif
+%endif
+%endmacro
+
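+; Entry points. dst/stride address the pixels, mask points at the three 32-bit
+; vmask words, l/l_stride at the per-block filter levels, lut at the filter LUT
+; (only the sharpness words at lut+128/lut+136 are read here); r7m holds
+; bitdepth_max (>>11 gives is_12bpc). Each .loop iteration covers four 4-pixel
+; units along the edge and dispatches to the 16-, 8- or 4-wide filter
+; according to the vmask bits.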
+INIT_YMM avx2
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+
+ call .v4
+
+.end:
+ pslld m12, 4
+ add lq, 16
+ add dstq, 32
+ shl mask_bitsd, 4
+ sub wd, 4
+ jg .loop
+ RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
+
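+; horizontal counterpart: loads the columns left/right of the edge and goes
+; through the transposes in FILTER; each iteration handles 16 rows
+; (four 4-row units)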
+INIT_YMM avx2
+cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov hd, hm
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+
+ call .h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ pslld m12, 4
+ lea lq, [lq+l_strideq*4]
+ shl mask_bitsd, 4
+ sub hd, 4
+ jg .loop
+ RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
+
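+; chroma versions: only the wd=6 and wd=4 filters apply, so vmask[2] is unused
+; and no extra stack space is needed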
+INIT_YMM avx2
+cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4
+
+.end:
+ pslld m12, 4
+ add lq, 16
+ add dstq, 32
+ shl mask_bitsd, 4
+ sub wd, 4
+ jg .loop
+ RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov hd, hm
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ pslld m12, 4
+ lea lq, [lq+l_strideq*4]
+ shl mask_bitsd, 4
+ sub hd, 4
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64