Diffstat (limited to 'third_party/dav1d/src/x86/loopfilter16_sse.asm')
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_sse.asm  1793
1 file changed, 1793 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm
new file mode 100644
index 0000000000..c486b57a21
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm
@@ -0,0 +1,1793 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%if ARCH_X86_64
+%define PIC_sym(a) a
+%else
+%define PIC_base $$
+%define PIC_sym(a) pic_regq+a-PIC_base
+%endif
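+; x86-32 PIC builds can't address RODATA absolutely; PIC_sym() rebases a
+; symbol relative to $$, reached through pic_regq (loaded via LEA in the
+; 32-bit entry points below), while on x86-64 it expands to the symbol itself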
+
+pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
+ times 4 db 8, 9
+
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_3: times 8 dw 3
+; 4 and 16 need to be next to each other since they are used as alternates
+; depending on whether bitdepth is 10 or 12
+pw_4: times 8 dw 4
+pw_16: times 8 dw 16
+pw_8: times 8 dw 8
+pw_4096: times 8 dw 4096
+
+pb_mask: dd 1, 1, 2, 2
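+; per-4px-unit bits for testing vmask[]: one 128-bit row covers two units
+; (dword pairs {1,1,2,2}); the loops advance it with pslld by 2 in step
+; with mask_bits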
+
+SECTION .text
+
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+%define extra_stack 2
+%else
+%define extra_stack 0
+%endif
+%endif
+
+%macro RELOC_ARGS 2 ; h/v, off
+ASSERT ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+ mov r5d, [rstk + stack_offset + 4*4 + 4]
+%define lstridem [esp+%2+0*gprsize]
+ mov lstridem, r5d
+ mov r5d, [rstk + stack_offset + 4*5 + 4]
+%define lutm [esp+%2+1*gprsize]
+ mov lutm, r5d
+ mov r5d, [rstk + stack_offset + 4*6 + 4]
+%ifidn %1, v
+%define wm [esp+%2+2*gprsize]
+ mov wm, r5d
+ mov r5d, [rstk + stack_offset + 4*3 + 4]
+%define lm [esp+%2+3*gprsize]
+ mov lm, r5d
+%else ; %1 == h
+%define hm [esp+%2+2*gprsize]
+ mov hm, r5d
+%endif ; %1==v
+ mov r5d, r7m
+%define bdmulm [esp+%2+4*gprsize]
+ mov bdmulm, r5d
+%else
+%define lstridem r4m
+%define lutm r5m
+%ifidn %1, v
+%define wm r6m
+%define lm r3m
+%else
+%define hm r6m
+%endif
+%define bdmulm r7m
+%endif ; STACK_ALIGNMENT
+%endmacro
+
+%macro UNRELOC_ARGS 0
+%if ARCH_X86_32
+%undef lm
+%undef lstridem
+%undef wm
+%undef hm
+%undef lutm
+%endif
+%endmacro
+
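+; broadcast the low dword (SPLATD) or word (SPLATW) of %2 to every lane of %1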
+%macro SPLATD 2
+ movd %1, %2
+ pshufd %1, %1, q0000
+%endmacro
+
+%macro SPLATW 2
+ movd %1, %2
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+%endmacro
+
+; in: out:
+; mm%1 a b c d a e i m
+; mm%2 e f g h b f j n
+; mm%3 i j k l -> c g k o
+; mm%4 m n o p d h l p
+%macro TRANSPOSE4X4W 5
+ punpcklwd m%5, m%1, m%2
+ punpckhwd m%1, m%2
+ punpcklwd m%2, m%3, m%4
+ punpckhwd m%3, m%4
+ punpckldq m%4, m%5, m%2
+ punpckhdq m%5, m%2
+ punpckldq m%2, m%1, m%3
+ punpckhdq m%1, m%3
+
+ SWAP %1, %4
+ SWAP %2, %5, %3
+%endmacro
+
+; in: out:
+; m%1 a b c d e f g h a i q y 6 E M U
+; m%2 i j k l m n o p b j r z 7 F N V
+; m%3 q r s t u v w x c k s 0 8 G O W
+; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
+; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
+; m%6 E F G H I J K L f n v 3 B J R Z
+; m%7 M N O P Q R S T g o w 4 C K S +
+; m%8 U V W X Y Z + = h p x 5 D L T =
+%if ARCH_X86_64
+%macro TRANSPOSE8X8W 9
+ ; m%1 a b c d e f g h a i q y b j r z
+ ; m%2 i j k l m n o p c k s 0 d l t 1
+ ; m%3 q r s t u v w x -> e m u 2 f n v 3
+ ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
+ TRANSPOSE4X4W %1, %2, %3, %4, %9
+
+ ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V
+ ; m%6 E F G H I J K L 8 G O W 9 H P X
+ ; m%7 M N O P Q R S T -> A I Q Y B J R Z
+ ; m%8 U V W X Y Z + = C K S + D L T =
+ TRANSPOSE4X4W %5, %6, %7, %8, %9
+
+ ; m%1 a i q y b j r z a i q y 6 E M U
+ ; m%2 c k s 0 d l t 1 b j r z 7 F N V
+ ; m%3 e m u 2 f n v 3 c k s 0 8 G O W
+ ; m%4 g o w 4 h p x 5 d l t 1 9 H P X
+ ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y
+ ; m%6 8 G O W 9 H P X f n v 3 B J R Z
+ ; m%7 A I Q Y B J R Z g o w 4 C K S +
+ ; m%8 C K S + D L T = h p x 5 D L T =
+ punpckhqdq m%9, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ punpckhqdq m%7, m%4, m%8
+ punpcklqdq m%4, m%8
+
+ SWAP %8, %7, %4, %5, %3, %2, %9
+%endmacro
+%else ; x86-32
+; input: 1-7 in registers, 8 in first memory [read-only]
+; second memory is scratch, and may overlap with first or third memory
+; output: 1-5,7-8 in registers, 6 in third memory [write-only]
+%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment] [2x]
+ TRANSPOSE4X4W %1, %2, %3, %4, %8
+%ifnidn %9, ""
+ mov%12 m%8, %9
+%else
+ mova m%8, %10
+%endif
+ mova %10, m%4
+ TRANSPOSE4X4W %5, %6, %7, %8, %4
+ punpckhqdq m%4, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ mova m%7, %10
+%ifnidn %11, ""
+ mov%13 %11, m%6
+%else
+ mova %10, m%6
+%endif
+ punpckhqdq m%6, m%7, m%8
+ punpcklqdq m%7, m%8
+
+ ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8
+ SWAP %2, %4, %5, %3
+ SWAP %6, %8
+%endmacro
+%endif ; x86-32/64
+
+; transpose and write m8-11, everything else is scratch
+%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp
+ ; transpose 8x4
+ punpcklwd %5, %1, %2
+ punpckhwd %1, %2
+ punpcklwd %2, %3, %4
+ punpckhwd %3, %4
+ punpckldq %4, %5, %2
+ punpckhdq %5, %2
+ punpckldq %2, %1, %3
+ punpckhdq %1, %3
+
+ ; write out
+ movq [dstq+strideq*0-4], %4
+ movhps [dstq+strideq*1-4], %4
+ movq [dstq+strideq*2-4], %5
+ movhps [dstq+stride3q -4], %5
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], %2
+ movhps [dstq+strideq*1-4], %2
+ movq [dstq+strideq*2-4], %1
+ movhps [dstq+stride3q -4], %1
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+%if ARCH_X86_64
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+ mova P1, [dstq+mstrideq*2] ; p1
+ mova P0, [dstq+mstrideq*1] ; p0
+ mova Q0, [dstq+strideq*0] ; q0
+ mova Q1, [dstq+strideq*1] ; q1
+%else ; x86-32
+%define P1 [dstq+mstrideq*2]
+%define P0 [dstq+mstrideq*1]
+%define Q0 [dstq+strideq*0]
+%define Q1 [dstq+strideq*1]
+%endif ; x86-32/64
+%else ; %1 != 4
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+%if ARCH_X86_64
+ ; we load p3 later
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq+strideq*0]
+ mova Q1, [dstq+strideq*1]
+ mova Q2, [dstq+strideq*2]
+%if %1 != 6
+%define P3 [tmpq+strideq*0]
+%define Q3 m15
+ mova Q3, [dstq+stride3q]
+%endif ; %1 != 6
+%else ; x86-32
+%define P2 [tmpq+strideq*1]
+%define P1 [dstq+mstrideq*2]
+%define P0 [dstq+mstrideq*1]
+%define Q0 [dstq+strideq*0]
+%define Q1 [dstq+strideq*1]
+%define Q2 [dstq+strideq*2]
+%if %1 != 6
+%define P3 [dstq+mstrideq*4]
+%define Q3 [dstq+stride3q]
+%endif ; %1 != 6
+%endif ; x86-32/64
+%endif ; %1 ==/!= 4
+%else ; %2 != v
+ ; load lines
+%if %1 == 4
+ movq m0, [dstq+strideq*0-4]
+ movq m2, [dstq+strideq*1-4]
+ movq m4, [dstq+strideq*2-4]
+ movq m5, [dstq+stride3q -4]
+ lea tmpq, [dstq+strideq*4]
+ movq m3, [tmpq+strideq*0-4]
+ movq m6, [tmpq+strideq*1-4]
+ movq m1, [tmpq+strideq*2-4]
+ movq m7, [tmpq+stride3q -4]
+
+ ; transpose 4x8
+ ; m0: A-D0
+ ; m2: A-D1
+ ; m4: A-D2
+ ; m5: A-D3
+ ; m3: A-D4
+ ; m6: A-D5
+ ; m1: A-D6
+ ; m7: A-D7
+ punpcklwd m0, m2
+ punpcklwd m4, m5
+ punpcklwd m3, m6
+ punpcklwd m1, m7
+ ; m0: A0-1,B0-1,C0-1,D0-1
+ ; m4: A2-3,B2-3,C2-3,D2-3
+ ; m3: A4-5,B4-5,C4-5,D4-5
+ ; m1: A6-7,B6-7,C6-7,D6-7
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m4, m3, m1
+ punpckldq m3, m1
+ ; m0: A0-3,B0-3
+ ; m2: C0-3,D0-3
+ ; m3: A4-7,B4-7
+ ; m4: C4-7,D4-7
+ punpckhqdq m1, m0, m3
+ punpcklqdq m0, m3
+ punpckhqdq m3, m2, m4
+ punpcklqdq m2, m4
+ ; m0: A0-7
+ ; m1: B0-7
+ ; m2: C0-7
+ ; m3: D0-7
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%else
+%define P1 [esp+3*mmsize]
+%define P0 [esp+4*mmsize]
+%define Q0 [esp+5*mmsize]
+%define Q1 [esp+6*mmsize]
+ mova P1, m0
+ mova P0, m1
+ mova Q0, m2
+ mova Q1, m3
+%endif
+%elif %1 == 6 || %1 == 8
+ movu m0, [dstq+strideq*0-8]
+ movu m1, [dstq+strideq*1-8]
+ movu m2, [dstq+strideq*2-8]
+ movu m3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu m4, [tmpq+strideq*0-8]
+ movu m5, [tmpq+strideq*1-8]
+ movu m6, [tmpq+strideq*2-8]
+%if ARCH_X86_64
+ movu m7, [tmpq+stride3q -8]
+%endif
+
+ ; transpose 8x16
+ ; m0: A-H0,A-H8
+ ; m1: A-H1,A-H9
+ ; m2: A-H2,A-H10
+ ; m3: A-H3,A-H11
+ ; m4: A-H4,A-H12
+ ; m5: A-H5,A-H13
+ ; m6: A-H6,A-H14
+ ; m7: A-H7,A-H15
+%if ARCH_X86_64
+ punpcklwd m8, m0, m1
+%else
+ punpcklwd m7, m0, m1
+%endif
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m5
+ punpckhwd m4, m5
+%if ARCH_X86_64
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+%else
+ mova [rsp+3*16], m4
+ movu m4, [tmpq+stride3q -8]
+ punpcklwd m5, m6, m4
+ punpckhwd m6, m4
+%endif
+ ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32]
+ ; m0: E0-1,F0-1,G0-1,H0-1
+ ; m1: A2-3,B2-3,C2-3,D2-3
+ ; m2: E2-3,F2-3,G2-3,H2-3
+ ; m3: A4-5,B4-5,C4-5,D4-5
+ ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32]
+ ; m5: A6-7,B6-7,C6-7,D6-7
+ ; m6: E6-7,F6-7,G6-7,H6-7
+%if ARCH_X86_64
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+%else
+ punpckldq m4, m7, m1
+ punpckhdq m7, m1
+%endif
+ punpckldq m1, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m3, m5
+ punpckhdq m3, m5
+%if ARCH_X86_64
+ punpckldq m5, m4, m6
+ punpckhdq m4, m6
+%else
+ mova [rsp+4*16], m3
+ mova m3, [rsp+3*16]
+ punpckldq m5, m3, m6
+ punpckhdq m3, m6
+%endif
+ ; m7: A0-3,B0-3 [m4 on x86-32]
+ ; m8: C0-3,D0-3 [m7 on x86-32]
+ ; m1: E0-3,F0-3
+ ; m0: G0-3,H0-3
+ ; m2: A4-7,B4-7
+ ; m3: C4-7,D4-7 [r4 on x86-32]
+ ; m5: E4-7,F4-7
+ ; m4: G4-7,H4-7 [m3 on x86-32]
+%if ARCH_X86_64
+%if %1 != 6
+ punpcklqdq m6, m7, m2
+%endif
+ punpckhqdq m7, m2
+ punpcklqdq m2, m8, m3
+ punpckhqdq m8, m3
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+%if %1 != 6
+ punpckhqdq m5, m0, m4
+%endif
+ punpcklqdq m0, m4
+%if %1 == 8
+ mova [rsp+1*16], m6
+%define P3 [rsp+1*16]
+%endif
+ ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15
+ SWAP 7, 13
+ SWAP 8, 2, 9
+ SWAP 3, 10
+ SWAP 1, 11
+ SWAP 0, 14
+ SWAP 5, 15
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+%if %1 == 8
+%define Q3 m15
+%endif
+%else ; x86-32
+%if %1 == 8
+%define P3 [rsp+ 6*16]
+ punpcklqdq m6, m4, m2
+ mova P3, m6
+%endif
+ mova m6, [rsp+4*16]
+ punpckhqdq m4, m2
+ punpcklqdq m2, m7, m6
+ punpckhqdq m7, m6
+ punpcklqdq m6, m1, m5
+ punpckhqdq m1, m5
+%if %1 == 8
+%define Q3 [rsp+24*16]
+ punpckhqdq m5, m0, m3
+ mova Q3, m5
+%endif
+ punpcklqdq m0, m3
+%if %1 == 8
+%define P2 [rsp+18*16]
+%define P1 [rsp+19*16]
+%define P0 [rsp+20*16]
+%define Q0 [rsp+21*16]
+%define Q1 [rsp+22*16]
+%define Q2 [rsp+23*16]
+%else
+%define P2 [rsp+3*16]
+%define P1 [rsp+4*16]
+%define P0 [rsp+5*16]
+%define Q0 [rsp+6*16]
+%define Q1 [rsp+7*16]
+%define Q2 [rsp+8*16]
+%endif
+ mova P2, m4
+ mova P1, m2
+ mova P0, m7
+ mova Q0, m6
+ mova Q1, m1
+ mova Q2, m0
+%endif ; x86-32/64
+%else ; %1 == 16
+ ; We only use 14 pixels but we'll need the remainder at the end for
+ ; the second transpose
+ mova m0, [dstq+strideq*0-16]
+ mova m1, [dstq+strideq*1-16]
+ mova m2, [dstq+strideq*2-16]
+ mova m3, [dstq+stride3q -16]
+ lea tmpq, [dstq+strideq*4]
+ mova m4, [tmpq+strideq*0-16]
+ mova m5, [tmpq+strideq*1-16]
+ mova m6, [tmpq+strideq*2-16]
+%if ARCH_X86_64
+ mova m7, [tmpq+stride3q -16]
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ SWAP 5, 13
+ SWAP 6, 8
+ SWAP 7, 9
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%else ; x86-32
+%define P2 [esp+18*16]
+%define P1 [esp+19*16]
+%define P0 [esp+20*16]
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
+ [tmpq+stride3q -16], P2, "", a, a
+ mova P1, m6
+ mova P0, m7
+%endif ; x86-32/64
+ mova [rsp+ 7*16], m0
+ mova [rsp+ 8*16], m1
+ mova [rsp+ 9*16], m2
+ mova [rsp+10*16], m3
+%define P3 [rsp+6*16]
+ mova P3, m4
+
+ mova m0, [dstq+strideq*0]
+ mova m1, [dstq+strideq*1]
+ mova m2, [dstq+strideq*2]
+ mova m3, [dstq+stride3q ]
+ lea tmpq, [dstq+strideq*4]
+ mova m4, [tmpq+strideq*0]
+ mova m5, [tmpq+strideq*1]
+ mova m6, [tmpq+strideq*2]
+%if ARCH_X86_64
+ mova m7, [tmpq+stride3q ]
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10
+ SWAP 0, 10
+ SWAP 1, 11
+ SWAP 2, 14
+ SWAP 3, 15
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+%define Q3 m15
+%else ; x86-32
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
+ [tmpq+stride3q ], [rsp+12*16], "", a, a
+%define Q0 [esp+21*16]
+%define Q1 [esp+22*16]
+%define Q2 [esp+23*16]
+%define Q3 [esp+24*16]
+ mova Q0, m0
+ mova Q1, m1
+ mova Q2, m2
+ mova Q3, m3
+%endif ; x86-32/64
+
+ mova [rsp+11*16], m4
+%if ARCH_X86_64
+ mova [rsp+12*16], m5
+%endif
+ mova [rsp+13*16], m6
+ mova [rsp+14*16], m7
+%endif ; %1 == 4/6/8/16
+%endif ; %2 ==/!= v
+
+ ; load L/E/I/H
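+    ; L is the per-unit filter level from the l[] array (the preceding
+    ; block's value is used where the current one is 0); the lut then gives
+    ; I = clip(L >> sharp[0], 1, sharp[1]), H = L >> 4, E = 2*(L+2)+I,
+    ; each scaled by bdmul = 1 << (bitdepth-8) since levels are 8-bit-scale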
+%if ARCH_X86_32
+%define l_strideq r5
+ mov l_strideq, dword lstridem
+%ifidn %2, v
+%define lq r3
+ mov lq, dword lm
+%endif
+%endif
+%ifidn %2, v
+%if cpuflag(sse4)
+ pmovzxbw m1, [lq]
+ pmovzxbw m0, [lq+l_strideq]
+ pxor m2, m2
+%else ; ssse3
+ movq m1, [lq]
+ movq m0, [lq+l_strideq]
+ pxor m2, m2
+ REPX {punpcklbw x, m2}, m1, m0
+%endif ; ssse3/sse4
+%else ; %2 != v
+ movq m0, [lq] ; l0, l1
+ movq m1, [lq+l_strideq] ; l2, l3
+ punpckldq m0, m1 ; l0, l2, l1, l3
+ pxor m2, m2
+ punpcklbw m1, m0, m2 ; l0, l2
+ punpckhbw m0, m2 ; l1, l3
+%endif ; %2==/!=v
+%if ARCH_X86_32
+%ifidn %2, v
+%undef lq
+ mov mstrideq, mstridem
+%endif
+%endif
+ pcmpeqw m5, m2, m0
+ pand m1, m5
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1]
+ pcmpeqw m5, m2, m0 ; !L
+ psrlw m5, 1
+%if ARCH_X86_64
+ psrlw m2, m0, [lutq+128]
+ SPLATW m1, [lutq+136]
+%else ; x86-32
+ mov r5, lutm
+ psrlw m2, m0, [r5+128]
+ SPLATW m1, [r5+136]
+%endif ; x86-32/64
+ pminsw m2, m1
+ pmaxsw m2, [PIC_sym(pw_1)] ; I
+ psrlw m1, m0, 4 ; H
+ paddw m0, [PIC_sym(pw_2)]
+ paddw m0, m0
+ paddw m0, m2 ; E
+ REPX {pmullw x, [bdmulq]}, m0, m1, m2
+%if ARCH_X86_32
+%undef l_strideq
+ lea stride3q, [strideq*3]
+%endif
+
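+    ; filter masks: fm requires the p/q neighbour deltas to not exceed I
+    ; and 2*|p0-q0| + (|p1-q1|>>1) to not exceed E; hev is
+    ; max(|p1-p0|,|q1-q0|) > H; flat8in compares |p2..p3-p0| and |q2..q3-q0|
+    ; against 1 << (bitdepth-8); lanes with L == 0 fail fm via the 0x7fff
+    ; bias in m5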
+ psubw m3, P1, P0 ; p1-p0
+ psubw m4, Q0, Q1 ; q0-q1
+ REPX {pabsw x, x}, m3, m4
+ pmaxsw m3, m5
+ pmaxsw m3, m4
+ pcmpgtw m7, m3, m1 ; hev
+%if %1 != 4
+ psubw m4, P2, P0 ; p2-p0
+ pabsw m4, m4
+ pmaxsw m4, m3
+%if %1 != 6
+ mova m6, P3 ; p3
+ psubw m5, m6, P0 ; p3-p0
+ pabsw m5, m5
+ pmaxsw m4, m5
+%endif ; %1 != 6
+ psubw m5, Q0, Q2 ; q0-q2
+ pabsw m5, m5
+ pmaxsw m4, m5
+%if %1 != 6
+ psubw m5, Q0, Q3 ; q0-q3
+ pabsw m5, m5
+ pmaxsw m4, m5
+%endif ; %1 != 6
+ pcmpgtw m4, [bdmulq] ; !flat8in
+
+ psubw m5, P2, P1 ; p2-p1
+ pabsw m5, m5
+%if %1 != 6
+ psubw m6, P2 ; p3-p2
+ pabsw m6, m6
+ pmaxsw m5, m6
+ psubw m6, Q2, Q3 ; q2-q3
+ pabsw m6, m6
+ pmaxsw m5, m6
+%endif ; %1 != 6
+ psubw m6, Q2, Q1 ; q2-q1
+ pabsw m6, m6
+ pmaxsw m5, m6
+
+%if %1 == 16
+ SPLATD m6, [maskq+8]
+ SPLATD m1, [maskq+4]
+ por m6, m1
+ pand m6, m12
+ pcmpeqd m6, m12
+ pand m5, m6
+%else ; %1 != 16
+ SPLATD m6, [maskq+4]
+ pand m6, m12
+ pcmpeqd m6, m12
+ pand m5, m6 ; only apply fm-wide to wd>4 blocks
+%endif ; %1==/!=16
+ pmaxsw m3, m5
+%endif ; %1 != 4
+ pcmpgtw m3, m2
+
+ psubw m5, P1, Q1 ; p1-q1
+ psubw m6, P0, Q0 ; p0-q0
+ REPX {pabsw x, x}, m5, m6
+ paddw m6, m6
+ psrlw m5, 1
+ paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m3, m5
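+    ; m3 now holds !fm; it is inverted and combined with the vmask bits in
+    ; the mask-merging code below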
+
+%if %1 == 16
+
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+ mova m1, [tmpq+strideq*2]
+ mova m2, [tmpq+stride3q]
+%else ; %2 != v
+ mova m0, [rsp+ 8*16]
+ mova m1, [rsp+ 9*16]
+ mova m2, [rsp+10*16]
+%endif ; %2==/!=v
+ REPX {psubw x, P0}, m0, m1, m2
+ REPX {pabsw x, x}, m0, m1, m2
+ pmaxsw m1, m0
+ pmaxsw m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+ mova m2, [tmpq+strideq*1]
+ mova m5, [tmpq+strideq*2]
+%else ; %2 != v
+ mova m0, [rsp+11*16]
+ mova m2, [rsp+12*16]
+ mova m5, [rsp+13*16]
+%endif ; %2==/!=v
+ REPX {psubw x, Q0}, m0, m2, m5
+ REPX {pabsw x, x}, m0, m2, m5
+ pmaxsw m0, m2
+ pmaxsw m1, m5
+ pmaxsw m1, m0
+ pcmpgtw m1, [bdmulq] ; !flat8out
+ por m1, m4 ; !flat8in | !flat8out
+ SPLATD m2, [maskq+8]
+ pand m5, m2, m12
+ pcmpeqd m5, m12
+ pandn m1, m5 ; flat16
+ pandn m5, m3, m1 ; flat16 & fm
+ SWAP 1, 5
+
+ SPLATD m5, [maskq+4]
+ por m5, m2
+ pand m2, m5, m12
+ pcmpeqd m2, m12
+ pandn m4, m2 ; flat8in
+ pandn m2, m3, m4
+ SWAP 2, 4
+ SPLATD m2, [maskq+0]
+ por m2, m5
+ pand m2, m12
+ pcmpeqd m2, m12
+ pandn m3, m2
+ pandn m0, m4, m3 ; fm & !flat8 & !flat16
+ SWAP 0, 3
+ pandn m0, m1, m4 ; flat8 & !flat16
+ SWAP 0, 4
+%elif %1 != 4
+ SPLATD m0, [maskq+4]
+ pand m2, m0, m12
+ pcmpeqd m2, m12
+ pandn m4, m2
+ pandn m2, m3, m4 ; flat8 & fm
+ SWAP 2, 4
+ SPLATD m2, [maskq+0]
+ por m0, m2
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m3, m0
+ pandn m0, m4, m3 ; fm & !flat8
+ SWAP 0, 3
+%else ; %1 == 4
+ SPLATD m0, [maskq+0]
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m3, m0 ; fm
+%endif ; %1==/!=4
+
+ ; short filter
+%if ARCH_X86_64
+ SPLATW m0, r7m
+%else
+ SPLATW m0, bdmulm
+%endif
+ pcmpeqw m2, m2
+ psrlw m0, 1 ; 511 or 2047
+ pxor m2, m0 ; -512 or -2048
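+    ; m0/m2 = max/min of the signed filter-delta range, i.e. ±128 scaled
+    ; by 1 << (bitdepth-8): [-512,511] for 10bpc, [-2048,2047] for 12bpc,
+    ; used for the iclip_diff() clamps below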
+
+ psubw m5, Q0, P0 ; q0-p0
+ paddw m6, m5, m5
+ paddw m6, m5 ; 3*(q0-p0)
+ psubw m5, P1, Q1 ; iclip_diff(p1-q1)
+ pminsw m5, m0
+ pmaxsw m5, m2
+ pand m5, m7 ; f=iclip_diff(p1-q1)&hev
+ paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f)
+ pminsw m5, m0
+ pmaxsw m5, m2
+ pand m3, m5 ; f&=fm
+ paddw m5, m3, [PIC_sym(pw_3)]
+ paddw m3, [PIC_sym(pw_4)]
+ REPX {pminsw x, m0}, m5, m3
+ psraw m5, 3 ; f2
+ psraw m3, 3 ; f1
+ psubw m0, m2 ; 1023 or 4095
+ pxor m2, m2
+%if ARCH_X86_64
+ paddw P0, m5
+ psubw Q0, m3
+%else
+ paddw m5, P0
+ psubw m6, Q0, m3
+ REPX {pminsw x, m0}, m5, m6
+ REPX {pmaxsw x, m2}, m5, m6
+%endif
+
+ paddw m3, [PIC_sym(pw_1)]
+ psraw m3, 1 ; f=(f1+1)>>1
+ pandn m7, m3 ; f&=!hev
+ SWAP 7, 3
+%if ARCH_X86_64
+ paddw P1, m3
+ psubw Q1, m3
+ REPX {pminsw x, m0}, P1, P0, Q0, Q1
+ REPX {pmaxsw x, m2}, P1, P0, Q0, Q1
+%else
+ psubw m7, Q1, m3
+ paddw m3, P1
+ REPX {pminsw x, m0}, m7, m3
+ REPX {pmaxsw x, m2}, m7, m3
+%if %1 > 4
+ mova P1, m3
+ mova P0, m5
+ mova Q0, m6
+ mova Q1, m7
+%endif
+%endif
+
+%if %1 == 16
+
+; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16
+; m12=filter bits mask
+; m13-15=p2/q2/q3
+; m0,2-3,5-7 = free
+
+ ; flat16 filter
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+ mova m6, [tmpq+strideq*4] ; p3
+ lea tmpq, [dstq+mstrideq*4]
+%else ; %2 != v
+ mova m0, [rsp+ 8*16]
+ mova m2, [rsp+ 9*16]
+ mova m7, [rsp+10*16]
+ mova m6, [rsp+ 6*16]
+%endif ; %2==/!=v
+
+ mova [rsp+ 0*16], m4
+
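+    ; m3 holds a running sum (weights adding up to 16, rounding bias of 8
+    ; included), so each output is simply sum >> 4; the window then slides
+    ; by subtracting trailing taps and adding leading ones; since the sum
+    ; can exceed 0x7fff, unsigned psrlw is used instead of the pmulhrsw
+    ; trick from the flat8/flat6 paths; outputs are blended with the
+    ; unfiltered pixels through the flat16 mask in m1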
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psllw m3, m0, 3 ; p6*8
+ paddw m3, [PIC_sym(pw_8)]
+ paddw m5, m2, m7 ; p5+p4
+ psubw m3, m0
+ paddw m5, m5 ; (p5+p4)*2
+ paddw m3, m6 ; p6*7+p3
+ paddw m5, P2 ; (p5+p4)*2+p2
+ paddw m3, P1 ; p6*7+p3+p1
+ paddw m5, P0 ; (p5+p4)*2+p2+p0
+ paddw m3, Q0 ; p6*7+p3+p1+q0
+ paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m2
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+mstrideq*2], m5 ; p5
+%else ; %2 != v
+ mova [rsp+9*16], m5
+%endif ; %2==/!=v
+
+ ; sub p6*2, add p3/q1
+ paddw m3, m6
+ paddw m5, m0, m0
+ paddw m3, Q1
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m7
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+mstrideq*1], m5 ; p4
+%else ; %2 != v
+ mova [rsp+10*16], m5
+%endif ; %2==/!=v
+
+ ; sub p6/p5, add p2/q2
+ psubw m3, m0
+ paddw m5, P2, Q2
+ psubw m3, m2
+ paddw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m6
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; p3
+%else ; %2 != v
+ mova [rsp+6*16], m5
+%endif ; %2==/!=v
+
+%define WRITE_IN_PLACE 0
+%ifidn %2, v
+%if ARCH_X86_64
+%define WRITE_IN_PLACE 1
+%endif
+%endif
+
+ ; sub p6/p4, add p1/q3
+ paddw m3, P1
+ paddw m5, m0, m7
+ paddw m3, Q3
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P2
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+strideq*1], m5
+%else
+ mova [rsp+1*16], m5 ; don't clobber p2/m13
+%endif
+
+ ; sub p6/p3, add p0/q4
+ paddw m3, P0
+ paddw m5, m0, m6
+%ifidn %2, v
+ paddw m3, [dstq+strideq*4]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P1
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*2], m5
+%else
+ mova [rsp+2*16], m5 ; don't clobber p1/m3
+%endif
+
+ ; sub p6/p2, add q0/q5
+ paddw m3, Q0
+ paddw m5, m0, P2
+%ifidn %2, v
+%if ARCH_X86_32
+ lea r4, P2
+%endif
+ lea tmpq, [dstq+strideq*4]
+ paddw m3, [tmpq+strideq*1]
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+%endif ; %2==/!=v
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*1], m5
+%else
+ mova [rsp+3*16], m5 ; don't clobber p0/m4
+%endif
+
+ ; sub p6/p1, add q1/q6
+ paddw m3, Q1
+ paddw m5, m0, P1
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else ; %2 != v
+ mova m0, [rsp+13*16] ; q6
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq], m5
+%else
+ mova [rsp+4*16], m5 ; don't clobber q0/m5
+%endif
+
+ ; sub p5/p0, add q2/q6
+ paddw m3, Q2
+ paddw m5, m2, P0
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q1
+ por m2, m5, m4 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m3, Q3
+ paddw m7, Q0
+ paddw m3, m0
+ psubw m3, m7
+ psrlw m7, m3, 4
+ pand m7, m1
+ pandn m4, m1, Q2
+ por m7, m4 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*0]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ paddw m6, Q1
+ paddw m3, m0
+ psubw m3, m6
+ psrlw m6, m3, 4
+ pand m6, m1
+ pandn m4, m1, Q3
+ por m6, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+mstrideq], m6 ; q3
+%else ; %2 != v
+ mova [rsp+5*16], m6
+%endif ; %2==/!=v
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*1]
+%if ARCH_X86_64
+ paddw m5, P2, Q2
+%else
+    ; tmpq is clobbered, so we use a backup pointer for P2 instead
+ paddw m5, [r4], Q2
+ mov pic_regq, pic_regm
+%endif
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+ paddw m5, P2, Q2
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*0]
+%else ; %2 != v
+ pandn m4, m1, [rsp+11*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; q4
+%else ; %2 != v
+ mova [rsp+11*16], m5
+%endif ; %2==/!=v
+
+ ; sub p1/q3, add q6*2
+ psubw m3, P1
+ paddw m0, m0
+ psubw m3, Q3
+ paddw m3, m0
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*1]
+%else ; %2 != v
+ pandn m4, m1, [rsp+12*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*1], m5 ; q5
+%else ; %2 != v
+ mova [rsp+12*16], m5
+%endif ; %2==/!=v
+
+ mova m4, [rsp+0*16]
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%if ARCH_X86_64
+ SWAP 2, 11
+ SWAP 7, 14
+ SWAP 6, 15
+%else ; x86-32
+ mova Q1, m2
+ mova Q2, m7
+%endif ; x86-32/64
+%if WRITE_IN_PLACE
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq]
+%elif ARCH_X86_64
+ mova P2, [rsp+1*16]
+ mova P1, [rsp+2*16]
+ mova P0, [rsp+3*16]
+ mova Q0, [rsp+4*16]
+%else ; !WRITE_IN_PLACE & x86-32
+ mova m0, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m3, [rsp+4*16]
+ mova m7, [rsp+5*16]
+ mova P2, m0
+ mova P1, m1
+ mova P0, m2
+ mova Q0, m3
+ mova Q3, m7
+%endif ; WRITE_IN_PLACE / x86-32/64
+%undef WRITE_IN_PLACE
+%endif ; %1 == 16
+
+%if %1 >= 8
+
+ ; flat8 filter
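+    ; each output is an 8-weight average rounded via pmulhrsw with 4096:
+    ; (x*4096*2 + 0x8000) >> 16 == (x+4) >> 3; the average is turned into
+    ; a delta against the original pixel, masked by the flat8 mask in m4,
+    ; and added back, leaving non-flat lanes untouched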
+ mova m0, P3 ; p3
+ paddw m1, m0, P2 ; p3+p2
+ paddw m2, P1, P0 ; p1+p0
+ paddw m3, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m3, Q0 ; 2*(p3+p2)+q0
+ paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m7, m2, [PIC_sym(pw_4096)]
+ psubw m7, P2
+ pand m7, m4
+
+ paddw m3, P1, Q1 ; p1+q1
+ psubw m2, m1 ; 2*p3+p2+p1+p0+q0
+ paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1
+ pmulhrsw m3, m2, [PIC_sym(pw_4096)]
+ psubw m3, P1
+ pand m3, m4
+
+ paddw m5, m0, P1 ; p3+p1
+ paddw m6, P0, Q2 ; p0+q2
+ psubw m2, m5 ; p3+p2+p1+p0+q0+q1
+ paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2
+ pmulhrsw m5, m2, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand m5, m4
+
+ paddw m6, m0, P0 ; p3+p0
+ paddw m1, Q0, Q3 ; q0+q3
+ psubw m2, m6 ; p2+p1+p0+q0+q1+q2
+ paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3
+ pmulhrsw m6, m2, [PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3
+ paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3
+ paddw m1, P2, Q0 ; p2+q0
+ psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3
+ pmulhrsw m1, m2, [PIC_sym(pw_4096)]
+ psubw m1, Q1
+ pand m1, m4
+
+ psubw m2, P1 ; p0+q0+2*q1+q2+2*q3
+ psubw m2, Q1 ; p0+q0+q1+q2+2*q3
+ paddw m0, Q3, Q2 ; q3+q2
+ paddw m2, m0 ; p0+q0+q1+2*q2+3*q3
+ pmulhrsw m2, [PIC_sym(pw_4096)]
+ psubw m2, Q2
+ pand m2, m4
+
+ paddw m7, P2
+ paddw m3, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m1, Q1
+ paddw m2, Q2
+
+%ifidn %2, v
+ mova [tmpq+strideq*1], m7 ; p2
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m1 ; q1
+ mova [dstq+strideq*2], m2 ; q2
+%else ; %2 != v
+ mova m0, P3
+
+%if %1 == 8
+ lea tmpq, [dstq+strideq*4]
+%if ARCH_X86_64
+ SWAP 4, 15
+ TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8
+%else
+ TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \
+ Q3, [tmpq+strideq*1-8], a, u
+%endif
+
+ ; write 8x8
+ movu [dstq+strideq*0-8], m0
+ movu [dstq+strideq*1-8], m7
+ movu [dstq+strideq*2-8], m3
+ movu [dstq+stride3q -8], m5
+ movu [tmpq+strideq*0-8], m6
+%if ARCH_X86_64
+ movu [tmpq+strideq*1-8], m1
+%endif
+ movu [tmpq+strideq*2-8], m2
+ movu [tmpq+stride3q -8], m4
+ lea dstq, [dstq+strideq*8]
+%else ; %1 != 8
+%if ARCH_X86_64
+ SWAP 6, 8
+ SWAP 1, 9
+ SWAP 2, 10
+%else
+ mova [rsp+1*16], m6
+ mova [rsp+2*16], m1
+ mova [rsp+3*16], m2
+%endif
+
+ mova m1, [rsp+ 7*16]
+ mova m2, [rsp+ 8*16]
+ mova m4, [rsp+ 9*16]
+ mova m6, [rsp+10*16]
+ lea tmpq, [dstq+strideq*4]
+%if ARCH_X86_64
+ TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11
+%else
+ mova [rsp+7*16], m5
+ TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \
+ [rsp+7*16], [tmpq+strideq*1-16], a, a
+%endif
+
+ mova [dstq+strideq*0-16], m1
+ mova [dstq+strideq*1-16], m2
+ mova [dstq+strideq*2-16], m4
+ mova [dstq+stride3q -16], m6
+ mova [tmpq+strideq*0-16], m0
+%if ARCH_X86_64
+ mova [tmpq+strideq*1-16], m7
+%endif
+ mova [tmpq+strideq*2-16], m3
+ mova [tmpq+stride3q -16], m5
+
+%if ARCH_X86_64
+ SWAP 6, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 4, 15
+%else
+ mova m6, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m4, Q3
+%endif
+ mova m0, [rsp+11*16]
+ mova m3, [rsp+12*16]
+ mova m5, [rsp+13*16]
+%if ARCH_X86_64
+ mova m7, [rsp+14*16]
+ TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8
+%else
+ TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \
+ [rsp+14*16], [tmpq+strideq*1], a, a
+%endif
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ mova [tmpq+strideq*0], m0
+%if ARCH_X86_64
+ mova [tmpq+strideq*1], m3
+%endif
+ mova [tmpq+strideq*2], m5
+ mova [tmpq+stride3q ], m7
+ lea dstq, [dstq+strideq*8]
+%endif ; %1==/!=8
+%endif ; %2==/!=v
+%elif %1 == 6
+ ; flat6 filter
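+    ; same delta/mask scheme as flat8 above, with 6-pixel support (p2..q2)
+    ; and the same pmulhrsw-by-4096 (x+4)>>3 rounding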
+ paddw m3, P1, P0 ; p1+p0
+ paddw m3, P2 ; p2+p1+p0
+ paddw m6, P2, Q0 ; p2+q0
+ paddw m3, m3 ; 2*(p2+p1+p0)
+ paddw m3, m6 ; p2+2*(p2+p1+p0)+q0
+ pmulhrsw m2, m3, [PIC_sym(pw_4096)]
+ psubw m2, P1
+ pand m2, m4
+
+ paddw m3, Q0 ; p2+2*(p2+p1+p0+q0)
+ paddw m6, P2, P2 ; 2*p2
+ paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1
+ psubw m3, m6 ; p2+2*(p1+p0+q0)+q1
+ pmulhrsw m5, m3, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand m5, m4
+
+ paddw m3, Q1 ; p2+2*(p1+p0+q0+q1)
+ paddw m6, P2, P1 ; p2+p1
+ paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2
+ psubw m3, m6 ; p1+2*(p0+q0+q1)+q2
+ pmulhrsw m6, m3, [PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ psubw m3, P1 ; 2*(p0+q0+q1)+q2
+%if ARCH_X86_64
+ paddw Q2, Q2 ; q2*2
+%else
+ mova m0, Q2
+ paddw m0, m0
+%endif
+ psubw m3, P0 ; p0+2*(q0+q1)+q2
+%if ARCH_X86_64
+    paddw           m3, Q2              ; p0+2*(q0+q1)+3*q2
+%else
+ paddw m3, m0
+%endif
+ pmulhrsw m3, [PIC_sym(pw_4096)]
+ psubw m3, Q1
+ pand m3, m4
+
+ paddw m2, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m3, Q1
+
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m2 ; p1
+ mova [dstq+mstrideq*1], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m3 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
+%endif ; %2==/!=v
+%else ; %1 == 4
+%if ARCH_X86_64
+%ifidn %2, v
+ mova [dstq+mstrideq*2], P1 ; p1
+ mova [dstq+mstrideq*1], P0 ; p0
+ mova [dstq+strideq*0], Q0 ; q0
+ mova [dstq+strideq*1], Q1 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
+%endif ; %2==/!=v
+%else ; x86-32
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m3
+ mova [dstq+mstrideq*1], m5
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m7
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
+%endif ; %2==/!=v
+%endif ; x86-32/64
+%endif ; %1
+%undef P3
+%undef P2
+%undef P1
+%undef P0
+%undef Q0
+%undef Q1
+%undef Q2
+%undef Q3
+%endmacro
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16 code
+%if ARCH_X86_64
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
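+    ; r7m is bitdepth_max (0x3ff for 10bpc, 0xfff for 12bpc), so
+    ; (bitdepth_max >> 7) & 16 picks pw_4 or pw_16 as bdmul (the two
+    ; constants are adjacent in RODATA for this reason)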
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r1-4 - p2-q0 post-filter16
+; r5 - p3
+; r6 - q3 post-filter16
+; r7 - GPRs [mask_bitsm, mstridem]
+; r8 - m12/pb_mask
+; r9 - bdmulq
+cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 10*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+%define pic_regm dword [esp+7*16+2*gprsize]
+ mov pic_regm, pic_regq
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+9*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mstridem dword [esp+7*16+1*gprsize]
+ mov mstridem, mstrideq
+%define mask_bitsm dword [esp+7*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+8*16]
+ mova m12, m0
+%endif
+
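+; each iteration handles two 4px units (16 bytes of dst); vmask[2]/[1]/[0]
+; select the 16/8/4-wide filters, mask_bits holds the two bits for the
+; current units, and both mask_bits and the pb_mask copy in m12 are shifted
+; left by 2 per iteration (the same structure recurs in the entry points
+; below)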
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16
+; r1-4 - p2-q0 post-filter16 backup
+; r5 - q3 post-filter16 backup
+; r6 - p3
+; r7-10 - p7-4
+; r11-14 - q4-7
+%if ARCH_X86_64
+cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r15 - GPRs [mask_bitsm]
+; r16 - m12/pb_mask
+; r17 - bdmulq
+; r18-24 - p2-q3
+cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 25*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+17*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+15*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+16*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter
+
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm, mstridem]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 3*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+%define mstridem dword [esp+1*gprsize]
+ mov mask_bitsm, 0x3
+ mov mstridem, mstrideq
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+; r3-8 - p2-q2
+cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 9*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter
+
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET