From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 21:33:14 +0200
Subject: Adding upstream version 115.7.0esr.

Signed-off-by: Daniel Baumann
---
 third_party/dav1d/src/x86/loopfilter16_sse.asm | 1793 ++++++++++++++++++++++++
 1 file changed, 1793 insertions(+)
 create mode 100644 third_party/dav1d/src/x86/loopfilter16_sse.asm

diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm
new file mode 100644
index 0000000000..c486b57a21
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm
@@ -0,0 +1,1793 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
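+
+; 10- and 12-bit (16bpc) loop filters, SSSE3, for 32- and 64-bit x86:
+; lpf_{h,v}_sb_y applies the wd=4/8/16 luma filters, lpf_{h,v}_sb_uv the
+; wd=4/6 chroma filters, selected per 4-pixel unit from the vmask bits,
+; with the E/I/H thresholds derived from the per-edge L value and the lut.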
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%if ARCH_X86_64 +%define PIC_sym(a) a +%else +%define PIC_base $$ +%define PIC_sym(a) pic_regq+a-PIC_base +%endif + +pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 + times 4 db 8, 9 + +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_3: times 8 dw 3 +; 4 and 16 need to be next to each other since they are used as alternates +; depending on whether bitdepth is 10 or 12 +pw_4: times 8 dw 4 +pw_16: times 8 dw 16 +pw_8: times 8 dw 8 +pw_4096: times 8 dw 4096 + +pb_mask: dd 1, 1, 2, 2 + +SECTION .text + +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 +%define extra_stack 2 +%else +%define extra_stack 0 +%endif +%endif + +%macro RELOC_ARGS 2 ; h/v, off +ASSERT ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + mov r5d, [rstk + stack_offset + 4*4 + 4] +%define lstridem [esp+%2+0*gprsize] + mov lstridem, r5d + mov r5d, [rstk + stack_offset + 4*5 + 4] +%define lutm [esp+%2+1*gprsize] + mov lutm, r5d + mov r5d, [rstk + stack_offset + 4*6 + 4] +%ifidn %1, v +%define wm [esp+%2+2*gprsize] + mov wm, r5d + mov r5d, [rstk + stack_offset + 4*3 + 4] +%define lm [esp+%2+3*gprsize] + mov lm, r5d +%else ; %1 == h +%define hm [esp+%2+2*gprsize] + mov hm, r5d +%endif ; %1==v + mov r5d, r7m +%define bdmulm [esp+%2+4*gprsize] + mov bdmulm, r5d +%else +%define lstridem r4m +%define lutm r5m +%ifidn %1, v +%define wm r6m +%define lm r3m +%else +%define hm r6m +%endif +%define bdmulm r7m +%endif ; STACK_ALIGNMENT +%endmacro + +%macro UNRELOC_ARGS 0 +%if ARCH_X86_32 +%undef lm +%undef lstridem +%undef wm +%undef hm +%undef lutm +%endif +%endmacro + +%macro SPLATD 2 + movd %1, %2 + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 + movd %1, %2 + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + +; in: out: +; mm%1 a b c d a e i m +; mm%2 e f g h b f j n +; mm%3 i j k l -> c g k o +; mm%4 m n o p d h l p +%macro TRANSPOSE4X4W 5 + punpcklwd m%5, m%1, m%2 + punpckhwd m%1, m%2 + punpcklwd m%2, m%3, m%4 + punpckhwd m%3, m%4 + punpckldq m%4, m%5, m%2 + punpckhdq m%5, m%2 + punpckldq m%2, m%1, m%3 + punpckhdq m%1, m%3 + + SWAP %1, %4 + SWAP %2, %5, %3 +%endmacro + +; in: out: +; m%1 a b c d e f g h a i q y 6 E M U +; m%2 i j k l m n o p b j r z 7 F N V +; m%3 q r s t u v w x c k s 0 8 G O W +; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; m%6 E F G H I J K L f n v 3 B J R Z +; m%7 M N O P Q R S T g o w 4 C K S + +; m%8 U V W X Y Z + = h p x 5 D L T = +%if ARCH_X86_64 +%macro TRANSPOSE8X8W 9 + ; m%1 a b c d e f g h a i q y b j r z + ; m%2 i j k l m n o p c k s 0 d l t 1 + ; m%3 q r s t u v w x -> e m u 2 f n v 3 + ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + TRANSPOSE4X4W %1, %2, %3, %4, %9 + + ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; m%6 E F G H I J K L 8 G O W 9 H P X + ; m%7 M N O P Q R S T -> A I Q Y B J R Z + ; m%8 U V W X Y Z + = C K S + D L T = + TRANSPOSE4X4W %5, %6, %7, %8, %9 + + ; m%1 a i q y b j r z a i q y 6 E M U + ; m%2 c k s 0 d l t 1 b j r z 7 F N V + ; m%3 e m u 2 f n v 3 c k s 0 8 G O W + ; m%4 g o w 4 h p x 5 d l t 1 9 H P X + ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; m%6 8 G O W 9 H P X f n v 3 B J R Z + ; m%7 A I Q Y B J R Z g o w 4 C K S + + ; m%8 C K S + D L T = h p x 5 D L T = + punpckhqdq m%9, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + punpckhqdq m%7, m%4, m%8 + punpcklqdq m%4, m%8 + + SWAP %8, %7, %4, %5, %3, %2, %9 +%endmacro +%else ; x86-32 +; input: 1-7 in registers, 8 in first memory [read-only] +; second memory is 
scratch, and may overlap with first or third memory +; output: 1-5,7-8 in registers, 6 in third memory [write-only] +%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment [2x] + TRANSPOSE4X4W %1, %2, %3, %4, %8 +%ifnidn %9, "" + mov%12 m%8, %9 +%else + mova m%8, %10 +%endif + mova %10, m%4 + TRANSPOSE4X4W %5, %6, %7, %8, %4 + punpckhqdq m%4, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + mova m%7, %10 +%ifnidn %11, "" + mov%13 %11, m%6 +%else + mova %10, m%6 +%endif + punpckhqdq m%6, m%7, m%8 + punpcklqdq m%7, m%8 + + ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8 + SWAP %2, %4, %5, %3 + SWAP %6, %8 +%endmacro +%endif ; x86-32/64 + +; transpose and write m8-11, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp + ; transpose 8x4 + punpcklwd %5, %1, %2 + punpckhwd %1, %2 + punpcklwd %2, %3, %4 + punpckhwd %3, %4 + punpckldq %4, %5, %2 + punpckhdq %5, %2 + punpckldq %2, %1, %3 + punpckhdq %1, %3 + + ; write out + movq [dstq+strideq*0-4], %4 + movhps [dstq+strideq*1-4], %4 + movq [dstq+strideq*2-4], %5 + movhps [dstq+stride3q -4], %5 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], %2 + movhps [dstq+strideq*1-4], %2 + movq [dstq+strideq*2-4], %1 + movhps [dstq+stride3q -4], %1 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 +%if ARCH_X86_64 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 + mova P1, [dstq+mstrideq*2] ; p1 + mova P0, [dstq+mstrideq*1] ; p0 + mova Q0, [dstq+strideq*0] ; q0 + mova Q1, [dstq+strideq*1] ; q1 +%else ; x86-32 +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%endif ; x86-32/64 +%else ; %1 != 4 + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] +%if ARCH_X86_64 + ; we load p3 later +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq+strideq*0] + mova Q1, [dstq+strideq*1] + mova Q2, [dstq+strideq*2] +%if %1 != 6 +%define P3 [tmpq+strideq*0] +%define Q3 m15 + mova Q3, [dstq+stride3q] +%endif ; %1 != 6 +%else ; x86-32 +%define P2 [tmpq+strideq*1] +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%define Q2 [dstq+strideq*2] +%if %1 != 6 +%define P3 [dstq+mstrideq*4] +%define Q3 [dstq+stride3q] +%endif ; %1 != 6 +%endif ; x86-32/64 +%endif ; %1 ==/!= 4 +%else ; %2 != v + ; load lines +%if %1 == 4 + movq m0, [dstq+strideq*0-4] + movq m2, [dstq+strideq*1-4] + movq m4, [dstq+strideq*2-4] + movq m5, [dstq+stride3q -4] + lea tmpq, [dstq+strideq*4] + movq m3, [tmpq+strideq*0-4] + movq m6, [tmpq+strideq*1-4] + movq m1, [tmpq+strideq*2-4] + movq m7, [tmpq+stride3q -4] + + ; transpose 4x8 + ; m0: A-D0 + ; m2: A-D1 + ; m4: A-D2 + ; m5: A-D3 + ; m3: A-D4 + ; m6: A-D5 + ; m1: A-D6 + ; m7: A-D7 + punpcklwd m0, m2 + punpcklwd m4, m5 + punpcklwd m3, m6 + punpcklwd m1, m7 + ; m0: A0-1,B0-1,C0-1,D0-1 + ; m4: A2-3,B2-3,C2-3,D2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m1: A6-7,B6-7,C6-7,D6-7 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + ; m0: A0-3,B0-3 + ; m2: C0-3,D0-3 + ; m3: A4-7,B4-7 + ; m4: C4-7,D4-7 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + punpckhqdq m3, m2, m4 + punpcklqdq m2, m4 + ; m0: A0-7 + ; m1: B0-7 + 
; m2: C0-7 + ; m3: D0-7 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%else +%define P1 [esp+3*mmsize] +%define P0 [esp+4*mmsize] +%define Q0 [esp+5*mmsize] +%define Q1 [esp+6*mmsize] + mova P1, m0 + mova P0, m1 + mova Q0, m2 + mova Q1, m3 +%endif +%elif %1 == 6 || %1 == 8 + movu m0, [dstq+strideq*0-8] + movu m1, [dstq+strideq*1-8] + movu m2, [dstq+strideq*2-8] + movu m3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu m4, [tmpq+strideq*0-8] + movu m5, [tmpq+strideq*1-8] + movu m6, [tmpq+strideq*2-8] +%if ARCH_X86_64 + movu m7, [tmpq+stride3q -8] +%endif + + ; transpose 8x16 + ; m0: A-H0,A-H8 + ; m1: A-H1,A-H9 + ; m2: A-H2,A-H10 + ; m3: A-H3,A-H11 + ; m4: A-H4,A-H12 + ; m5: A-H5,A-H13 + ; m6: A-H6,A-H14 + ; m7: A-H7,A-H15 +%if ARCH_X86_64 + punpcklwd m8, m0, m1 +%else + punpcklwd m7, m0, m1 +%endif + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 +%if ARCH_X86_64 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 +%else + mova [rsp+3*16], m4 + movu m4, [tmpq+stride3q -8] + punpcklwd m5, m6, m4 + punpckhwd m6, m4 +%endif + ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32] + ; m0: E0-1,F0-1,G0-1,H0-1 + ; m1: A2-3,B2-3,C2-3,D2-3 + ; m2: E2-3,F2-3,G2-3,H2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32] + ; m5: A6-7,B6-7,C6-7,D6-7 + ; m6: E6-7,F6-7,G6-7,H6-7 +%if ARCH_X86_64 + punpckldq m7, m8, m1 + punpckhdq m8, m1 +%else + punpckldq m4, m7, m1 + punpckhdq m7, m1 +%endif + punpckldq m1, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 +%if ARCH_X86_64 + punpckldq m5, m4, m6 + punpckhdq m4, m6 +%else + mova [rsp+4*16], m3 + mova m3, [rsp+3*16] + punpckldq m5, m3, m6 + punpckhdq m3, m6 +%endif + ; m7: A0-3,B0-3 [m4 on x86-32] + ; m8: C0-3,D0-3 [m7 on x86-32] + ; m1: E0-3,F0-3 + ; m0: G0-3,H0-3 + ; m2: A4-7,B4-7 + ; m3: C4-7,D4-7 [r4 on x86-32] + ; m5: E4-7,F4-7 + ; m4: G4-7,H4-7 [m3 on x86-32] +%if ARCH_X86_64 +%if %1 != 6 + punpcklqdq m6, m7, m2 +%endif + punpckhqdq m7, m2 + punpcklqdq m2, m8, m3 + punpckhqdq m8, m3 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 +%if %1 != 6 + punpckhqdq m5, m0, m4 +%endif + punpcklqdq m0, m4 +%if %1 == 8 + mova [rsp+1*16], m6 +%define P3 [rsp+1*16] +%endif + ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15 + SWAP 7, 13 + SWAP 8, 2, 9 + SWAP 3, 10 + SWAP 1, 11 + SWAP 0, 14 + SWAP 5, 15 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%if %1 == 8 +%define Q3 m15 +%endif +%else ; x86-32 +%if %1 == 8 +%define P3 [rsp+ 6*16] + punpcklqdq m6, m4, m2 + mova P3, m6 +%endif + mova m6, [rsp+4*16] + punpckhqdq m4, m2 + punpcklqdq m2, m7, m6 + punpckhqdq m7, m6 + punpcklqdq m6, m1, m5 + punpckhqdq m1, m5 +%if %1 == 8 +%define Q3 [rsp+24*16] + punpckhqdq m5, m0, m3 + mova Q3, m5 +%endif + punpcklqdq m0, m3 +%if %1 == 8 +%define P2 [rsp+18*16] +%define P1 [rsp+19*16] +%define P0 [rsp+20*16] +%define Q0 [rsp+21*16] +%define Q1 [rsp+22*16] +%define Q2 [rsp+23*16] +%else +%define P2 [rsp+3*16] +%define P1 [rsp+4*16] +%define P0 [rsp+5*16] +%define Q0 [rsp+6*16] +%define Q1 [rsp+7*16] +%define Q2 [rsp+8*16] +%endif + mova P2, m4 + mova P1, m2 + mova P0, m7 + mova Q0, m6 + mova Q1, m1 + mova Q2, m0 +%endif ; x86-32/64 +%else ; %1 == 16 + ; We only use 14 pixels but we'll need the remainder at the end for + ; the second transpose + mova m0, [dstq+strideq*0-16] + mova m1, [dstq+strideq*1-16] + mova m2, [dstq+strideq*2-16] + mova m3, [dstq+stride3q -16] + lea tmpq, [dstq+strideq*4] 
+ mova m4, [tmpq+strideq*0-16] + mova m5, [tmpq+strideq*1-16] + mova m6, [tmpq+strideq*2-16] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q -16] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + SWAP 5, 13 + SWAP 6, 8 + SWAP 7, 9 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%else ; x86-32 +%define P2 [esp+18*16] +%define P1 [esp+19*16] +%define P0 [esp+20*16] + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q -16], P2, "", a, a + mova P1, m6 + mova P0, m7 +%endif ; x86-32/64 + mova [rsp+ 7*16], m0 + mova [rsp+ 8*16], m1 + mova [rsp+ 9*16], m2 + mova [rsp+10*16], m3 +%define P3 [rsp+6*16] + mova P3, m4 + + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + mova m2, [dstq+strideq*2] + mova m3, [dstq+stride3q ] + lea tmpq, [dstq+strideq*4] + mova m4, [tmpq+strideq*0] + mova m5, [tmpq+strideq*1] + mova m6, [tmpq+strideq*2] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q ] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10 + SWAP 0, 10 + SWAP 1, 11 + SWAP 2, 14 + SWAP 3, 15 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%define Q3 m15 +%else ; x86-32 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q ], [rsp+12*16], "", a, a +%define Q0 [esp+21*16] +%define Q1 [esp+22*16] +%define Q2 [esp+23*16] +%define Q3 [esp+24*16] + mova Q0, m0 + mova Q1, m1 + mova Q2, m2 + mova Q3, m3 +%endif ; x86-32/64 + + mova [rsp+11*16], m4 +%if ARCH_X86_64 + mova [rsp+12*16], m5 +%endif + mova [rsp+13*16], m6 + mova [rsp+14*16], m7 +%endif ; %1 == 4/6/8/16 +%endif ; %2 ==/!= v + + ; load L/E/I/H +%if ARCH_X86_32 +%define l_strideq r5 + mov l_strideq, dword lstridem +%ifidn %2, v +%define lq r3 + mov lq, dword lm +%endif +%endif +%ifidn %2, v +%if cpuflag(sse4) + pmovzxbw m1, [lq] + pmovzxbw m0, [lq+l_strideq] + pxor m2, m2 +%else ; ssse3 + movq m1, [lq] + movq m0, [lq+l_strideq] + pxor m2, m2 + REPX {punpcklbw x, m2}, m1, m0 +%endif ; ssse3/sse4 +%else ; %2 != v + movq m0, [lq] ; l0, l1 + movq m1, [lq+l_strideq] ; l2, l3 + punpckldq m0, m1 ; l0, l2, l1, l3 + pxor m2, m2 + punpcklbw m1, m0, m2 ; l0, l2 + punpckhbw m0, m2 ; l1, l3 +%endif ; %2==/!=v +%if ARCH_X86_32 +%ifidn %2, v +%undef lq + mov mstrideq, mstridem +%endif +%endif + pcmpeqw m5, m2, m0 + pand m1, m5 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1] + pcmpeqw m5, m2, m0 ; !L + psrlw m5, 1 +%if ARCH_X86_64 + psrlw m2, m0, [lutq+128] + SPLATW m1, [lutq+136] +%else ; x86-32 + mov r5, lutm + psrlw m2, m0, [r5+128] + SPLATW m1, [r5+136] +%endif ; x86-32/64 + pminsw m2, m1 + pmaxsw m2, [PIC_sym(pw_1)] ; I + psrlw m1, m0, 4 ; H + paddw m0, [PIC_sym(pw_2)] + paddw m0, m0 + paddw m0, m2 ; E + REPX {pmullw x, [bdmulq]}, m0, m1, m2 +%if ARCH_X86_32 +%undef l_strideq + lea stride3q, [strideq*3] +%endif + + psubw m3, P1, P0 ; p1-p0 + psubw m4, Q0, Q1 ; q0-q1 + REPX {pabsw x, x}, m3, m4 + pmaxsw m3, m5 + pmaxsw m3, m4 + pcmpgtw m7, m3, m1 ; hev +%if %1 != 4 + psubw m4, P2, P0 ; p2-p0 + pabsw m4, m4 + pmaxsw m4, m3 +%if %1 != 6 + mova m6, P3 ; p3 + psubw m5, m6, P0 ; p3-p0 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + psubw m5, Q0, Q2 ; q0-q2 + pabsw m5, m5 + pmaxsw m4, m5 +%if %1 != 6 + psubw m5, Q0, Q3 ; q0-q3 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + pcmpgtw m4, [bdmulq] ; !flat8in + + psubw m5, P2, P1 ; p2-p1 + pabsw m5, m5 +%if %1 != 6 + psubw m6, P2 ; p3-p2 + pabsw m6, m6 + pmaxsw m5, m6 + psubw m6, Q2, Q3 ; q2-q3 + pabsw m6, m6 + pmaxsw m5, m6 +%endif ; %1 != 6 + psubw m6, Q2, Q1 ; q2-q1 + pabsw m6, m6 + pmaxsw m5, m6 + +%if %1 == 16 + SPLATD m6, [maskq+8] + SPLATD m1, [maskq+4] + por m6, m1 + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 +%else ; %1 != 16 + SPLATD m6, [maskq+4] + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 ; only apply fm-wide to wd>4 blocks +%endif ; %1==/!=16 + pmaxsw m3, m5 +%endif ; %1 != 4 + pcmpgtw m3, m2 + + psubw m5, P1, Q1 ; p1-q1 + psubw m6, P0, Q0 ; p0-q0 + REPX {pabsw x, x}, m5, m6 + paddw m6, m6 + psrlw m5, 1 + paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m3, m5 + +%if %1 == 16 + +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] + mova m1, [tmpq+strideq*2] + mova m2, [tmpq+stride3q] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m1, [rsp+ 9*16] + mova m2, [rsp+10*16] +%endif ; %2==/!=v + REPX {psubw x, P0}, m0, m1, m2 + REPX {pabsw x, x}, m0, m1, m2 + pmaxsw m1, m0 + pmaxsw m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] + mova m2, [tmpq+strideq*1] + mova m5, [tmpq+strideq*2] +%else ; %2 != v + mova m0, [rsp+11*16] + mova m2, [rsp+12*16] + mova m5, [rsp+13*16] +%endif ; %2==/!=v + REPX {psubw x, Q0}, m0, m2, m5 + REPX {pabsw x, x}, m0, m2, m5 + pmaxsw m0, m2 + pmaxsw m1, m5 + pmaxsw m1, m0 + pcmpgtw m1, [bdmulq] ; !flat8out + por m1, m4 ; !flat8in | !flat8out + SPLATD m2, [maskq+8] + pand m5, m2, m12 + pcmpeqd m5, m12 + pandn m1, m5 ; flat16 + pandn m5, m3, m1 ; flat16 & fm + SWAP 1, 5 + + SPLATD m5, [maskq+4] + por m5, m2 + pand m2, m5, m12 + pcmpeqd m2, m12 + pandn m4, m2 ; flat8in + pandn m2, m3, m4 + SWAP 2, 4 + SPLATD m2, [maskq+0] + por m2, m5 + pand m2, m12 + pcmpeqd m2, m12 + pandn m3, m2 + pandn m0, m4, m3 ; fm & !flat8 & !flat16 + SWAP 0, 3 + pandn m0, m1, m4 ; flat8 & !flat16 + SWAP 0, 4 +%elif %1 != 4 + SPLATD m0, [maskq+4] + pand m2, m0, m12 + pcmpeqd m2, m12 + pandn m4, m2 + pandn m2, m3, m4 ; flat8 & fm + SWAP 2, 4 + SPLATD m2, [maskq+0] + por m0, m2 + pand m0, m12 + pcmpeqd m0, m12 + pandn m3, m0 + pandn m0, m4, m3 ; fm & !flat8 + SWAP 0, 3 +%else ; %1 == 4 + SPLATD m0, [maskq+0] + pand m0, m12 + pcmpeqd m0, m12 + pandn m3, m0 ; fm +%endif ; %1==/!=4 + + ; short filter +%if ARCH_X86_64 + SPLATW m0, r7m +%else + SPLATW m0, bdmulm +%endif + pcmpeqw m2, m2 + psrlw m0, 1 ; 511 or 2047 + pxor m2, m0 ; 
-512 or -2048 + + psubw m5, Q0, P0 ; q0-p0 + paddw m6, m5, m5 + paddw m6, m5 ; 3*(q0-p0) + psubw m5, P1, Q1 ; iclip_diff(p1-q1) + pminsw m5, m0 + pmaxsw m5, m2 + pand m5, m7 ; f=iclip_diff(p1-q1)&hev + paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f) + pminsw m5, m0 + pmaxsw m5, m2 + pand m3, m5 ; f&=fm + paddw m5, m3, [PIC_sym(pw_3)] + paddw m3, [PIC_sym(pw_4)] + REPX {pminsw x, m0}, m5, m3 + psraw m5, 3 ; f2 + psraw m3, 3 ; f1 + psubw m0, m2 ; 1023 or 4095 + pxor m2, m2 +%if ARCH_X86_64 + paddw P0, m5 + psubw Q0, m3 +%else + paddw m5, P0 + psubw m6, Q0, m3 + REPX {pminsw x, m0}, m5, m6 + REPX {pmaxsw x, m2}, m5, m6 +%endif + + paddw m3, [PIC_sym(pw_1)] + psraw m3, 1 ; f=(f1+1)>>1 + pandn m7, m3 ; f&=!hev + SWAP 7, 3 +%if ARCH_X86_64 + paddw P1, m3 + psubw Q1, m3 + REPX {pminsw x, m0}, P1, P0, Q0, Q1 + REPX {pmaxsw x, m2}, P1, P0, Q0, Q1 +%else + psubw m7, Q1, m3 + paddw m3, P1 + REPX {pminsw x, m0}, m7, m3 + REPX {pmaxsw x, m2}, m7, m3 +%if %1 > 4 + mova P1, m3 + mova P0, m5 + mova Q0, m6 + mova Q1, m7 +%endif +%endif + +%if %1 == 16 + +; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16 +; m12=filter bits mask +; m13-15=p2/q2/q3 +; m0,2-3,5-7 = free + + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 + mova m6, [tmpq+strideq*4] ; p3 + lea tmpq, [dstq+mstrideq*4] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m2, [rsp+ 9*16] + mova m7, [rsp+10*16] + mova m6, [rsp+ 6*16] +%endif ; %2==/!=v + + mova [rsp+ 0*16], m4 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psllw m3, m0, 3 ; p6*8 + paddw m3, [PIC_sym(pw_8)] + paddw m5, m2, m7 ; p5+p4 + psubw m3, m0 + paddw m5, m5 ; (p5+p4)*2 + paddw m3, m6 ; p6*7+p3 + paddw m5, P2 ; (p5+p4)*2+p2 + paddw m3, P1 ; p6*7+p3+p1 + paddw m5, P0 ; (p5+p4)*2+p2+p0 + paddw m3, Q0 ; p6*7+p3+p1+q0 + paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m2 + por m5, m4 +%ifidn %2, v + mova [tmpq+mstrideq*2], m5 ; p5 +%else ; %2 != v + mova [rsp+9*16], m5 +%endif ; %2==/!=v + + ; sub p6*2, add p3/q1 + paddw m3, m6 + paddw m5, m0, m0 + paddw m3, Q1 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m7 + por m5, m4 +%ifidn %2, v + mova [tmpq+mstrideq*1], m5 ; p4 +%else ; %2 != v + mova [rsp+10*16], m5 +%endif ; %2==/!=v + + ; sub p6/p5, add p2/q2 + psubw m3, m0 + paddw m5, P2, Q2 + psubw m3, m2 + paddw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m6 + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*0], m5 ; p3 +%else ; %2 != v + mova [rsp+6*16], m5 +%endif ; %2==/!=v + +%define WRITE_IN_PLACE 0 +%ifidn %2, v +%if ARCH_X86_64 +%define WRITE_IN_PLACE 1 +%endif +%endif + + ; sub p6/p4, add p1/q3 + paddw m3, P1 + paddw m5, m0, m7 + paddw m3, Q3 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P2 + por m5, m4 +%if WRITE_IN_PLACE + mova [tmpq+strideq*1], m5 +%else + mova [rsp+1*16], m5 ; don't clobber p2/m13 +%endif + + ; sub p6/p3, add p0/q4 + paddw m3, P0 + paddw m5, m0, m6 +%ifidn %2, v + paddw m3, [dstq+strideq*4] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P1 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*2], m5 +%else + mova [rsp+2*16], m5 ; don't clobber p1/m3 +%endif + + ; sub p6/p2, add q0/q5 + paddw m3, Q0 + paddw m5, m0, P2 +%ifidn %2, v +%if ARCH_X86_32 + lea r4, P2 +%endif + lea tmpq, [dstq+strideq*4] + paddw m3, [tmpq+strideq*1] +%else ; %2 != v + paddw m3, [rsp+12*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 
+ pand m5, m1 + pandn m4, m1, P0 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*1], m5 +%else + mova [rsp+3*16], m5 ; don't clobber p0/m4 +%endif + + ; sub p6/p1, add q1/q6 + paddw m3, Q1 + paddw m5, m0, P1 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else ; %2 != v + mova m0, [rsp+13*16] ; q6 +%endif ; %2==/!=v + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, Q0 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq], m5 +%else + mova [rsp+4*16], m5 ; don't clobber q0/m5 +%endif + + ; sub p5/p0, add q2/q6 + paddw m3, Q2 + paddw m5, m2, P0 + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, Q1 + por m2, m5, m4 ; don't clobber q1/m6 + + ; sub p4/q0, add q3/q6 + paddw m3, Q3 + paddw m7, Q0 + paddw m3, m0 + psubw m3, m7 + psrlw m7, m3, 4 + pand m7, m1 + pandn m4, m1, Q2 + por m7, m4 ; don't clobber q2/m14 + + ; sub p3/q1, add q4/q6 +%ifidn %2, v + paddw m3, [tmpq+strideq*0] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + paddw m6, Q1 + paddw m3, m0 + psubw m3, m6 + psrlw m6, m3, 4 + pand m6, m1 + pandn m4, m1, Q3 + por m6, m4 +%if WRITE_IN_PLACE + mova [tmpq+mstrideq], m6 ; q3 +%else ; %2 != v + mova [rsp+5*16], m6 +%endif ; %2==/!=v + + ; sub p2/q2, add q5/q6 +%ifidn %2, v + paddw m3, [tmpq+strideq*1] +%if ARCH_X86_64 + paddw m5, P2, Q2 +%else + ; because tmpq is clobbered, so we use a backup pointer for P2 instead + paddw m5, [r4], Q2 + mov pic_regq, pic_regm +%endif +%else ; %2 != v + paddw m3, [rsp+12*16] + paddw m5, P2, Q2 +%endif ; %2==/!=v + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 +%ifidn %2, v + pandn m4, m1, [tmpq+strideq*0] +%else ; %2 != v + pandn m4, m1, [rsp+11*16] +%endif ; %2==/!=v + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*0], m5 ; q4 +%else ; %2 != v + mova [rsp+11*16], m5 +%endif ; %2==/!=v + + ; sub p1/q3, add q6*2 + psubw m3, P1 + paddw m0, m0 + psubw m3, Q3 + paddw m3, m0 + psrlw m5, m3, 4 + pand m5, m1 +%ifidn %2, v + pandn m4, m1, [tmpq+strideq*1] +%else ; %2 != v + pandn m4, m1, [rsp+12*16] +%endif ; %2==/!=v + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*1], m5 ; q5 +%else ; %2 != v + mova [rsp+12*16], m5 +%endif ; %2==/!=v + + mova m4, [rsp+0*16] +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%endif +%if ARCH_X86_64 + SWAP 2, 11 + SWAP 7, 14 + SWAP 6, 15 +%else ; x86-32 + mova Q1, m2 + mova Q2, m7 +%endif ; x86-32/64 +%if WRITE_IN_PLACE + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq] +%elif ARCH_X86_64 + mova P2, [rsp+1*16] + mova P1, [rsp+2*16] + mova P0, [rsp+3*16] + mova Q0, [rsp+4*16] +%else ; !WRITE_IN_PLACE & x86-32 + mova m0, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m3, [rsp+4*16] + mova m7, [rsp+5*16] + mova P2, m0 + mova P1, m1 + mova P0, m2 + mova Q0, m3 + mova Q3, m7 +%endif ; WRITE_IN_PLACE / x86-32/64 +%undef WRITE_IN_PLACE +%endif ; %1 == 16 + +%if %1 >= 8 + + ; flat8 filter + mova m0, P3 ; p3 + paddw m1, m0, P2 ; p3+p2 + paddw m2, P1, P0 ; p1+p0 + paddw m3, m1, m1 ; 2*(p3+p2) + paddw m2, m0 ; p1+p0+p3 + paddw m3, Q0 ; 2*(p3+p2)+q0 + paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0 + pmulhrsw m7, m2, [PIC_sym(pw_4096)] + psubw m7, P2 + pand m7, m4 + + paddw m3, P1, Q1 ; p1+q1 + psubw m2, m1 ; 2*p3+p2+p1+p0+q0 + paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1 + pmulhrsw m3, m2, [PIC_sym(pw_4096)] + psubw m3, P1 + pand m3, m4 + + paddw m5, m0, P1 ; p3+p1 + paddw m6, P0, Q2 ; p0+q2 + psubw m2, m5 ; p3+p2+p1+p0+q0+q1 + paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2 + pmulhrsw m5, m2, [PIC_sym(pw_4096)] + psubw m5, P0 + pand 
m5, m4 + + paddw m6, m0, P0 ; p3+p0 + paddw m1, Q0, Q3 ; q0+q3 + psubw m2, m6 ; p2+p1+p0+q0+q1+q2 + paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3 + pmulhrsw m6, m2, [PIC_sym(pw_4096)] + psubw m6, Q0 + pand m6, m4 + + paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3 + paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3 + paddw m1, P2, Q0 ; p2+q0 + psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3 + pmulhrsw m1, m2, [PIC_sym(pw_4096)] + psubw m1, Q1 + pand m1, m4 + + psubw m2, P1 ; p0+q0+2*q1+q2+2*q3 + psubw m2, Q1 ; p0+q0+q1+q2+2*q3 + paddw m0, Q3, Q2 ; q3+q2 + paddw m2, m0 ; p0+q0+q1+2*q2+3*q3 + pmulhrsw m2, [PIC_sym(pw_4096)] + psubw m2, Q2 + pand m2, m4 + + paddw m7, P2 + paddw m3, P1 + paddw m5, P0 + paddw m6, Q0 + paddw m1, Q1 + paddw m2, Q2 + +%ifidn %2, v + mova [tmpq+strideq*1], m7 ; p2 + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m5 ; p0 + mova [dstq+strideq*0], m6 ; q0 + mova [dstq+strideq*1], m1 ; q1 + mova [dstq+strideq*2], m2 ; q2 +%else ; %2 != v + mova m0, P3 + +%if %1 == 8 + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + SWAP 4, 15 + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8 +%else + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \ + Q3, [tmpq+strideq*1-8], a, u +%endif + + ; write 8x8 + movu [dstq+strideq*0-8], m0 + movu [dstq+strideq*1-8], m7 + movu [dstq+strideq*2-8], m3 + movu [dstq+stride3q -8], m5 + movu [tmpq+strideq*0-8], m6 +%if ARCH_X86_64 + movu [tmpq+strideq*1-8], m1 +%endif + movu [tmpq+strideq*2-8], m2 + movu [tmpq+stride3q -8], m4 + lea dstq, [dstq+strideq*8] +%else ; %1 != 8 +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 +%else + mova [rsp+1*16], m6 + mova [rsp+2*16], m1 + mova [rsp+3*16], m2 +%endif + + mova m1, [rsp+ 7*16] + mova m2, [rsp+ 8*16] + mova m4, [rsp+ 9*16] + mova m6, [rsp+10*16] + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11 +%else + mova [rsp+7*16], m5 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \ + [rsp+7*16], [tmpq+strideq*1-16], a, a +%endif + + mova [dstq+strideq*0-16], m1 + mova [dstq+strideq*1-16], m2 + mova [dstq+strideq*2-16], m4 + mova [dstq+stride3q -16], m6 + mova [tmpq+strideq*0-16], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1-16], m7 +%endif + mova [tmpq+strideq*2-16], m3 + mova [tmpq+stride3q -16], m5 + +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 4, 15 +%else + mova m6, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m4, Q3 +%endif + mova m0, [rsp+11*16] + mova m3, [rsp+12*16] + mova m5, [rsp+13*16] +%if ARCH_X86_64 + mova m7, [rsp+14*16] + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8 +%else + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \ + [rsp+14*16], [tmpq+strideq*1], a, a +%endif + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + mova [tmpq+strideq*0], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1], m3 +%endif + mova [tmpq+strideq*2], m5 + mova [tmpq+stride3q ], m7 + lea dstq, [dstq+strideq*8] +%endif ; %1==/!=8 +%endif ; %2==/!=v +%elif %1 == 6 + ; flat6 filter + paddw m3, P1, P0 ; p1+p0 + paddw m3, P2 ; p2+p1+p0 + paddw m6, P2, Q0 ; p2+q0 + paddw m3, m3 ; 2*(p2+p1+p0) + paddw m3, m6 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m3, [PIC_sym(pw_4096)] + psubw m2, P1 + pand m2, m4 + + paddw m3, Q0 ; p2+2*(p2+p1+p0+q0) + paddw m6, P2, P2 ; 2*p2 + paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1 + psubw m3, m6 ; p2+2*(p1+p0+q0)+q1 + pmulhrsw m5, m3, [PIC_sym(pw_4096)] + psubw m5, P0 + pand m5, m4 + + paddw m3, Q1 ; p2+2*(p1+p0+q0+q1) + paddw m6, P2, P1 ; p2+p1 + paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2 + psubw m3, m6 ; p1+2*(p0+q0+q1)+q2 + pmulhrsw m6, m3, 
[PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ psubw m3, P1 ; 2*(p0+q0+q1)+q2
+%if ARCH_X86_64
+ paddw Q2, Q2 ; q2*2
+%else
+ mova m0, Q2
+ paddw m0, m0
+%endif
+ psubw m3, P0 ; p0+2*(q0+q1)+q2
+%if ARCH_X86_64
+ paddw m3, Q2 ; p0+2*(q0+q1+q2)+q2
+%else
+ paddw m3, m0
+%endif
+ pmulhrsw m3, [PIC_sym(pw_4096)]
+ psubw m3, Q1
+ pand m3, m4
+
+ paddw m2, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m3, Q1
+
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m2 ; p1
+ mova [dstq+mstrideq*1], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m3 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
+%endif ; %2==/!=v
+%else ; %1 == 4
+%if ARCH_X86_64
+%ifidn %2, v
+ mova [dstq+mstrideq*2], P1 ; p1
+ mova [dstq+mstrideq*1], P0 ; p0
+ mova [dstq+strideq*0], Q0 ; q0
+ mova [dstq+strideq*1], Q1 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
+%endif ; %2==/!=v
+%else ; x86-32
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m3
+ mova [dstq+mstrideq*1], m5
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m7
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
+%endif ; %2==/!=v
+%endif ; x86-32/64
+%endif ; %1
+%undef P3
+%undef P2
+%undef P1
+%undef P0
+%undef Q0
+%undef Q1
+%undef Q2
+%undef Q3
+%endmacro

INIT_XMM ssse3
; stack layout:
; r0 - flat8 backup inside flat16 code
%if ARCH_X86_64
cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
 dst, stride, mask, l, l_stride, lut, \
 w, stride3, mstride, tmp, mask_bits, bdmul
 mov r6d, r7m
 sar r6d, 7
 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
 lea bdmulq, [pw_4]
 add bdmulq, r6
 mov wd, wm
 shl l_strideq, 2
 sub lq, l_strideq
%else
; stack layout [32bit only]:
; r1-4 - p2-q0 post-filter16
; r5 - p3
; r6 - q3 post-filter16
; r7 - GPRs [mask_bitsm, mstridem]
; r8 - m12/pb_mask
; r9 - bdmulq
cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
 dst, stride, mask, mstride, pic_reg, stride3, tmp
 RELOC_ARGS v, 10*16
%if STACK_ALIGNMENT >= 16
 mov r5d, r7m
%endif
 sar r5d, 7
 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
 LEA pic_regq, PIC_base
%define pic_regm dword [esp+7*16+2*gprsize]
 mov pic_regm, pic_regq
 mova m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+9*16
 mova [bdmulq], m0
 shl dword lstridem, 2
 sub r3, dword lstridem
 mov dword lm, r3
%endif
 mov mstrideq, strideq
 neg mstrideq
 lea stride3q, [strideq*3]
%if ARCH_X86_64
 mov mask_bitsd, 0x3
 mova m12, [pb_mask]
%else
%define mstridem dword [esp+7*16+1*gprsize]
 mov mstridem, mstrideq
%define mask_bitsm dword [esp+7*16+0*gprsize]
 mov mask_bitsm, 0x3
 mova m0, [PIC_sym(pb_mask)]
%define m12 [esp+8*16]
 mova m12, m0
%endif

.loop:
%if ARCH_X86_64
 test [maskq+8], mask_bitsd ; vmask[2]
%else
 mov r6d, mask_bitsm
 test [maskq+8], r6d
%endif
 jz .no_flat16

 FILTER 16, v
 jmp .end

.no_flat16:
%if ARCH_X86_64
 test [maskq+4], mask_bitsd ; vmask[1]
%else
 test [maskq+4], r6d
%endif
 jz .no_flat

 FILTER 8, v
 jmp .end

.no_flat:
%if ARCH_X86_64
 test [maskq+0], mask_bitsd ; vmask[0]
%else
 test [maskq+0], r6d
%endif
 jz .end

 FILTER 4, v

.end:
%if ARCH_X86_64
 pslld m12, 2
 add lq, 8
%else
 mova m0, m12
 pslld m0, 2
 mova m12, m0
 add dword lm, 8
%endif
 add dstq, 16
%if ARCH_X86_64
 shl mask_bitsd, 2
 sub wd, 2
%else
 shl mask_bitsm, 2
 sub dword wm, 2
%endif
 jg .loop
%undef mask_bitsm
%undef bdmulq
 UNRELOC_ARGS
 RET

INIT_XMM ssse3
; stack layout:
; r0 - flat8 backup inside flat16
; r1-4 - p2-q0 post-filter16 backup
; r5 - q3 
post-filter16 backup +; r6 - p3 +; r7-10 - p7-4 +; r11-14 - q4-7 +%if ARCH_X86_64 +cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov hd, hm + shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r15 - GPRs [mask_bitsm] +; r16 - m12/pb_mask +; r17 - bdmulq +; r18-24 - p2-q3 +cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 25*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+17*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif + sub lq, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+15*16+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+16*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+8], mask_bitsd ; vmask[2] +%else + mov r6d, mask_bitsm + test [maskq+8], r6d +%endif + jz .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 8, h + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: +%if ARCH_X86_64 + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm, mstridem] +; r1 - m12/pb_mask +; r2 - bdmulq +cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \ + dst, stride, mask, mstride, pic_reg, stride3, tmp + RELOC_ARGS v, 3*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 + sub r3, dword lstridem + mov dword lm, r3 +%endif + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] +%define mstridem dword [esp+1*gprsize] + mov mask_bitsm, 0x3 + mov mstridem, mstrideq + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 6, v + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .end + + FILTER 4, v + +.end: +%if ARCH_X86_64 + pslld m12, 2 + add lq, 8 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add dword lm, 8 +%endif + add dstq, 16 
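+ ; dst advances 16 bytes = 8 pixels per iteration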
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+; r3-8 - p2-q2
+cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 9*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif

+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat

+ FILTER 6, h
+ jmp .end

+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter

+ FILTER 4, h
+ jmp .end

+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
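
For orientation, here is a scalar sketch of the short (wd=4) filter that the SIMD above vectorizes, following the math spelled out in the asm comments. The names (short_filter_px, clip) are illustrative, hev ("high edge variance") and the per-edge filter mask fm are assumed to be computed by the caller as in the asm, and the fm gating (pand m3, m5 ; f&=fm) is omitted; this is a one-pixel sketch, not the eight-lane implementation:

    #include <stdint.h>

    static int clip(int v, int lo, int hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* Short filter across one edge: p1 p0 | q0 q1.
     * bitdepth is 10 or 12; the delta f is clamped to the signed range
     * set up via psrlw/pxor above (+-512 at 10bpc, +-2048 at 12bpc). */
    static void short_filter_px(uint16_t *p1, uint16_t *p0,
                                uint16_t *q0, uint16_t *q1,
                                int hev, int bitdepth)
    {
        const int max  = (1 << bitdepth) - 1; /* 1023 or 4095  */
        const int fmax = max >> 1;            /* 511 or 2047   */
        const int fmin = -fmax - 1;           /* -512 or -2048 */

        int f = hev ? clip(*p1 - *q1, fmin, fmax) : 0;
        f = clip(3 * (*q0 - *p0) + f, fmin, fmax);

        const int f1 = clip(f + 4, fmin, fmax) >> 3; /* applied to q0 */
        const int f2 = clip(f + 3, fmin, fmax) >> 3; /* applied to p0 */
        *p0 = clip(*p0 + f2, 0, max);
        *q0 = clip(*q0 - f1, 0, max);

        if (!hev) { /* spread half of f1 to the outer pixels */
            const int f3 = (f1 + 1) >> 1;
            *p1 = clip(*p1 + f3, 0, max);
            *q1 = clip(*q1 - f3, 0, max);
        }
    }

The asm only applies the upper clamp (pminsw) before the arithmetic shifts, which is equivalent here since f + 4 cannot underflow fmin once f itself is clamped.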