Diffstat (limited to 'third_party/dav1d/src/x86/looprestoration_sse.asm')
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_sse.asm  3681
1 file changed, 3681 insertions(+), 0 deletions(-)
diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm
new file mode 100644
index 0000000000..01eb6fa348
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_sse.asm
@@ -0,0 +1,3681 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
+wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+pb_right_ext_mask: times 24 db 0xff
+ times 8 db 0
+pb_1: times 16 db 1
+pb_3: times 16 db 3
+pw_256: times 8 dw 256
+pw_2056: times 8 dw 2056
+pw_m16380: times 8 dw -16380
+pd_4096: times 4 dd 4096
+pd_34816: times 4 dd 34816
+pd_0xffff: times 4 dd 0xffff
+pd_0xf00800a4: times 4 dd 0xf00800a4
+pd_0xf00801c7: times 4 dd 0xf00801c7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%macro movif64 2 ; dst, src
+ %if ARCH_X86_64
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movif32 2 ; dst, src
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
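+; movif64/movif32 expand to a plain mov on the matching architecture and to
+; nothing on the other, so the 32-bit and 64-bit paths below can share one
+; instruction stream (e.g. "movif32 dstq, dstm" reloads the spilled dst
+; pointer only in the 32-bit build).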
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
+ %assign pic_reg_stk_off 4
+ %xdefine PIC_reg %1
+ %if %2 == 1
+ mov [esp], %1
+ %endif
+ LEA PIC_reg, PIC_base_offset
+ %if %3 == 1
+ XCHG_PIC_REG
+ %endif
+ %endmacro
+
+ %macro XCHG_PIC_REG 0
+ mov [esp+pic_reg_stk_off], PIC_reg
+ %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
+ mov PIC_reg, [esp+pic_reg_stk_off]
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 0
+ %endmacro
+
+ %define PIC_sym(sym) (sym)
+%endif
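+; On x86-32 a base register is pointed at the section start ($$) and
+; PIC_sym(sym) addresses symbols relative to it; XCHG_PIC_REG swaps that
+; register between the PIC base and a value saved on the stack. On x86-64
+; PIC_sym is a no-op since symbols can be addressed RIP-relative.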
+
+%macro WIENER 0
+%if ARCH_X86_64
+DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
+cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt, x
+ %define tmpstrideq strideq
+ %define base 0
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ movq m14, [fltq]
+ add lpfq, wq
+ movq m7, [fltq+16]
+ add dstq, wq
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m8, [wiener_shufA]
+ pshufd m12, m14, q2222 ; x0 x0
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+%else
+ mova m10, [pw_m16380]
+ punpcklwd m14, m14
+ pshufd m11, m14, q0000 ; x0
+ pshufd m12, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+DECLARE_REG_TMP 4, 0, _, 5
+%if cpuflag(ssse3)
+ %define m10 [base+wiener_shufC]
+ %define m11 [base+wiener_shufD]
+ %define stk_off 96
+%else
+ %define m10 [base+pw_m16380]
+ %define m11 [stk+96]
+ %define stk_off 112
+%endif
+cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
+ %define base r6-pb_right_ext_mask-21
+ %define stk esp
+ %define dstq leftq
+ %define edgeb byte edged
+ %define edged [stk+ 8]
+ %define dstmp [stk+12]
+ %define hd dword [stk+16]
+ %define wq [stk+20]
+ %define strideq [stk+24]
+ %define leftmp [stk+28]
+ %define t2 [stk+32]
+ %define t4 [stk+36]
+ %define t5 [stk+40]
+ %define t6 [stk+44]
+ %define m8 [base+wiener_shufA]
+ %define m9 [base+wiener_shufB]
+ %define m12 [stk+48]
+ %define m13 [stk+64]
+ %define m14 [stk+80]
+ %define m15 [base+pw_2056]
+ mov r1, r6m ; flt
+ mov r0, r0m ; dst
+ mov r4, r4m ; w
+ mov lpfq, lpfm
+ mov r2, r7m ; edge
+ mov r5, r5m ; h
+ movq m3, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r4
+ mov r1, r1m ; stride
+ add lpfq, r4
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r4*2+stk_off]
+ mov hd, r5
+ neg r4
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r4
+ mov strideq, r1
+ mov leftmp, r2
+ mov r4, r1
+%if cpuflag(ssse3)
+ pshufb m3, [base+wiener_init]
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q3333
+ punpcklqdq m3, m3
+%else
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m11, m0
+%endif
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+%endif
+ psllw m7, 5
+ pshufd m6, m7, q0000 ; y0 y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ add t3, tmpstrideq
+ mov [rsp], t3 ; below
+ mov t4, t1
+ add t1, 384*2
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ RET
+.no_top:
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ lea t3, [t3+tmpstrideq*2]
+ mov [rsp], t3
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+.v2:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ jmp .v1
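+; .extend_right pads a row past its last valid pixel: the final pixel (byte 3
+; of [lpfq-4]) is broadcast, and a sliding all-ones/all-zeros mask loaded
+; from pb_right_ext_mask at an offset derived from x keeps the in-bounds
+; bytes of m4/m5 and fills the rest with the replicated edge pixel.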
+.extend_right:
+ movd m2, [lpfq-4]
+%if ARCH_X86_64
+ push r0
+ lea r0, [pb_right_ext_mask+21]
+ movu m0, [r0+xq+0]
+ movu m1, [r0+xq+8]
+ pop r0
+%else
+ movu m0, [r6+xq+0]
+ movu m1, [r6+xq+8]
+%endif
+%if cpuflag(ssse3)
+ pshufb m2, [base+pb_3]
+%else
+ punpcklbw m2, m2
+ pshuflw m2, m2, q3333
+ punpcklqdq m2, m2
+%endif
+ pand m4, m0
+ pand m5, m1
+ pandn m0, m2
+ pandn m1, m2
+ por m4, m0
+ por m5, m1
+ ret
+.h:
+ %define stk esp+4 ; offset due to call
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+%macro %%h7 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m8
+ pmaddubsw m0, m12
+ pshufb m1, m5, m8
+ pmaddubsw m1, m12
+ pshufb m2, m4, m9
+ pmaddubsw m2, m13
+ pshufb m3, m5, m9
+ pmaddubsw m3, m13
+ paddw m0, m2
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ paddw m1, m3
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m0, m2
+ mova m2, [base+pw_m16380]
+ paddw m1, m3
+ paddw m4, m2
+ paddw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 1
+ pslldq m1, m4, 1
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ psrldq m1, m4, 2
+ pslldq m2, m4, 2
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m12
+ paddw m0, m1
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m10
+ paddsw m0, m2
+ psrldq m1, m5, 1
+ pslldq m2, m5, 1
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m11
+ psrldq m2, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m2, m3
+ punpckhbw m4, m3
+ paddw m2, m4
+ pmullw m2, m12
+ paddw m1, m2
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m10
+ paddsw m1, m4
+%endif
+%endmacro
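+; %%h7 evaluates the 7-tap horizontal Wiener filter for 16 pixels: SSSE3
+; gathers the tap pairs with pshufb and multiply-accumulates them with
+; pmaddubsw, while the SSE2 path widens bytes to words and uses pmullw. The
+; center pixel is also added in separately (<<7, with the pw_m16380 bias and
+; a saturating add), and the caller's >>3 plus pw_2056 keeps the intermediate
+; rows within 16-bit range for the vertical pass.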
+ %%h7
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ %%h7
+%if ARCH_X86_64
+ mova m2, [t4+xq*2]
+ paddw m2, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m2, [r2+xq*2]
+ mov r2, t2
+ paddw m2, [r2+xq*2]
+ mov r2, t5
+%endif
+ mova m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m5, [t5+xq*2]
+%else
+ mova m5, [r2+xq*2]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ paddw m4, m0, [t6+xq*2]
+%else
+ paddw m4, m0, [r2+xq*2]
+ mov r2, t4
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m0, m3
+ mova m3, [t3+xq*2+16]
+ paddd m4, m2
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+ mova m5, [t5+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t5
+ mova m5, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2+16]
+ packuswb m0, m4
+%if ARCH_X86_64
+ paddw m4, m1, [t6+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, r1
+%endif
+ ret
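+; Each .hv pass rotates the ring buffer of row pointers: t1 now points at the
+; row just written, t2..t6 at the rows above it, and t0 takes the oldest
+; buffer so the next call can overwrite it.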
+%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
+.v:
+ mov xq, wq
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+xq*2]
+ paddw m1, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m1, [r2+xq*2]
+ mov r2, t2
+ paddw m1, [r2+xq*2]
+ mov r2, t6
+%endif
+ mova m2, [t3+xq*2]
+ mova m4, [t1+xq*2]
+%if ARCH_X86_64
+ paddw m3, m4, [t6+xq*2]
+ paddw m4, [t5+xq*2]
+%else
+ paddw m3, m4, [r2+xq*2]
+ mov r2, t5
+ paddw m4, [r2+xq*2]
+ mov r2, t4
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m6
+ punpckhwd m3, m4
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ mova m3, [t3+xq*2+16]
+ mova m5, [t1+xq*2+16]
+%if ARCH_X86_64
+ paddw m4, m5, [t6+xq*2+16]
+ paddw m5, [t5+xq*2+16]
+%else
+ paddw m4, m5, [r2+xq*2+16]
+ mov r2, t5
+ paddw m5, [r2+xq*2+16]
+ movifnidn dstq, dstmp
+%endif
+ packuswb m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ add dstq, strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+%endif
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ ret
+%endif
+
+%if ARCH_X86_64
+cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt, x
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ movq m14, [fltq]
+ add lpfq, wq
+ movq m7, [fltq+16]
+ add dstq, wq
+ mova m8, [pw_m16380]
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+ mova m12, [wiener_l_shuf]
+%else
+ punpcklwd m14, m14
+ pshufd m11, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+%if cpuflag(ssse3)
+ %define stk_off 80
+%else
+ %define m11 [stk+80]
+ %define stk_off 96
+%endif
+cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
+ %define stk esp
+ %define leftmp [stk+28]
+ %define m8 [base+pw_m16380]
+ %define m12 [base+wiener_l_shuf]
+ %define m14 [stk+48]
+ mov r1, r6m ; flt
+ mov r0, r0m ; dst
+ mov r4, r4m ; w
+ mov lpfq, lpfm
+ mov r2, r7m ; edge
+ mov r5, r5m ; h
+ movq m2, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r4
+ mov r1, r1m ; stride
+ add lpfq, r4
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r4*2+stk_off]
+ mov hd, r5
+ neg r4
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r4
+ mov strideq, r1
+ mov leftmp, r2
+ mov r4, r1
+%if cpuflag(ssse3)
+ pshufb m2, [base+wiener_init]
+ pshufd m1, m2, q3333
+ punpcklqdq m2, m2
+%else
+ punpcklwd m2, m2
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m11, m0
+%endif
+ mova m13, m1
+ mova m14, m2
+%endif
+ psllw m7, 5
+ pshufd m6, m7, q0000 ; __ y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea xq, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ mov t3, t1
+ add t1, 384*2
+ add xq, tmpstrideq
+ mov [rsp], xq ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ lea t3, [t3+tmpstrideq*2]
+ mov [rsp], t3
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ add dstq, strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ movifnidn dstmp, dstq
+.v1:
+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ jmp .end
+.h:
+ %define stk esp+4
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+.h_have_right:
+%macro %%h5 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m9
+ pmaddubsw m0, m13
+ pshufb m1, m5, m9
+ pmaddubsw m1, m13
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m8
+ paddw m5, m8
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 2
+ pslldq m1, m4, 2
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m8
+ paddsw m0, m2
+ psrldq m1, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m11
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m8
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+.hv_have_right:
+ %%h5
+ mova m2, [t3+xq*2]
+ paddw m2, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ mova m3, [t2+xq*2]
+ paddw m4, m0, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m3, [r2+xq*2]
+ mov r2, t4
+ paddw m4, m0, [r2+xq*2]
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+xq*2+16]
+ paddw m2, [t1+xq*2+16]
+ packuswb m0, m4
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, m1, [t4+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ movifnidn dstmp, dstq
+ ret
+%if cpuflag(ssse3)
+.v:
+ mov xq, wq
+.v_loop:
+ mova m3, [t1+xq*2]
+ paddw m1, m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m2, [t2+xq*2]
+ paddw m3, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m2, [r2+xq*2]
+ mov r2, t4
+ paddw m3, [r2+xq*2]
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3
+ pmaddwd m2, m6
+ punpckhwd m3, m3
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+ mova m4, [t1+xq*2+16]
+ paddw m2, m4, [t3+xq*2+16]
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, [t4+xq*2+16]
+%else
+ paddw m4, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ packuswb m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ ret
+%endif
+%endmacro
+
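+; The whole Wiener implementation lives in the WIENER macro and is assembled
+; twice: cpuflag(ssse3) selects at assembly time between the pshufb/pmaddubsw
+; paths and the plain SSE2 fallbacks.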
+INIT_XMM sse2
+WIENER
+
+INIT_XMM ssse3
+WIENER
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; self-guided ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro GATHERDD 3 ; dst, src, tmp
+ movd %3d, %2
+ %if ARCH_X86_64
+ movd %1, [r13+%3]
+ pextrw %3d, %2, 2
+ pinsrw %1, [r13+%3+2], 3
+ pextrw %3d, %2, 4
+ pinsrw %1, [r13+%3+2], 5
+ pextrw %3d, %2, 6
+ pinsrw %1, [r13+%3+2], 7
+ %else
+ movd %1, [base+sgr_x_by_x-0xf03+%3]
+ pextrw %3, %2, 2
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
+ pextrw %3, %2, 4
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
+ pextrw %3, %2, 6
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
+ %endif
+%endmacro
+
+%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
+ %if ARCH_X86_64
+ %define tmp r14
+ %else
+ %define tmp %4
+ %endif
+ GATHERDD %1, %2, tmp
+ GATHERDD %2, %3, tmp
+ movif32 %4, %5
+ psrld %1, 24
+ psrld %2, 24
+ packssdw %1, %2
+%endmacro
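+; SSE has no gather instruction, so GATHERDD emulates one: each lane's index
+; is extracted to a GPR (movd/pextrw) and the corresponding sgr_x_by_x table
+; data is inserted back one lane at a time (movd/pinsrw). GATHER_X_BY_X then
+; shifts each dword down so only the looked-up table byte remains and packs
+; the two results into words.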
+
+%macro MULLD 3 ; dst, src, tmp
+ pmulhuw %3, %1, %2
+ pmullw %1, %2
+ pslld %3, 16
+ paddd %1, %3
+%endmacro
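+; MULLD emulates a per-lane 32-bit multiply (low 32 bits) without SSE4.1's
+; pmulld. Every caller passes a multiplier that fits in 16 bits and is
+; replicated into both halves of each dword, so pmullw/pmulhuw yield the low
+; and high partial products and adding the high part shifted left by 16
+; reconstructs the product mod 2^32.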
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 0, 1, 2, 3, 5
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 5*16
+ %else
+ %assign extra_stack 3*16
+ %endif
+cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*0+4*6]
+ %define stridemp dword [esp+calloff+16*0+4*7]
+ %define leftm dword [esp+calloff+16*3+4*0]
+ %define lpfm dword [esp+calloff+16*3+4*1]
+ %define w0m dword [esp+calloff+16*3+4*2]
+ %define hd dword [esp+calloff+16*3+4*3]
+ %define edgeb byte [esp+calloff+16*3+4*4]
+ %define edged dword [esp+calloff+16*3+4*4]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t0m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define m8 [base+pb_1]
+ %define m9 [esp+calloff+16*2]
+ %define m10 [base+pd_0xf00800a4]
+ %define m11 [base+sgr_lshuf5]
+ %define m12 [base+pd_34816]
+ %define m13 [base+pb_0to15]
+ %define r10 r4
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+DECLARE_REG_TMP 8, 7, 9, 11, 12
+cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ mov edged, r7m
+ movu m9, [paramsq]
+ add lpfq, wq
+ mova m8, [pb_1]
+ lea t1, [rsp+wq*2+20]
+ mova m10, [pd_0xf00800a4]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*12+16]
+ mova m12, [pd_34816] ; (1 << 11) + (1 << 15)
+ lea t4, [rsp+wq*2+400*20+16]
+ pshufhw m7, m9, q0000
+ pshufb m9, [pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ neg wq
+ mova m13, [pb_0to15]
+ pxor m6, m6
+ mova m11, [sgr_lshuf5]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ movu m1, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*20+16]
+ mov t3m, t3
+ pshufhw m7, m1, q0000
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 2
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ movif32 t2m, t1
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t0m, t2
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, stridemp
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ call .h
+ add lpfq, stridemp
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+ sub hd, 2
+ movif32 t0, t0m
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .h_top
+ add lpfq, stridemp
+ call .hv_bottom
+.end:
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ movif32 dstq, dstm
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ lea t2, [t1+400*6]
+ movif32 t2m, t2
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ movif32 t0m, t0
+ jmp .main
+.no_top_height1:
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v
+ call .prep_n
+ jmp .odd_height_end
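+; This .extend_right builds its mask arithmetically: the remaining (negative)
+; x count is broadcast and compared against pb_0to15, giving 0xff for lanes
+; still inside the row, and out-of-range lanes are filled with the last pixel
+; broadcast from [lpfq-1]. The stack_offset/calloff %assigns account for the
+; return addresses on the stack (two call levels deep inside this helper, one
+; in the .h/.hv bodies that follow) so the esp-relative %defines stay valid
+; on x86-32.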
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movd m1, wd
+ movd m3, [lpfq-1]
+ pshufb m1, m6
+ pshufb m3, m6
+ psubb m2, m8, m1
+ pcmpgtb m2, m13
+ pand m5, m2
+ pandn m2, m3
+ por m5, m2
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq-1]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -10
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+wq*2+400*0]
+ paddd m1, [t1+wq*2+400*2]
+ paddd m2, [t1+wq*2+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+wq*2+400*0], m0
+ mova [t1+wq*2+400*2], m1
+ mova [t1+wq*2+400*4], m2
+ add wq, 8
+ jl .h_loop
+ ret
+.top_fixup:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv_main
+.hv_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .hv_main
+.hv_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv_loop_start
+%endif
+.hv_loop:
+ movif32 lpfq, hvsrcm
+.hv_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -10
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t3, hd
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+%if ARCH_X86_64
+ test hd, hd
+%else
+ test t3, t3
+%endif
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+wq*2+400*0] ; hv sum
+ paddd m4, [t2+wq*2+400*2] ; hv sumsq
+ paddd m5, [t2+wq*2+400*4]
+ mova [t0+wq*2+400*0], m0
+ pslld m0, m4, 4
+ mova [t0+wq*2+400*2], m2
+ mova [t0+wq*2+400*4], m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ movif32 t2m, t2
+ movif32 t0m, t0
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+wq*2+400*0], m1
+ paddw m1, m0
+ mova [t1+wq*2+400*2], m4
+ paddd m4, m2
+ mova [t1+wq*2+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m2, [t1+wq*2+400*2]
+ mova m3, [t1+wq*2+400*4]
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m4, m2, [t2+wq*2+400*2]
+ paddd m5, m3, [t2+wq*2+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ pslld m0, m4, 4
+ paddd m5, m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v_loop
+ ret
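+; prep_n/n0/n1 implement the neighbor weighting of the 5x5 filter: the a/b
+; values of three horizontally adjacent positions are combined with weights
+; 5:6:5 (the "565" in the comments). Even output rows (n0) add the freshly
+; computed 565 row to the one stored by the previous pass and shift by 9,
+; odd rows (n1) reuse the stored row alone and shift by 8.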
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+ 2]
+ movu m3, [t4+wq*2+ 4]
+ movu m1, [t3+wq*4+ 4]
+ movu m4, [t3+wq*4+ 8]
+ movu m2, [t3+wq*4+20]
+ movu m5, [t3+wq*4+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*2+ 0]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+wq*2+400*2+ 0], m0
+ mova [t3+wq*4+400*4+ 0], m1
+ mova [t3+wq*4+400*4+16], m2
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*2+ 2]
+ movu m3, [t4+wq*2+ 4]
+ movu m1, [t3+wq*4+ 4]
+ movu m4, [t3+wq*4+ 8]
+ movu m2, [t3+wq*4+20]
+ movu m5, [t3+wq*4+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*2+ 0]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+wq*2+400*2+ 0]
+ paddd m4, m1, [t3+wq*4+400*4+ 0]
+ paddd m5, m2, [t3+wq*4+400*4+16]
+ mova [t4+wq*2+400*2+ 0], m0
+ mova [t3+wq*4+400*4+ 0], m1
+ mova [t3+wq*4+400*4+16], m2
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movq m0, [dstq+wq]
+ mova m3, [t4+wq*2+400*2+ 0]
+ mova m4, [t3+wq*4+400*4+ 0]
+ mova m5, [t3+wq*4+400*4+16]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 7)
+ psubd m5, m3
+ psrad m4, 8
+ psrad m5, 8
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 4*16
+ %else
+ %assign extra_stack 2*16
+ %endif
+cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*2+4*0]
+ %define stridemp dword [esp+calloff+16*2+4*1]
+ %define leftm dword [esp+calloff+16*2+4*2]
+ %define lpfm dword [esp+calloff+16*2+4*3]
+ %define w0m dword [esp+calloff+16*2+4*4]
+ %define hd dword [esp+calloff+16*2+4*5]
+ %define edgeb byte [esp+calloff+16*2+4*6]
+ %define edged dword [esp+calloff+16*2+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %define m8 [base+pb_0to15]
+ %define m9 [esp+calloff+16*1]
+ %define m10 [base+pd_0xf00801c7]
+ %define m11 [base+pd_34816]
+ %define m12 m6
+ %define m13 [base+sgr_lshuf3]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ mov hd, hm
+ mov edged, r7m
+ movq m9, [paramsq+4]
+ add lpfq, wq
+ lea t1, [rsp+wq*2+12]
+ mova m8, [pb_0to15]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*12+8]
+ mova m10, [pd_0xf00801c7]
+ lea t4, [rsp+wq*2+400*32+8]
+ mova m11, [pd_34816]
+ pshuflw m7, m9, q3333
+ pshufb m9, [pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ neg wq
+ pxor m6, m6
+ mova m13, [sgr_lshuf3]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ movq m1, [r1+4]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*32+16]
+ mov t3m, t3
+ pshuflw m7, m1, q3333
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 2
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movd m0, [lpfq-1]
+ movd m1, wd
+ mova m3, m8
+ pshufb m0, m6
+ pshufb m1, m6
+ mova m2, m6
+ psubb m2, m1
+ pcmpgtb m2, m3
+ pand m5, m2
+ pandn m2, m0
+ por m5, m2
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -9
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+wq*2+400*0], m1
+ mova [t1+wq*2+400*2], m2
+ mova [t1+wq*2+400*4], m3
+ add wq, 8
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m5, [lpfq+wq]
+.hv0_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -9
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+ mova [t1+wq*2+400*0], m1
+ mova [t1+wq*2+400*2], m2
+ mova [t1+wq*2+400*4], m3
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m5, [lpfq+wq]
+.hv1_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -9
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m4, m2, [t2+wq*2+400*2]
+ paddd m5, m3, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m2
+ mova [t2+wq*2+400*4], m3
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+400*2 +4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
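+; The 3x3 filter weights its neighbors per row: "444" is the sum of three
+; adjacent a/b values scaled by 4, and "343" is that sum minus the two outer
+; values (weights 3,4,3). Each output row combines the 343 rows of the lines
+; above and below with the 444 row of its own line.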
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+400*0+ 4]
+ movu m1, [t3+wq*4+400*0+ 8]
+ movu m2, [t3+wq*4+400*0+24]
+ movu m3, [t4+wq*2+400*0+ 2]
+ movu m4, [t3+wq*4+400*0+ 4]
+ movu m5, [t3+wq*4+400*0+20]
+ paddw m0, [t4+wq*2+400*0+ 0]
+ paddd m1, [t3+wq*4+400*0+ 0]
+ paddd m2, [t3+wq*4+400*0+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[-1] 444
+ pslld m4, 2 ; b[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a[-1] 343
+ psubd m4, m1 ; b[-1] 343
+ psubd m5, m2
+ mova [t4+wq*2+400*4], m3
+ mova [t3+wq*4+400*8+ 0], m4
+ mova [t3+wq*4+400*8+16], m5
+ movu m0, [t4+wq*2+400*2+ 4]
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m2, [t3+wq*4+400*4+24]
+ movu m3, [t4+wq*2+400*2+ 2]
+ movu m4, [t3+wq*4+400*4+ 4]
+ movu m5, [t3+wq*4+400*4+20]
+ paddw m0, [t4+wq*2+400*2+ 0]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m2, [t3+wq*4+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[ 0] 444
+ pslld m4, 2 ; b[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*2+400* 6], m3
+ mova [t3+wq*4+400*12+ 0], m4
+ mova [t3+wq*4+400*12+16], m5
+ psubw m3, m0 ; a[ 0] 343
+ psubd m4, m1 ; b[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*2+400* 8], m3
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m5
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m3, [t4+wq*2+400*0+4]
+ movu m1, [t4+wq*2+400*0+2]
+ paddw m3, [t4+wq*2+400*0+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*2+400*4]
+ paddw m3, [t4+wq*2+400*6]
+ mova [t4+wq*2+400*4], m2
+ mova [t4+wq*2+400*6], m1
+ movu m4, [t3+wq*4+400*0+8]
+ movu m1, [t3+wq*4+400*0+4]
+ paddd m4, [t3+wq*4+400*0+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*4+400* 8+ 0]
+ paddd m4, [t3+wq*4+400*12+ 0]
+ mova [t3+wq*4+400* 8+ 0], m2
+ mova [t3+wq*4+400*12+ 0], m1
+ movu m5, [t3+wq*4+400*0+24]
+ movu m1, [t3+wq*4+400*0+20]
+ paddd m5, [t3+wq*4+400*0+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*4+400* 8+16]
+ paddd m5, [t3+wq*4+400*12+16]
+ mova [t3+wq*4+400* 8+16], m2
+ mova [t3+wq*4+400*12+16], m1
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*2+400*2+4]
+ movu m1, [t4+wq*2+400*2+2]
+ paddw m3, [t4+wq*2+400*2+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*2+400*6]
+ paddw m3, [t4+wq*2+400*8]
+ mova [t4+wq*2+400*6], m1
+ mova [t4+wq*2+400*8], m2
+ movu m4, [t3+wq*4+400*4+8]
+ movu m1, [t3+wq*4+400*4+4]
+ paddd m4, [t3+wq*4+400*4+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*4+400*12+ 0]
+ paddd m4, [t3+wq*4+400*16+ 0]
+ mova [t3+wq*4+400*12+ 0], m1
+ mova [t3+wq*4+400*16+ 0], m2
+ movu m5, [t3+wq*4+400*4+24]
+ movu m1, [t3+wq*4+400*4+20]
+ paddd m5, [t3+wq*4+400*4+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*4+400*12+16]
+ paddd m5, [t3+wq*4+400*16+16]
+ mova [t3+wq*4+400*12+16], m1
+ mova [t3+wq*4+400*16+16], m2
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 10*16
+ %else
+ %assign extra_stack 8*16
+ %endif
+cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*8+4*0]
+ %define stridemp dword [esp+calloff+16*8+4*1]
+ %define leftm dword [esp+calloff+16*8+4*2]
+ %define lpfm dword [esp+calloff+16*8+4*3]
+ %define w0m dword [esp+calloff+16*8+4*4]
+ %define hd dword [esp+calloff+16*8+4*5]
+ %define edgeb byte [esp+calloff+16*8+4*6]
+ %define edged dword [esp+calloff+16*8+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %xdefine m8 m6
+ %define m9 [base+pd_0xffff]
+ %define m10 [base+pd_34816]
+ %define m11 [base+pd_0xf00801c7]
+ %define m12 [base+pd_0xf00800a4]
+ %define m13 [esp+calloff+16*4]
+ %define m14 [esp+calloff+16*5]
+ %define m15 [esp+calloff+16*6]
+ %define m6 [esp+calloff+16*7]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ mov edged, r7m
+ mova m15, [paramsq]
+ add lpfq, wq
+ mova m9, [pd_0xffff]
+ lea t1, [rsp+wq*2+44]
+ mova m10, [pd_34816]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*24+40]
+ mova m11, [pd_0xf00801c7]
+ lea t4, [rsp+wq*2+400*52+40]
+ mova m12, [base+pd_0xf00800a4]
+ neg wq
+ pshuflw m13, m15, q0000
+ pshuflw m14, m15, q2222
+ pshufhw m15, m15, q1010
+ punpcklqdq m13, m13 ; s0
+ punpcklqdq m14, m14 ; s1
+ punpckhqdq m15, m15 ; w0 w1
+ pxor m6, m6
+ psllw m15, 2
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ mova m2, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+52]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*24+48]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*52+48]
+ mov t3m, t3
+ mov t4m, t4
+ neg wq
+ pshuflw m0, m2, q0000
+ pshuflw m1, m2, q2222
+ pshufhw m2, m2, q1010
+ punpcklqdq m0, m0 ; s0
+ punpcklqdq m1, m1 ; s1
+ punpckhqdq m2, m2 ; w0 w1
+ mov w1m, wd
+ pxor m3, m3
+ psllw m2, 2
+ mova m13, m0
+ mova m14, m1
+ sub wd, 2
+ mova m15, m2
+ mova m6, m3
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+%if ARCH_X86_64
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
+%else
+ mov wq, w0m
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
+%endif
+ add t1, 400*12
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hd, hd
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+wq*2+400* 0]
+ mova m1, [t1+wq*2+400* 2]
+ mova m2, [t1+wq*2+400* 4]
+ paddw m0, m0
+ mova m3, [t1+wq*2+400* 6]
+ paddd m1, m1
+ mova m4, [t1+wq*2+400* 8]
+ paddd m2, m2
+ mova m5, [t1+wq*2+400*10]
+ mova [t2+wq*2+400* 0], m0
+ mova [t2+wq*2+400* 2], m1
+ mova [t2+wq*2+400* 4], m2
+ mova [t2+wq*2+400* 6], m3
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+ add wq, 8
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
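+; Right-edge handling: when LR_HAVE_RIGHT is not set and the current
+; 8-pixel block reaches the end of the row, replicate the last valid
+; pixel into the out-of-bounds lanes of m5.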
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+%if ARCH_X86_64
+ SWAP m8, m6
+%endif
+ movd m1, wd
+ movd m3, [lpfq-1]
+ pshufb m1, m8
+ pshufb m3, m8
+ psubb m2, [base+pb_1], m1
+ pcmpgtb m2, [base+pb_0to15]
+ pand m5, m2
+ pandn m2, m3
+ por m5, m2
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
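+; .h computes the horizontal 3-tap and 5-tap box sums (sum3/sum5) and
+; the matching sums of squares (sumsq3/sumsq5) for one row, 8 pixels
+; per iteration, and stores them in the t1 row buffers.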
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq-1]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .h_have_right
+ cmp wd, -10
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m7, m0, m8
+ pmaddwd m7, m7
+ punpckhwd m0, m8
+ pmaddwd m0, m0
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ paddd m2, m7 ; sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ mova [t1+wq*2+400* 6], m1
+ mova [t1+wq*2+400* 8], m2
+ mova [t1+wq*2+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m7, m2 ; sumsq5
+ paddd m5, m3
+ mova [t1+wq*2+400* 0], m8
+ mova [t1+wq*2+400* 2], m7
+ mova [t1+wq*2+400* 4], m5
+ add wq, 8
+ jl .h_loop
+ ret
+ALIGN function_align
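+; Even rows: accumulate this row's horizontal sums into the vertical
+; accumulators and derive a3/b3 only; a5/b5 for the 5x5 pass is
+; produced once per row pair, in .hv1.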
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv0_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .hv0_have_right
+ cmp wd, -10
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ movif32 t3, t3m
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m7, m0, m8
+ pmaddwd m7, m7
+ punpckhwd m0, m8
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ pmaddwd m0, m0
+ paddd m2, m7 ; h sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m7, m2 ; h sumsq5
+ paddd m5, m3
+ mova [t3+wq*4+400*8+ 8], m8
+ mova [t3+wq*4+400*0+ 8], m7
+ mova [t3+wq*4+400*0+24], m5
+ paddw m8, [t1+wq*2+400* 0]
+ paddd m7, [t1+wq*2+400* 2]
+ paddd m5, [t1+wq*2+400* 4]
+ mova [t1+wq*2+400* 0], m8
+ mova [t1+wq*2+400* 2], m7
+ mova [t1+wq*2+400* 4], m5
+ paddw m0, m1, [t1+wq*2+400* 6]
+ paddd m4, m2, [t1+wq*2+400* 8]
+ paddd m5, m3, [t1+wq*2+400*10]
+ mova [t1+wq*2+400* 6], m1
+ mova [t1+wq*2+400* 8], m2
+ mova [t1+wq*2+400*10], m3
+ paddw m1, m0, [t2+wq*2+400* 6]
+ paddd m2, m4, [t2+wq*2+400* 8]
+ paddd m3, m5, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m0
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .hv0_loop
+ ret
+ALIGN function_align
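+; Odd rows: as .hv0, but the two-row 5x5 sums are completed here and
+; a5/b5 is derived as well, after which the t1/t2 row buffers are
+; swapped.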
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv1_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .hv1_have_right
+ cmp wd, -10
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m7, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m7, m3
+ punpcklwd m0, m7, m3
+ pmaddwd m0, m0
+ punpckhwd m7, m3
+ pmaddwd m7, m7
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m8
+ pmaddwd m1, m1
+ punpckhwd m3, m8
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ palignr m5, m4, 8
+ punpckhwd m1, m4, m5
+ paddw m8, m4, m5
+ pmaddwd m1, m1
+ punpcklwd m4, m5
+ pmaddwd m4, m4
+ paddd m7, m3
+ paddw m5, m2, [t2+wq*2+400* 6]
+ mova [t2+wq*2+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+wq*2+400* 8]
+ paddd m3, m7, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 8], m0
+ mova [t2+wq*2+400*10], m7
+ paddd m4, m0 ; h sumsq5
+ paddd m1, m7
+ pslld m0, m2, 3
+ pslld m7, m3, 3
+ paddd m2, m0 ; a3 * 9
+ paddd m3, m7
+%if ARCH_X86_32
+ mova [esp+20], m8
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ punpcklwd m0, m5, m8 ; b3
+ pmaddwd m7, m0, m0
+ punpckhwd m5, m8
+ pmaddwd m8, m5, m5
+ psubd m2, m7 ; p3
+ psubd m3, m8
+ MULLD m2, m14, m8 ; p3 * s1
+ MULLD m3, m14, m8
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m5, m11
+ paddusw m2, m11
+ paddusw m3, m11
+ psrld m2, 20 ; min(z3, 255)
+ movif32 t3, t3m
+ psrld m3, 20
+ GATHER_X_BY_X m8, m2, m3, r0, dstm
+ punpcklwd m2, m8, m8
+ punpckhwd m3, m8, m8
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ psrld m0, 12
+ psrld m5, 12
+ mova [t4+wq*2+400*4+ 4], m8
+ mova [t3+wq*4+400*8+ 8], m0
+ mova [t3+wq*4+400*8+24], m5
+%if ARCH_X86_32
+ mova m8, [esp+20]
+%else
+ SWAP m6, m8
+ pxor m6, m6
+%endif
+ paddw m5, m8, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m1, [t2+wq*2+400*4]
+ paddw m5, [t1+wq*2+400*0]
+ paddd m2, [t1+wq*2+400*2]
+ paddd m3, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m8
+ pslld m0, m2, 4
+ mova [t2+wq*2+400*2], m4
+ pslld m8, m3, 4
+ mova [t2+wq*2+400*4], m1
+ pslld m4, m2, 3
+ paddd m2, m0
+ pslld m7, m3, 3
+ paddd m3, m8
+ paddd m2, m4 ; a5 * 25
+ paddd m3, m7
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ punpcklwd m0, m5, m7 ; b5
+ pmaddwd m4, m0, m0
+ punpckhwd m5, m7
+ pmaddwd m1, m5, m5
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ psubd m2, m4 ; p5
+ psubd m3, m1
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m5, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m1, m2, m3, r0, dstm
+ punpcklwd m2, m1, m1
+ punpckhwd m3, m1, m1
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ mova [t4+wq*2+4], m1
+ psrld m0, 12
+ psrld m5, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m5
+ add wq, 8
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
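+; .v0/.v1 are vertical-only counterparts of .hv0/.hv1: they reuse the
+; sums already held in t1/t2/t3 instead of reading a new input row
+; (first/last rows and missing bottom edge).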
+.v0: ; vertical boxsums + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq*2+400* 6]
+ mova m4, [t1+wq*2+400* 8]
+ mova m5, [t1+wq*2+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq*2+400* 6]
+ paddd m2, m4, [t2+wq*2+400* 8]
+ paddd m3, m5, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m0
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ psubd m5, m3
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ mova [t3+wq*4+400*8+ 8], m3
+ mova [t3+wq*4+400*0+ 8], m4
+ mova [t3+wq*4+400*0+24], m5
+ paddw m3, m3 ; sum5 * 2
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+wq*2+400*0], m3
+ mova [t1+wq*2+400*2], m4
+ mova [t1+wq*2+400*4], m5
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m4, [t1+wq*2+400* 6]
+ mova m5, [t1+wq*2+400* 8]
+ mova m7, [t1+wq*2+400*10]
+ paddw m1, m4, [t2+wq*2+400* 6]
+ paddd m2, m5, [t2+wq*2+400* 8]
+ paddd m3, m7, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m4
+ mova [t2+wq*2+400* 8], m5
+ mova [t2+wq*2+400*10], m7
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ psubd m5, m3
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*4+4], m3
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova m4, [t3+wq*4+400*8+ 8]
+ mova m5, [t3+wq*4+400*0+ 8]
+ mova m7, [t3+wq*4+400*0+24]
+ paddw m1, m4, [t2+wq*2+400*0]
+ paddd m2, m5, [t2+wq*2+400*2]
+ paddd m3, m7, [t2+wq*2+400*4]
+ paddw m1, [t1+wq*2+400*0]
+ paddd m2, [t1+wq*2+400*2]
+ paddd m3, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m4
+ mova [t2+wq*2+400*2], m5
+ mova [t2+wq*2+400*4], m7
+ pslld m4, m2, 4
+ mova [t3+wq*4+400*8+ 8], m0
+ pslld m5, m3, 4
+ mova [t3+wq*4+400*8+24], m8
+ pslld m7, m2, 3
+ paddd m2, m4
+ pslld m8, m3, 3
+ paddd m3, m5
+ paddd m2, m7 ; a5 * 25
+ paddd m3, m8
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ punpcklwd m0, m1, m7 ; b5
+ pmaddwd m4, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ psubd m3, m5
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m4, m2, m3, r0, dstm
+ punpcklwd m2, m4, m4
+ punpckhwd m3, m4, m4
+ MULLD m0, m2, m7
+ MULLD m1, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+4], m4
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
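+; .prep_n seeds the neighbor sums for the first output row pair: the
+; 565-weighted a5/b5 of the 5x5 pass and the 343/444-weighted a3/b3
+; of the 3x3 pass (see the inline comments below).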
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+400*0+ 2]
+ movu m1, [t3+wq*4+400*0+ 4]
+ movu m2, [t3+wq*4+400*0+20]
+ movu m7, [t4+wq*2+400*0+ 4]
+ movu m8, [t3+wq*4+400*0+ 8]
+ paddw m3, m0, [t4+wq*2+400*0+ 0]
+ paddd m4, m1, [t3+wq*4+400*0+ 0]
+ paddd m5, m2, [t3+wq*4+400*0+16]
+ paddw m3, m7
+ paddd m4, m8
+ movu m7, [t3+wq*4+400*0+24]
+ paddw m0, m3
+ paddd m1, m4
+ psllw m3, 2
+ pslld m4, 2
+ paddd m5, m7
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a5 565
+ paddd m1, m4 ; b5 565
+ paddd m2, m5
+ mova [t4+wq*2+400* 6+ 0], m0
+ mova [t3+wq*4+400*12+ 0], m1
+ mova [t3+wq*4+400*12+16], m2
+ movu m0, [t4+wq*2+400*2+ 4]
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m2, [t3+wq*4+400*4+24]
+ movu m3, [t4+wq*2+400*2+ 2]
+ movu m4, [t3+wq*4+400*4+ 4]
+ movu m5, [t3+wq*4+400*4+20]
+ paddw m0, [t4+wq*2+400*2+ 0]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m2, [t3+wq*4+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[-1] 444
+ pslld m4, 2 ; b3[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a3[-1] 343
+ psubd m4, m1 ; b3[-1] 343
+ psubd m5, m2
+ mova [t4+wq*2+400* 8+ 0], m3
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m5
+ movu m0, [t4+wq*2+400*4+ 4]
+ movu m1, [t3+wq*4+400*8+ 8]
+ movu m2, [t3+wq*4+400*8+24]
+ movu m3, [t4+wq*2+400*4+ 2]
+ movu m4, [t3+wq*4+400*8+ 4]
+ movu m5, [t3+wq*4+400*8+20]
+ paddw m0, [t4+wq*2+400*4+ 0]
+ paddd m1, [t3+wq*4+400*8+ 0]
+ paddd m2, [t3+wq*4+400*8+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[ 0] 444
+ pslld m4, 2 ; b3[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*2+400*10+ 0], m3
+ mova [t3+wq*4+400*20+ 0], m4
+ mova [t3+wq*4+400*20+16], m5
+ psubw m3, m0 ; a3[ 0] 343
+ psubd m4, m1 ; b3[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*2+400*12+ 0], m3
+ mova [t3+wq*4+400*24+ 0], m4
+ mova [t3+wq*4+400*24+16], m5
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
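+; Even output rows: finish the 565-weighted a5/b5 and the 343+444+343
+; 3x3 neighbor sums, form the b - a * src terms for both passes and
+; blend them with the w0/w1 weights in m15 before writing 8 pixels.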
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*2+ 4]
+ movu m2, [t4+wq*2+ 2]
+ paddw m0, [t4+wq*2+ 0]
+ paddw m0, m2
+ paddw m2, m0
+ psllw m0, 2
+ paddw m0, m2 ; a5
+ movu m4, [t3+wq*4+ 8]
+ movu m5, [t3+wq*4+24]
+ movu m1, [t3+wq*4+ 4]
+ movu m3, [t3+wq*4+20]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddd m4, m1
+ paddd m5, m3
+ paddd m1, m4
+ paddd m3, m5
+ pslld m4, 2
+ pslld m5, 2
+ paddd m4, m1 ; b5
+ paddd m5, m3
+ movu m2, [t4+wq*2+400* 6]
+ paddw m2, m0
+ mova [t4+wq*2+400* 6], m0
+ paddd m0, m4, [t3+wq*4+400*12+ 0]
+ paddd m1, m5, [t3+wq*4+400*12+16]
+ mova [t3+wq*4+400*12+ 0], m4
+ mova [t3+wq*4+400*12+16], m5
+ mova [rsp+16+ARCH_X86_32*4], m1
+ movu m3, [t4+wq*2+400*2+4]
+ movu m5, [t4+wq*2+400*2+2]
+ paddw m3, [t4+wq*2+400*2+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ movu m3, [t4+wq*2+400* 8]
+ paddw m3, [t4+wq*2+400*10]
+ paddw m3, m4
+ mova [t4+wq*2+400* 8], m4
+ mova [t4+wq*2+400*10], m5
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m5, [t3+wq*4+400*4+ 4]
+ movu m7, [t3+wq*4+400*4+24]
+ movu m8, [t3+wq*4+400*4+20]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m7, [t3+wq*4+400*4+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+%if ARCH_X86_32
+ mova [esp+52], m8
+ psubd m8, m7
+%else
+ psubd m6, m8, m7
+ SWAP m8, m6
+%endif
+ paddd m1, m4, [t3+wq*4+400*16+ 0]
+ paddd m7, m8, [t3+wq*4+400*16+16]
+ paddd m1, [t3+wq*4+400*20+ 0]
+ paddd m7, [t3+wq*4+400*20+16]
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m8
+ mova [t3+wq*4+400*20+ 0], m5
+%if ARCH_X86_32
+ mova m8, [esp+52]
+%else
+ SWAP m8, m6
+ pxor m6, m6
+%endif
+ mova [t3+wq*4+400*20+16], m8
+ mova [rsp+32+ARCH_X86_32*4], m7
+ movq m4, [dstq+wq]
+ punpcklbw m4, m6
+ punpcklwd m5, m4, m6
+ punpcklwd m7, m2, m6
+ pmaddwd m7, m5 ; a5 * src
+ punpcklwd m8, m3, m6
+ pmaddwd m8, m5 ; a3 * src
+ punpckhwd m5, m4, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13)
+ psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13)
+ psrld m0, 9
+ pslld m1, 7
+ pand m0, m9
+ pandn m8, m9, m1
+ por m0, m8
+ mova m1, [rsp+16+ARCH_X86_32*4]
+ psubd m1, m2
+ mova m2, [rsp+32+ARCH_X86_32*4]
+ psubd m2, m3
+ mova m3, [base+pd_4096]
+ psrld m1, 9
+ pslld m2, 7
+ pand m1, m9
+ pandn m5, m9, m2
+ por m1, m5
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+ paddd m0, m3
+ paddd m1, m3
+ psrad m0, 13
+ psrad m1, 13
+ packssdw m0, m1
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
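+; Odd output rows: reuse the a5/b5 sums stored by .n0 for the even
+; row and only update the 3x3 neighbor sums before blending and
+; writing the row.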
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*2+400*4+4]
+ movu m5, [t4+wq*2+400*4+2]
+ paddw m3, [t4+wq*2+400*4+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ paddw m3, m4, [t4+wq*2+400*12]
+ paddw m3, [t4+wq*2+400*10]
+ mova [t4+wq*2+400*10], m5
+ mova [t4+wq*2+400*12], m4
+ movu m1, [t3+wq*4+400*8+ 8]
+ movu m5, [t3+wq*4+400*8+ 4]
+ movu m7, [t3+wq*4+400*8+24]
+ movu m8, [t3+wq*4+400*8+20]
+ paddd m1, [t3+wq*4+400*8+ 0]
+ paddd m7, [t3+wq*4+400*8+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+ psubd m0, m8, m7
+ paddd m1, m4, [t3+wq*4+400*24+ 0]
+ paddd m7, m0, [t3+wq*4+400*24+16]
+ paddd m1, [t3+wq*4+400*20+ 0]
+ paddd m7, [t3+wq*4+400*20+16]
+ mova [t3+wq*4+400*20+ 0], m5
+ mova [t3+wq*4+400*20+16], m8
+ mova [t3+wq*4+400*24+ 0], m4
+ mova [t3+wq*4+400*24+16], m0
+ movq m5, [dstq+wq]
+ mova m2, [t4+wq*2+400* 6]
+ punpcklbw m5, m6
+ punpcklwd m4, m5, m6
+ punpcklwd m8, m2, m6
+ pmaddwd m8, m4 ; a5 * src
+ punpcklwd m0, m3, m6
+ pmaddwd m0, m4 ; a3 * src
+ punpckhwd m4, m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m4
+ punpckhwd m3, m6
+ pmaddwd m3, m4
+ psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13)
+ mova m0, [t3+wq*4+400*12+ 0]
+ psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13)
+ mova m4, [t3+wq*4+400*12+16]
+ psubd m4, m2
+ psubd m7, m3
+ pslld m1, 7
+ psrld m0, 8
+ psrld m4, 8
+ pslld m7, 7
+ pandn m3, m9, m1
+ pand m0, m9
+ por m0, m3
+ pand m4, m9
+ pandn m2, m9, m7
+ por m2, m4
+ mova m1, [base+pd_4096]
+ pmaddwd m0, m15
+ pmaddwd m2, m15
+ paddd m0, m1
+ paddd m2, m1
+ psrad m0, 13
+ psrad m2, 13
+ packssdw m0, m2
+ paddw m0, m5
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret