summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/x86/looprestoration16_avx512.asm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- third_party/dav1d/src/x86/looprestoration16_avx512.asm 2524
1 files changed, 2524 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx512.asm b/third_party/dav1d/src/x86/looprestoration16_avx512.asm
new file mode 100644
index 0000000000..e560c54a40
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_avx512.asm
@@ -0,0 +1,2524 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 ; wiener_shufA-E: byte-index tables, broadcast to every 128-bit lane and used as pshufb controls by the wiener filters
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 ; an index with the top bit set (-1) makes pshufb produce a zero byte
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+r_ext_mask: times 72 db -1 ; 72 bytes of 0xff, then 8 zero bytes (next line); read at a sliding
+ times 8 db 0 ; offset as the vpternlogd select mask when padding past the right edge
+wiener_hshift: dw 4, 4, 1, 1 ; the three wiener_* tables are read as one dword at index (pixel_max >> 11)
+wiener_vshift: dw 1024, 1024, 4096, 4096 ; multiplier for the final pmulhuw of the vertical pass
+wiener_round: dd 1049600, 1048832 ; (1 << 20) + (1 << 10), (1 << 20) + (1 << 8); initial accumulator of the vertical pass
+
+pw_164_455: dw 164, 455 ; multipliers for b: 164 is used by the 5x5 filter, 455 by the 3x3 filter
+pw_1023: times 2 dw 1023 ; upper clamp applied to the sgr output pixels
+pw_61448: times 2 dw 61448 ; (15 << 12) + (1 << 3)
+pd_m262128: dd -262128 ; (1 << 4) - (1 << 18); initial accumulator of the wiener horizontal pass
+pd_m34816: dd -34816 ; -((1 << 11) + (1 << 15))
+pd_m25: dd -25 ; 5x5 filter: multiplier for (a + 8) >> 4
+pd_m9: dd -9 ; 3x3 filter: multiplier for (a + 8) >> 4
+pd_8: dd 8 ; rounding term for (a + 8) >> 4
+pd_2147483648: dd 2147483648
+
+cextern sgr_x_by_x ; lookup table defined outside this file; the sgr filters load its first 256 bytes into m18-m21
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
+
+INIT_ZMM avx512icl ; 7-tap Wiener filter on 16-bit pixels: each row is filtered horizontally into a ring of row buffers (t0-t6), then 7 rows are combined vertically into dst
+cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastd m12, [fltq+ 0] ; x0 x1
+ lea t4, [wiener_hshift] ; t4 anchors the base-relative table loads below
+ vbroadcasti128 m7, [wiener_shufB]
+ add wd, wd ; width in bytes (2 bytes per pixel)
+ vpbroadcastd m13, [fltq+ 4] ; x2 x3
+ shr t3d, 11 ; t3 = pixel_max >> 11, used as the table index below
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add lpfq, wq ; lpfq and dstq point at the row end; rows are indexed with negative offsets
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ add dstq, wq
+ vbroadcasti128 m8, [wiener_shufC]
+ lea t1, [rsp+wq+16] ; t1 = end of the first row buffer on the stack
+ vbroadcasti128 m9, [wiener_shufD]
+ neg wq ; wq = -(width in bytes); loops run r10 from wq up towards 0
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ mov r10d, 0xfe
+ vpbroadcastd m10, [base+wiener_round+t3*4] ; m10 = initial accumulator of the vertical pass
+ kmovb k1, r10d ; k1 = 0xfe: every qword lane except lane 0
+ vpbroadcastd m11, [base+wiener_vshift+t3*4] ; m11 = final multiplier of the vertical pass
+ pmullw m12, m0 ; upshift filter coefs to make the
+ vpbroadcastd m16, [pd_m262128] ; m16 = initial accumulator of the horizontal pass
+ pmullw m13, m0 ; horizontal downshift constant
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top ; first row above the block
+ add lpfq, strideq
+ mov t6, t1 ; t6 = t5 = first top row
+ mov t5, t1
+ add t1, 384*2 ; each row buffer is 384*2 bytes
+ call .h_top ; second row above the block
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq ; from here on the source rows are read through dst
+ mov t4, t1 ; t4 = second top row
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h ; first row of the block
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1 ; h == 1: one vertical-only pass and done
+ add lpfq, strideq
+ add t1, 384*2
+ call .h ; second row of the block
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h ; third row of the block
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2] ; t0 = buffer that receives the next horizontally filtered row
+.main_loop:
+ call .hv ; filters one new row and writes one output row
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp] ; switch to the "below" rows saved during setup
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10 ; pointer to the rows below the block, same role as in the LR_HAVE_TOP path
+ call .h
+ mov t6, t1 ; no rows above: the first row of the block stands in for t6..t2
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8 ; .hv rotated t0 onto the first-row buffer, which t6/t5/t4 still alias; move it to an unused buffer
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v ; .v3/.v2/.v1 fall through: three, two or one vertical-only rows
+.v2:
+ call .v
+ jmp .v1
+.h: ; horizontal pass for one row of the block: filters the row at lpfq into the buffer at t1, taking the pixels left of the row from left[]
+ mov r10, wq ; r10 = negative byte offset from the row end
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm3, [leftq] ; low 8 bytes come from left[]
+ vmovdqu64 m3{k1}, [lpfq+r10-8] ; the remaining qword lanes come from the row itself
+ add leftq, 8 ; advance left[] by 8 bytes for the next row
+ jmp .h_main
+.h_extend_left:
+ mova m4, [lpfq+r10+0]
+ vpbroadcastw xm3, xm4 ; replicate the first pixel of the row into the left padding
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ jmp .h_main2 ; m4 is already loaded
+.h_top: ; entry that does not use left[]: with LR_HAVE_LEFT the left pixels are read straight from memory
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-8] ; m3/m4/m5 = the row at byte offsets -8, 0, +8
+.h_main:
+ mova m4, [lpfq+r10+0]
+.h_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right ; this chunk does not reach past the row end: no padding needed
+ push r0 ; borrow r0 as a scratch pointer; restored by the pop below
+ lea r0, [r_ext_mask+66]
+ vpbroadcastw m0, [lpfq-2] ; last pixel of the row (lpfq points at the row end)
+ vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b
+ vpternlogd m4, m0, [r0+r10+ 8], 0xe4
+ vpternlogd m5, m0, [r0+r10+16], 0xe4
+ pop r0
+.h_have_right:
+ pshufb m2, m3, m6
+ pshufb m1, m4, m7
+ paddw m2, m1
+ pshufb m3, m8
+ mova m0, m16 ; accumulators m0/m1 start from the constant in m16
+ vpdpwssd m0, m2, m12 ; accumulate with coefficient pair x0 x1
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ vpdpwssd m0, m3, m13 ; accumulate with coefficient pair x2 x3
+ pshufb m2, m5, m7
+ paddw m2, m1
+ mova m1, m16
+ pshufb m4, m8
+ vpdpwssd m1, m2, m12
+ pshufb m5, m9
+ paddw m4, m5
+ vpdpwssd m1, m4, m13
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1 ; pack the two dword accumulators into 32 signed words
+ psraw m0, 1
+ mova [t1+r10], m0 ; store the filtered chunk into the row buffer
+ add r10, 64 ; next 64-byte chunk
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal pass for the next row (stored at t0) fused with the vertical pass that writes one output row to dst
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm3, [leftq] ; low 8 bytes come from left[]
+ vmovdqu64 m3{k1}, [lpfq+r10-8] ; the remaining qword lanes come from the row itself
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m4, [lpfq+r10+0]
+ vpbroadcastw xm3, xm4 ; replicate the first pixel of the row into the left padding
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ jmp .hv_main2 ; m4 is already loaded
+.hv_bottom: ; entry for the rows below the block: lpfq is set by the caller and left[] is not used
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-8]
+.hv_main:
+ mova m4, [lpfq+r10+0]
+.hv_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -68
+ jl .hv_have_right ; this chunk does not reach past the row end: no padding needed
+ push r0 ; borrow r0 as a scratch pointer; restored by the pop below
+ lea r0, [r_ext_mask+66]
+ vpbroadcastw m0, [lpfq-2] ; last pixel of the row
+ vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; select between the loaded pixels and the replicated last pixel using the mask bytes
+ vpternlogd m4, m0, [r0+r10+ 8], 0xe4
+ vpternlogd m5, m0, [r0+r10+16], 0xe4
+ pop r0
+.hv_have_right:
+ pshufb m2, m3, m6
+ pshufb m1, m4, m7
+ paddw m2, m1
+ pshufb m3, m8
+ mova m0, m16 ; horizontal accumulators start from the constant in m16
+ vpdpwssd m0, m2, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ vpdpwssd m0, m3, m13
+ pshufb m2, m5, m7
+ paddw m2, m1
+ pshufb m4, m8
+ mova m1, m16
+ vpdpwssd m1, m2, m12
+ pshufb m5, m9
+ paddw m4, m5
+ vpdpwssd m1, m4, m13
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10] ; m2 = rows t4 + t2
+ mova m5, [t3+r10] ; m5 = centre row t3
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova m4, [t5+r10]
+ paddw m4, [t1+r10] ; m4 = rows t5 + t1
+ psraw m0, 1 ; m0 = horizontally filtered new row
+ paddw m3, m0, [t6+r10] ; m3 = new row + t6
+ mova [t0+r10], m0 ; keep the new row for later vertical passes
+ punpcklwd m1, m2, m5
+ mova m0, m10 ; vertical accumulators start from the constant in m10
+ vpdpwssd m0, m1, m15 ; accumulate (t4 + t2, t3) with coefficient pair y2 y3
+ punpckhwd m2, m5
+ mova m1, m10
+ vpdpwssd m1, m2, m15
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14 ; accumulate (new + t6, t5 + t1) with coefficient pair y0 y1
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1 ; pack with unsigned saturation
+ pmulhuw m0, m11 ; final scaling by the wiener_vshift multiplier
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .hv_loop
+ mov t6, t5 ; rotate the ring of row pointers by one row
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6 ; t0 takes the value just copied into t6 (the previous t5)
+ add dstq, strideq
+ ret
+.v: ; vertical-only pass: writes one output row from the buffered rows, with t1 also standing in for the missing newest row
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10] ; m2 = rows t4 + t2
+ mova m3, [t3+r10] ; m3 = centre row t3
+ punpcklwd m1, m2, m3
+ mova m0, m10 ; accumulators start from the constant in m10
+ vpdpwssd m0, m1, m15 ; coefficient pair y2 y3
+ punpckhwd m2, m3
+ mova m1, m10
+ vpdpwssd m1, m2, m15
+ mova m4, [t1+r10]
+ paddw m3, m4, [t6+r10] ; m3 = t1 + t6 (t1 replaces the new row used by .hv)
+ paddw m4, [t5+r10] ; m4 = t1 + t5
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14 ; coefficient pair y0 y1
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11 ; final scaling by the wiener_vshift multiplier
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ mov t6, t5 ; advance the row pointers by one; t1 is left unchanged
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base r13-r_ext_mask-70
+ mov fltq, r6mp ; 5-tap Wiener filter on 16-bit pixels: same structure as the 7-tap version, with a ring of row buffers t0-t4
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m5, [wiener_shufE]
+ vpbroadcastw m11, [fltq+ 2] ; x1
+ vbroadcasti128 m6, [wiener_shufB]
+ lea r13, [r_ext_mask+70] ; r13 anchors both the right-edge mask reads and the base-relative table loads
+ vbroadcasti128 m7, [wiener_shufD]
+ add wd, wd ; width in bytes (2 bytes per pixel)
+ vpbroadcastd m12, [fltq+ 4] ; x2 x3
+ shr t3d, 11 ; t3 = pixel_max >> 11, used as the table index below
+ vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq ; lpfq and dstq point at the row end; rows are indexed with negative offsets
+ vpbroadcastw m13, [fltq+18] ; y1
+ add dstq, wq
+ vpbroadcastd m14, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq+16] ; t1 = end of the first row buffer on the stack
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ neg wq ; wq = -(width in bytes); loops run r10 from wq up towards 0
+ vpbroadcastd m9, [base+wiener_round+t3*4] ; m9 = initial accumulator of the vertical pass
+ mov r10d, 0xfffe
+ vpbroadcastd m10, [base+wiener_vshift+t3*4] ; m10 = final multiplier of the vertical pass
+ kmovw k1, r10d ; k1 = 0xfffe: every dword lane except lane 0
+ pmullw m11, m0 ; upshift the horizontal coefficients by the wiener_hshift factor
+ pmullw m12, m0
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top ; first row above the block
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2 ; each row buffer is 384*2 bytes
+ call .h_top ; second row above the block
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq ; from here on the source rows are read through dst
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h ; first row of the block
+ mov t2, t1
+ dec hd
+ jz .v1 ; h == 1: one vertical-only pass and done
+ add lpfq, strideq
+ add t1, 384*2
+ call .h ; second row of the block
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4 ; the oldest row buffer receives the next filtered row
+.main_loop:
+ call .hv ; filters one new row and writes one output row
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp] ; switch to the "below" rows saved during setup
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10 ; pointer to the rows below the block, same role as in the LR_HAVE_TOP path
+ call .h
+ mov t4, t1 ; no rows above: the first row of the block stands in for t4..t2
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6 ; .hv rotated t0 onto the first-row buffer, which t4/t3 still alias; move it to an unused buffer
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3 ; .v does not rotate the pointers or advance dst itself, so do it here
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.h: ; horizontal pass for one row of the block: filters the row at lpfq into the buffer at t1, taking the pixels left of the row from left[]
+ mov r10, wq ; r10 = negative byte offset from the row end
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm3, [leftq+4] ; low 4 bytes come from left[] (bytes 4..7 of the 8-byte entry)
+ vmovdqu32 m3{k1}, [lpfq+r10-4] ; the remaining dword lanes come from the row itself
+ add leftq, 8 ; advance left[] by 8 bytes for the next row
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm3, [lpfq+r10] ; replicate the first pixel of the row into the left padding
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ jmp .h_main
+.h_top: ; entry that does not use left[]: with LR_HAVE_LEFT the left pixels are read straight from memory
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-4] ; m3/m4 = the row at byte offsets -4 and +4
+.h_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right ; this chunk does not reach past the row end: no padding needed
+ vpbroadcastw m0, [lpfq-2] ; last pixel of the row (lpfq points at the row end)
+ vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m4, m0, [r13+r10+8], 0xe4
+.h_have_right:
+ pshufb m1, m3, m5
+ mova m0, m8 ; accumulators m0/m1 start from the constant in m8
+ vpdpwssd m0, m1, m11 ; accumulate with coefficient x1
+ pshufb m2, m4, m5
+ mova m1, m8
+ vpdpwssd m1, m2, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ vpdpwssd m0, m2, m12 ; accumulate with coefficient pair x2 x3
+ pshufb m4, m7
+ paddw m3, m4
+ vpdpwssd m1, m3, m12
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1 ; pack the two dword accumulators into 32 signed words
+ psraw m0, 1
+ mova [t1+r10], m0 ; store the filtered chunk into the row buffer
+ add r10, 64 ; next 64-byte chunk
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal pass for the next row (stored at t0) fused with the vertical pass that writes one output row to dst
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm3, [leftq+4] ; low 4 bytes come from left[]
+ vmovdqu32 m3{k1}, [lpfq+r10-4] ; the remaining dword lanes come from the row itself
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastw xm3, [lpfq+r10] ; replicate the first pixel of the row into the left padding
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ jmp .hv_main
+.hv_bottom: ; entry for the rows below the block: lpfq is set by the caller and left[] is not used
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-4]
+.hv_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -66
+ jl .hv_have_right ; this chunk does not reach past the row end: no padding needed
+ vpbroadcastw m0, [lpfq-2] ; last pixel of the row
+ vpternlogd m3, m0, [r13+r10+0], 0xe4 ; select between the loaded pixels and the replicated last pixel using the mask bytes
+ vpternlogd m4, m0, [r13+r10+8], 0xe4
+.hv_have_right:
+ pshufb m1, m3, m5
+ mova m0, m8 ; horizontal accumulators start from the constant in m8
+ vpdpwssd m0, m1, m11
+ pshufb m2, m4, m5
+ mova m1, m8
+ vpdpwssd m1, m2, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ vpdpwssd m0, m2, m12
+ pshufb m4, m7
+ paddw m4, m3
+ vpdpwssd m1, m4, m12
+ mova m2, [t3+r10]
+ paddw m2, [t1+r10] ; m2 = rows t3 + t1
+ mova m3, [t2+r10] ; m3 = centre row t2
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ mova m3, m9 ; vertical accumulators m2/m3 start from the constant in m9
+ vpdpwssd m3, m2, m14 ; accumulate (t3 + t1, t2) with coefficient pair y2 y3
+ mova m2, m9
+ vpdpwssd m2, m4, m14
+ mova m4, [t4+r10] ; oldest row; read before t0 is written because t0 may alias t4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1 ; m0 = horizontally filtered new row
+ mova [t0+r10], m0 ; keep the new row for later vertical passes
+ punpcklwd m1, m0, m4
+ vpdpwssd m2, m1, m13 ; accumulate (new row, t4) with coefficient y1
+ punpckhwd m0, m4
+ vpdpwssd m3, m0, m13
+ psrad m2, 5
+ psrad m3, 5
+ packusdw m2, m3 ; pack with unsigned saturation
+ pmulhuw m2, m10 ; final scaling by the wiener_vshift multiplier
+ mova [dstq+r10], m2
+ add r10, 64
+ jl .hv_loop
+ mov t4, t3 ; rotate the ring of row pointers by one row
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4 ; t0 takes the value just copied into t4 (the previous t3)
+ add dstq, strideq
+ ret
+.v: ; vertical-only pass: writes one output row from the buffered rows, with t1 also standing in for the missing newest row; does not rotate the row pointers or advance dstq
+ mov r10, wq
+.v_loop:
+ mova m0, [t1+r10]
+ paddw m2, m0, [t3+r10] ; m2 = rows t1 + t3
+ mova m1, [t2+r10] ; m1 = centre row t2
+ mova m4, [t4+r10]
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14 ; (t1 + t3, t2) with coefficient pair y2 y3
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13 ; (t1, t4) with coefficient y1
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9 ; add the constant in m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10 ; final scaling by the wiener_vshift multiplier
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ ret
+
+cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r13-r_ext_mask-72
+ movifnidn wd, wm ; self-guided filter with 5x5 box sums on 16-bit pixels: box sums -> a/b coefficients -> neighbour weighting -> output written in place to dst
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72] ; r13 anchors both the right-edge mask reads and the base-relative constant loads
+ mov edged, r7m
+ movifnidn hd, hm
+ pxor m6, m6 ; m6 = 0 for the whole function
+ vpbroadcastw m7, [paramsq+8] ; w0
+ add wd, wd ; width in bytes (2 bytes per pixel)
+ vpbroadcastd m8, [base+pd_8]
+ add lpfq, wq ; lpfq and dstq point at the row end; rows are indexed with negative offsets
+ vpbroadcastd m9, [base+pd_m25]
+ add dstq, wq
+ vpsubd m10, m6, [paramsq+0] {1to16} ; -s0
+ lea t3, [rsp+wq*2+416*12+8] ; t3 = area of 32-bit b values (indexed with r10*2)
+ vpbroadcastd m11, [base+pw_164_455]
+ lea t4, [rsp+wq+416*20+8] ; t4 = area of 16-bit values written by .hv/.v and read as "a" by the neighbour passes
+ vpbroadcastd m12, [base+pw_61448] ; (15 << 12) + (1 << 3)
+ lea t1, [rsp+wq+12] ; t1 = box-sum row: sum at +416*0, sumsq at +416*2 and +416*4
+ vpbroadcastd m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15))
+ neg wq ; wq = -(width in bytes)
+ vpbroadcastd m14, [base+pw_1023]
+ psllw m7, 4 ; w0 << 4, used as the pmulhrsw multiplier in .n0/.n1
+ mova m18, [sgr_x_by_x+64*0] ; m18-m21 = 256 bytes of the sgr_x_by_x lookup table
+ mov r10d, 0xfffffff8
+ mova m19, [sgr_x_by_x+64*1]
+ kmovd k1, r10d ; k1: every word lane except lanes 0-2
+ mova m20, [sgr_x_by_x+64*2]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*3]
+ kmovq k2, r10 ; k2: byte mask selecting the low two bytes of every dword
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top ; box sums of the first row above the block
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup ; double those sums in place (t2 == t1 here)
+ add t1, 416*6 ; second box-sum row buffer
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq ; from here on the source rows are read through dst
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16 ; private flag tested in .h ("y > 0"): add to the sums already stored at t1
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0 ; two output rows per iteration: .n0 then .n1
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp] ; switch to the "below" rows saved during setup
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v ; no rows below: vertical-only pass over the buffered sums
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10 ; pointer to the rows below the block, same role as in the LR_HAVE_TOP path
+ call .h
+ lea t2, [t1+416*6]
+ call .top_fixup ; t2 = doubled copy of the first row's sums
+ dec hd
+ jz .no_top_height1
+ or edged, 16 ; private flag tested in .h ("y > 0")
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.h: ; horizontal boxsum
+ lea r10, [wq-4] ; start 4 bytes (2 pixels) before the first pixel of the row
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm16, [leftq+2] ; word lanes 0-2 come from left[]
+ vmovdqu16 m16{k1}, [lpfq+wq-6] ; word lanes 3 and up come from the row itself
+ add leftq, 8 ; advance left[] by 8 bytes for the next row
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq] ; replicate the first pixel of the row into the left padding
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .h_main
+.h_top: ; entry that does not use left[]: with LR_HAVE_LEFT the left pixels are read straight from memory
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10- 2]
+.h_main:
+ movu m17, [lpfq+r10+14] ; same row, 16 bytes further right
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right ; this chunk does not reach past the row end: no padding needed
+ vpbroadcastw m0, [lpfq-2] ; last pixel of the row (lpfq points at the row end)
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m2, m17, m16, 2 ; m16 and the shifted copies below are the five horizontally adjacent pixels
+ paddw m0, m16, m2
+ palignr m3, m17, m16, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1 ; sums of squares are built in m1 (low halves) and m2 (high halves)
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ shufpd m17, m16, m17, 0x55
+ paddw m0, m17
+ punpcklwd m3, m16, m17
+ vpdpwssd m1, m3, m3
+ punpckhwd m3, m16, m17
+ vpdpwssd m2, m3, m3
+ shufps m16, m17, q2121
+ paddw m0, m16 ; sum
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10+416*0] ; add the sums already stored at t1
+ paddd m1, [t1+r10+416*2]
+ paddd m2, [t1+r10+416*4]
+.h_loop_end:
+ punpcklwd m17, m16, m6
+ vpdpwssd m1, m17, m17 ; sumsq
+ punpckhwd m16, m6
+ vpdpwssd m2, m16, m16
+ mova [t1+r10+416*0], m0
+ mova [t1+r10+416*2], m1
+ mova [t1+r10+416*4], m2
+ add r10, 64 ; next 64-byte chunk
+ jl .h_loop
+ ret
+.top_fixup: ; reads the box sums at t1 and writes them, doubled, to t2
+ lea r10, [wq-4]
+.top_fixup_loop: ; the sums of the first row needs to be doubled
+ mova m0, [t1+r10+416*0] ; sum (words)
+ mova m1, [t1+r10+416*2] ; sumsq (dwords, two vectors)
+ mova m2, [t1+r10+416*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m1
+ mova [t2+r10+416*4], m2
+ add r10, 64
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-4] ; start 4 bytes (2 pixels) before the first pixel of the row
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm16, [leftq+2] ; word lanes 0-2 come from left[]
+ vmovdqu16 m16{k1}, [lpfq+wq-6] ; word lanes 3 and up come from the row itself
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastw xm16, [lpfq+wq] ; replicate the first pixel of the row into the left padding
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv_main
+.hv_bottom: ; entry for the rows below the block: left[] is not used
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m16, [lpfq+r10- 2]
+.hv_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -68
+ jl .hv_have_right ; this chunk does not reach past the row end: no padding needed
+ vpbroadcastw m0, [lpfq-2] ; last pixel of the row
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; select between the loaded pixels and the replicated last pixel using the mask bytes
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv_have_right:
+ palignr m3, m17, m16, 2
+ paddw m0, m16, m3
+ palignr m1, m17, m16, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ shufpd m17, m16, m17, 0x55
+ paddw m0, m17
+ punpcklwd m1, m16, m17
+ vpdpwssd m2, m1, m1
+ punpckhwd m1, m16, m17
+ vpdpwssd m3, m1, m1
+ shufps m16, m17, q2121
+ paddw m0, m16 ; h sum
+ punpcklwd m17, m16, m6
+ vpdpwssd m2, m17, m17 ; h sumsq
+ punpckhwd m16, m6
+ vpdpwssd m3, m16, m16
+ paddw m1, m0, [t1+r10+416*0] ; this row's sums + the sums stored at t1
+ paddd m16, m2, [t1+r10+416*2]
+ paddd m17, m3, [t1+r10+416*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10+416*0] ; hv sum
+ paddd m16, [t2+r10+416*2] ; hv sumsq
+ paddd m17, [t2+r10+416*4]
+ mova [t0+r10+416*0], m0 ; store this row's horizontal sums at t0
+ mova [t0+r10+416*2], m2
+ mova [t0+r10+416*4], m3
+ psrlw m3, m1, 1
+ paddd m16, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2 ; low word of each m17 dword <- high word of the matching m16 dword
+ mova m16, m20
+ pmaxsw m17, m6 ; clamp negative words to zero
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17 ; k3 = top bit of every byte, used to pick the upper-half lookup
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10+4], m16 ; store the packed x values in the t4 area
+ psrld m16, m0, 12 ; b
+ psrld m17, m1, 12
+ mova [t3+r10*2+ 8], xm16 ; store the 128-bit lanes of m16 and m17 alternately into the t3 area
+ mova [t3+r10*2+ 24], xm17
+ vextracti128 [t3+r10*2+ 40], ym16, 1
+ vextracti128 [t3+r10*2+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+104], m16, 3
+ vextracti32x4 [t3+r10*2+120], m17, 3
+ add r10, 64
+ jl .hv_loop
+ mov t2, t1 ; rotate the three box-sum row pointers
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10+416*0], m1 ; overwrite t1 with (this row + t1), then count this row once more
+ paddw m1, m0
+ mova [t1+r10+416*2], m16
+ paddd m16, m2
+ mova [t1+r10+416*4], m17
+ paddd m17, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-4]
+.v_loop:
+ mova m2, [t1+r10+416*2]
+ mova m3, [t1+r10+416*4]
+ mova m0, [t1+r10+416*0]
+ paddd m16, m2, [t2+r10+416*2]
+ paddd m17, m3, [t2+r10+416*4]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m2 ; the t1 sums end up counted three times: t1 + t2 + 2 * t1
+ paddd m3, m3
+ paddd m16, m2 ; hv sumsq
+ paddd m17, m3
+ paddd m16, m8
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ paddw m0, m0
+ paddw m1, m0 ; hv sum
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2 ; low word of each m17 dword <- high word of the matching m16 dword
+ mova m16, m20
+ pmaxsw m17, m6 ; clamp negative words to zero
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17 ; k3 = top bit of every byte, used to pick the upper-half lookup
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10+4], m16 ; store the packed x values in the t4 area
+ psrld m16, m0, 12 ; b
+ psrld m17, m1, 12
+ mova [t3+r10*2+ 8], xm16 ; store the 128-bit lanes of m16 and m17 alternately into the t3 area
+ mova [t3+r10*2+ 24], xm17
+ vextracti128 [t3+r10*2+ 40], ym16, 1
+ vextracti128 [t3+r10*2+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+104], m16, 3
+ vextracti32x4 [t3+r10*2+120], m17, 3
+ add r10, 64
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t4+r10*1+ 2] ; centre a
+ movu m1, [t3+r10*2+ 4] ; centre b (two vectors: m1, m2)
+ movu m2, [t3+r10*2+68]
+ paddw m3, m0, [t4+r10*1+ 0] ; m3 / m16 / m17 = left + centre + right
+ paddd m16, m1, [t3+r10*2+ 0]
+ paddd m17, m2, [t3+r10*2+64]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m16, [t3+r10*2+ 8]
+ paddd m17, [t3+r10*2+72]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m16
+ pslld m16, 2
+ paddd m2, m17
+ pslld m17, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m16 ; b 565
+ paddd m2, m17
+ mova [t4+r10*1+416*2+ 0], m0 ; store the weighted a/b for the first .n0/.n1 calls
+ mova [t3+r10*2+416*4+ 0], m1
+ mova [t3+r10*2+416*4+64], m2
+ add r10, 64
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t4+r10*1+ 2] ; centre a
+ movu m1, [t3+r10*2+ 4] ; centre b (two vectors: m1, m2)
+ movu m2, [t3+r10*2+68]
+ paddw m3, m0, [t4+r10*1+ 0] ; m3 / m16 / m17 = left + centre + right
+ paddd m16, m1, [t3+r10*2+ 0]
+ paddd m17, m2, [t3+r10*2+64]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m16, [t3+r10*2+ 8]
+ paddd m17, [t3+r10*2+72]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m16
+ pslld m16, 2
+ paddd m2, m17
+ pslld m17, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m16 ; b 565
+ paddd m2, m17
+ paddw m3, m0, [t4+r10*1+416*2+ 0] ; new weighted a/b + the previously stored ones
+ paddd m16, m1, [t3+r10*2+416*4+ 0]
+ paddd m17, m2, [t3+r10*2+416*4+64]
+ mova [t4+r10*1+416*2+ 0], m0 ; the new weighted a/b replace the stored ones (read back by .n1)
+ mova [t3+r10*2+416*4+ 0], m1
+ mova [t3+r10*2+416*4+64], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020 ; regroup the 128-bit lanes of b to match the punpckl/punpckh word order
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7 ; scale by the pre-shifted weight w0
+ paddw m0, m1
+ pmaxsw m0, m6 ; clamp the result to [0, 1023]
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m0, [dstq+r10]
+ mova m3, [t4+r10*1+416*2+ 0] ; reuse the weighted a/b stored by .prep_n / .n0
+ mova m16, [t3+r10*2+416*4+ 0]
+ mova m17, [t3+r10*2+416*4+64]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020 ; regroup the 128-bit lanes of b to match the punpckl/punpckh word order
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 7)
+ psubd m16, m3
+ psrad m1, 8
+ psrad m16, 8
+ packssdw m1, m16
+ pmulhrsw m1, m7 ; scale by the pre-shifted weight w0
+ paddw m0, m1
+ pmaxsw m0, m6 ; clamp the result to [0, 1023]
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+10] ; w1
+ add wd, wd
+ vpbroadcastd m8, [base+pd_8]
+ add lpfq, wq
+ vpbroadcastd m9, [base+pd_m9]
+ add dstq, wq
+ vpsubd m10, m6, [paramsq+4] {1to16} ; -s1
+ lea t3, [rsp+wq*2+416*12+8]
+ vpbroadcastd m11, [base+pw_164_455]
+ lea t4, [rsp+wq+416*32+8]
+ vpbroadcastd m12, [base+pw_61448]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m13, [base+pd_m34816]
+ neg wq
+ vpbroadcastd m14, [base+pw_1023]
+ psllw m7, 4
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10d, 0xfffffffc
+ mova m19, [sgr_x_by_x+64*1]
+ kmovd k1, r10d
+ mova m20, [sgr_x_by_x+64*2]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*3]
+ kmovq k2, r10
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+416*6]
+.top_fixup_loop:
+ mova m0, [t1+r10+416*0]
+ mova m1, [t1+r10+416*2]
+ mova m2, [t1+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m1
+ mova [t2+r10+416*4], m2
+ add r10, 64
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10+ 0]
+.h_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m0, m17, m16, 2
+ paddw m1, m16, m0
+ punpcklwd m2, m16, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m0
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m1, m17 ; sum
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ mova [t1+r10+416*0], m1
+ mova [t1+r10+416*2], m2
+ mova [t1+r10+416*4], m3
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m16, [lpfq+r10+ 0]
+.hv0_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -66
+ jl .hv0_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv0_have_right:
+ palignr m0, m17, m16, 2
+ paddw m1, m16, m0
+ punpcklwd m2, m16, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m0
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m1, m17 ; sum
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ paddw m0, m1, [t1+r10+416*0]
+ paddd m16, m2, [t1+r10+416*2]
+ paddd m17, m3, [t1+r10+416*4]
+ mova [t1+r10+416*0], m1
+ mova [t1+r10+416*2], m2
+ mova [t1+r10+416*4], m3
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*0+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m16, [lpfq+r10+ 0]
+.hv1_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -66
+ jl .hv1_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv1_have_right:
+ palignr m1, m17, m16, 2
+ paddw m0, m16, m1
+ punpcklwd m2, m16, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m1
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m0, m17 ; h sum
+ punpcklwd m1, m17, m6
+ vpdpwssd m2, m1, m1 ; h sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m16, m2, [t2+r10+416*2]
+ paddd m17, m3, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m2
+ mova [t2+r10+416*4], m3
+ paddd m16, m8
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -((a + 8) >> 4) * 9
+ pmulld m17, m9
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m6, m1 ; b
+ punpckhwd m1, m6, m1
+ pminsd m16, m6
+ pminsd m17, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 455
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10*1+416*2+4], m16
+ psrld m16, m0, 12
+ psrld m17, m1, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows); works only on the t1/t2 row buffers, no source row is read
+ lea r10, [wq-4] ; r10 = column byte offset, starting 4 bytes before wq
+.v0_loop: ; 64 bytes of the word-sum row per iteration
+ mova m0, [t1+r10+416*0] ; t1 row: sums (words)
+ mova m16, [t1+r10+416*2] ; t1 row: sums of squares (dwords), two vectors at 416*2 and 416*4
+ mova m17, [t1+r10+416*4]
+ paddw m0, m0 ; the t1 row is counted twice
+ paddd m16, m16
+ paddd m17, m17
+ paddw m1, m0, [t2+r10+416*0] ; m1 = b = 2*t1 + t2 sums
+ paddd m2, m16, [t2+r10+416*2] ; m2/m3 = a = 2*t1 + t2 sums of squares
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0 ; t2 row <- doubled t1 row
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8 ; m8 is loaded outside this block; per the next comment it supplies the + 8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1 ; b >> 1, rounded and halved again by the pavgw below
+ pavgw m17, m6 ; (b + 2) >> 2 (m6 is assumed to be all-zero; it is set up outside this block)
+ punpcklwd m16, m17, m6 ; interleave with m6 to get one value per dword
+ vpdpwssd m2, m16, m16 ; -p: adds the squared rounded b onto the product above
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b, placed in the upper word of each dword
+ punpckhwd m17, m6, m1
+ pminsd m2, m6 ; signed minimum against m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2 ; bytes selected by k2 (set outside this block) take m2 rotated right by 2 bytes per lane, the rest keep m3
+ mova m2, m20 ; working copy of m20, because vpermt2b overwrites its first operand
+ paddusw m3, m12 ; unsigned saturating add of m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3 ; k3 = top bit of every index byte
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x: bytes whose index had the top bit set take the [128..255] lookup
+ pandn m2, m13, m3 ; m2 = m3 & ~m13
+ psrld m3, 16 ; m3 = upper word of each dword
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3 ; narrow both x vectors to words for the t4 store
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*0+4], m2 ; x goes to the t4 row at 416*0
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16 ; t3 row at 416*0: the 128-bit lanes of m16 and m17 are stored alternately
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .v0_loop ; repeat while the offset is still negative
+ ret
+.v1: ; vertical boxsums + ab (odd rows); works only on the t1/t2 row buffers, no source row is read
+ lea r10, [wq-4] ; r10 = column byte offset, starting 4 bytes before wq
+.v1_loop: ; 64 bytes of the word-sum row per iteration
+ mova m0, [t1+r10+416*0] ; t1 row: sums (words)
+ mova m16, [t1+r10+416*2] ; t1 row: sums of squares (dwords), two vectors at 416*2 and 416*4
+ mova m17, [t1+r10+416*4]
+ paddw m1, m0, [t2+r10+416*0] ; m1 = b = t1 + t2 sums
+ paddd m2, m16, [t2+r10+416*2] ; m2/m3 = a = t1 + t2 sums of squares
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0 ; t2 row <- copy of the t1 row
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8 ; m8 is loaded outside this block; per the next comment it supplies the + 8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1 ; b >> 1, rounded and halved again by the pavgw below
+ pavgw m17, m6 ; (b + 2) >> 2 (m6 is assumed to be all-zero; it is set up outside this block)
+ punpcklwd m16, m17, m6 ; interleave with m6 to get one value per dword
+ vpdpwssd m2, m16, m16 ; -p: adds the squared rounded b onto the product above
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b, placed in the upper word of each dword
+ punpckhwd m17, m6, m1
+ pminsd m2, m6 ; signed minimum against m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2 ; bytes selected by k2 (set outside this block) take m2 rotated right by 2 bytes per lane, the rest keep m3
+ mova m2, m20 ; working copy of m20, because vpermt2b overwrites its first operand
+ paddusw m3, m12 ; unsigned saturating add of m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3 ; k3 = top bit of every index byte
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x: bytes whose index had the top bit set take the [128..255] lookup
+ pandn m2, m13, m3 ; m2 = m3 & ~m13
+ psrld m3, 16 ; m3 = upper word of each dword
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3 ; narrow both x vectors to words for the t4 store
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*2+4], m2 ; x goes to the t4 row at 416*2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*4+ 8], xm16 ; t3 row at 416*4: the 128-bit lanes of m16 and m17 are stored alternately
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .v1_loop ; repeat while the offset is still negative
+ mov r10, t2 ; swap the t1 and t2 row pointers before returning
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup: precomputes the horizontally weighted rows that .n0/.n1 later add to
+ mov r10, wq ; r10 = column byte offset; the loop runs until it is no longer negative
+.prep_n_loop: ; 32 bytes of the t4 word rows and 64 bytes of the t3 dword rows per iteration
+ mova ym16, [t4+r10*1+416*0+0] ; t4 row at 416*0: left neighbor
+ paddw ym16, [t4+r10*1+416*0+4] ; + right neighbor
+ paddw ym17, ym16, [t4+r10*1+416*0+2] ; + center
+ mova m0, [t3+r10*2+416*0+0] ; same three-tap sum on the t3 row at 416*0 (dwords)
+ paddd m0, [t3+r10*2+416*0+8]
+ paddd m1, m0, [t3+r10*2+416*0+4]
+ psllw ym17, 2 ; a[-1] 444
+ pslld m1, 2 ; b[-1] 444
+ psubw ym17, ym16 ; a[-1] 343
+ psubd m1, m0 ; b[-1] 343
+ vmovdqa32 [t4+r10*1+416* 4], ym17 ; keep the 343 sums of this row
+ vmovdqa32 [t3+r10*2+416* 8], m1
+ mova ym16, [t4+r10*1+416*2+0] ; repeat for the t4 row at 416*2 and the t3 row at 416*4
+ paddw ym16, [t4+r10*1+416*2+4]
+ paddw ym17, ym16, [t4+r10*1+416*2+2]
+ mova m0, [t3+r10*2+416*4+0]
+ paddd m0, [t3+r10*2+416*4+8]
+ paddd m1, m0, [t3+r10*2+416*4+4]
+ psllw ym17, 2 ; a[ 0] 444
+ pslld m1, 2 ; b[ 0] 444
+ vmovdqa32 [t4+r10*1+416* 6], ym17 ; for this row both the 444 sums ...
+ vmovdqa32 [t3+r10*2+416*12], m1
+ psubw ym17, ym16 ; a[ 0] 343
+ psubd m1, m0 ; b[ 0] 343
+ vmovdqa32 [t4+r10*1+416* 8], ym17 ; ... and the 343 sums are kept
+ vmovdqa32 [t3+r10*2+416*16], m1
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows): finishes the weighted a/b sums and writes one filtered row to dst
+ mov r10, wq ; r10 = column byte offset; the loop runs until it is no longer negative
+.n0_loop: ; 64 bytes of dst per iteration
+ mova m3, [t4+r10*1+416*0+0] ; t4 row at 416*0: left neighbor
+ paddw m3, [t4+r10*1+416*0+4] ; + right neighbor
+ paddw m1, m3, [t4+r10*1+416*0+2] ; + center
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+416*4] ; add the two weighted rows kept at 416*4 and 416*6
+ paddw m3, [t4+r10*1+416*6]
+ mova [t4+r10*1+416*4], m2 ; keep this row's 343 and 444 sums in their place
+ mova [t4+r10*1+416*6], m1
+ mova m16, [t3+r10*2+416*0+0] ; same scheme for b on the t3 dword rows, first 64 bytes
+ paddd m16, [t3+r10*2+416*0+8]
+ paddd m1, m16, [t3+r10*2+416*0+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m16 ; b[ 1] 343
+ paddd m16, m2, [t3+r10*2+416* 8+ 0]
+ paddd m16, [t3+r10*2+416*12+ 0]
+ mova [t3+r10*2+416* 8+ 0], m2
+ mova [t3+r10*2+416*12+ 0], m1
+ mova m17, [t3+r10*2+416*0+64] ; second 64 bytes of the b rows
+ paddd m17, [t3+r10*2+416*0+72]
+ paddd m1, m17, [t3+r10*2+416*0+68]
+ pslld m1, 2
+ psubd m2, m1, m17
+ paddd m17, m2, [t3+r10*2+416* 8+64]
+ paddd m17, [t3+r10*2+416*12+64]
+ mova [t3+r10*2+416* 8+64], m2
+ mova [t3+r10*2+416*12+64], m1
+ mova m0, [dstq+r10] ; src pixels, read from dst
+ punpcklwd m1, m0, m6 ; interleave src and a with m6 (assumed all-zero; set up outside this block)
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020 ; 128-bit lanes 0 and 2 of m16 and m17, matching the punpckl products
+ vshufi32x4 m16, m17, q3131 ; 128-bit lanes 1 and 3, matching the punpckh products
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7 ; scale by m7 (loaded outside this block)
+ paddw m0, m1 ; add the correction to the source pixels
+ pmaxsw m0, m6 ; clamp between m6 and m14
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n0_loop
+ add dstq, strideq ; advance dst to the next row
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows): finishes the weighted a/b sums and writes one filtered row to dst
+ mov r10, wq ; r10 = column byte offset; the loop runs until it is no longer negative
+.n1_loop: ; 64 bytes of dst per iteration
+ mova m3, [t4+r10*1+416*2+0] ; t4 row at 416*2: left neighbor
+ paddw m3, [t4+r10*1+416*2+4] ; + right neighbor
+ paddw m1, m3, [t4+r10*1+416*2+2] ; + center
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+416*6] ; add the two weighted rows kept at 416*6 and 416*8
+ paddw m3, [t4+r10*1+416*8]
+ mova [t4+r10*1+416*6], m1 ; keep this row's 444 and 343 sums in their place
+ mova [t4+r10*1+416*8], m2
+ mova m16, [t3+r10*2+416*4+0] ; same scheme for b on the t3 dword rows, first 64 bytes
+ paddd m16, [t3+r10*2+416*4+8]
+ paddd m1, m16, [t3+r10*2+416*4+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m16 ; b[ 1] 343
+ paddd m16, m2, [t3+r10*2+416*12+ 0]
+ paddd m16, [t3+r10*2+416*16+ 0]
+ mova [t3+r10*2+416*12+ 0], m1
+ mova [t3+r10*2+416*16+ 0], m2
+ mova m17, [t3+r10*2+416*4+64] ; second 64 bytes of the b rows
+ paddd m17, [t3+r10*2+416*4+72]
+ paddd m1, m17, [t3+r10*2+416*4+68]
+ pslld m1, 2
+ psubd m2, m1, m17
+ paddd m17, m2, [t3+r10*2+416*12+64]
+ paddd m17, [t3+r10*2+416*16+64]
+ mova [t3+r10*2+416*12+64], m1
+ mova [t3+r10*2+416*16+64], m2
+ mova m0, [dstq+r10] ; src pixels, read from dst
+ punpcklwd m1, m0, m6 ; interleave src and a with m6 (assumed all-zero; set up outside this block)
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020 ; 128-bit lanes 0 and 2 of m16 and m17, matching the punpckl products
+ vshufi32x4 m16, m17, q3131 ; 128-bit lanes 1 and 3, matching the punpckh products
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7 ; scale by m7 (loaded outside this block)
+ paddw m0, m1 ; add the correction to the source pixels
+ pmaxsw m0, m6 ; clamp between m6 and m14
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n1_loop
+ add dstq, strideq ; advance dst to the next row
+ ret
+
+cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm ; entry: load constants, set up the stack row buffers, then drive the .h/.hv*/.v*/.n* row helpers
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72] ; r13 = base of the mask table used for right-edge extension
+ mov edged, r7m
+ movifnidn hd, hm
+ vpbroadcastd m7, [paramsq+8] ; w0 w1
+ pxor m6, m6 ; m6 = 0 for the rest of the function
+ vpbroadcastd m8, [base+pd_8]
+ add wd, wd ; w is doubled
+ vpbroadcastd m9, [base+pd_m9]
+ add lpfq, wq ; lpf and dst are advanced by the doubled width; columns are then addressed with negative offsets
+ vpbroadcastd m10, [base+pd_m25]
+ add dstq, wq
+ vpsubd m11, m6, [paramsq+0] {1to16} ; -s0
+ lea t3, [rsp+wq*2+416*24+8] ; t3: dword row buffers on the stack
+ vpsubd m12, m6, [paramsq+4] {1to16} ; -s1
+ lea t4, [rsp+wq+416*52+8] ; t4: word row buffers on the stack
+ vpbroadcastd m13, [base+pw_164_455]
+ lea t1, [rsp+wq+12] ; t1: box-sum row buffers on the stack
+ vpbroadcastd m14, [base+pw_61448]
+ neg wq ; wq = -(doubled width), so loop offsets count up towards 0
+ vpbroadcastd m15, [base+pd_m34816]
+ psllw m7, 2 ; both weights are scaled by 4
+ vpbroadcastd m22, [base+pd_2147483648]
+ mov r10d, 0xfffffff8
+ mova m18, [sgr_x_by_x+64*0] ; m18-m21 = four consecutive 64-byte chunks of sgr_x_by_x
+ kmovd k1, r10d ; k1: low 32 mask bits set except bits 0-2
+ mova m19, [sgr_x_by_x+64*1]
+ mov r10, 0x3333333333333333
+ mova m20, [sgr_x_by_x+64*2]
+ kmovq k2, r10 ; k2: the two low bits of every group of four
+ mova m21, [sgr_x_by_x+64*3]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top ; horizontal sums of the first lpf row
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup ; reuses the .top_fixup routine of the 5x5 function
+ add t1, 416*12
+ call .h_top ; horizontal sums of the second lpf row
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq ; rows are read from dst from here on
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1 ; taken when h was 1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom ; hd went negative
+.main_loop: ; up to two rows are read and two rows are output per iteration
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp] ; row pointer saved at [rsp] during setup
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom: ; no further rows are read; .v0/.v1 work from the row buffers only
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq ; rows are read from dst from here on
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10 ; saved for the later .hv0_bottom call
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+416*12]
+.top_fixup_loop: ; fill the t2 rows from the t1 rows that .h just wrote
+ mova m0, [t1+r10+416* 0]
+ mova m1, [t1+r10+416* 2]
+ mova m2, [t1+r10+416* 4]
+ paddw m0, m0 ; the rows at 416*0/2/4 are doubled
+ mova m3, [t1+r10+416* 6] ; the rows at 416*6/8/10 are copied unchanged
+ paddd m1, m1
+ mova m4, [t1+r10+416* 8]
+ paddd m2, m2
+ mova m5, [t1+r10+416*10]
+ mova [t2+r10+416* 0], m0
+ mova [t2+r10+416* 2], m1
+ mova [t2+r10+416* 4], m2
+ mova [t2+r10+416* 6], m3
+ mova [t2+r10+416* 8], m4
+ mova [t2+r10+416*10], m5
+ add r10, 64
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum: writes 3-tap and 5-tap sums and sums of squares of one row into the t1 rows
+ lea r10, [wq-4] ; r10 = column byte offset, starting 4 bytes before wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm16, [leftq+2] ; 8 bytes from the left buffer
+ vmovdqu16 m16{k1}, [lpfq+wq-6] ; words selected by k1 are replaced by row data, the first three words stay
+ add leftq, 8 ; step the left pointer by 8 bytes
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq] ; replicate the word at lpfq+wq in place of missing left pixels
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .h_main
+.h_top: ; entry that never reads the left buffer
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop: ; 64 bytes of the row per iteration
+ movu m16, [lpfq+r10- 2]
+.h_main:
+ movu m17, [lpfq+r10+14] ; the same row, 16 bytes further on
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right ; offsets below -68 skip the right-edge extension
+ vpbroadcastw m0, [lpfq-2] ; the word just before lpfq
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; bitwise select: set mask bits keep the loaded data, clear bits take m0
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m3, m17, m16, 2 ; row shifted by 1 word
+ palignr m0, m17, m16, 4 ; row shifted by 2 words
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2 ; squares of both taps, summed per dword
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m17, m16, 6 ; row shifted by 3 words
+ paddw m1, m0 ; sum3
+ punpcklwd m4, m0, m6
+ vpdpwssd m2, m4, m4 ; sumsq3
+ punpckhwd m0, m6
+ vpdpwssd m3, m0, m0
+ shufpd m4, m16, m17, 0x55 ; row shifted by 4 words (high qword of m16, low qword of m17 per lane)
+ punpcklwd m17, m4, m16 ; pair the two outer taps for squaring
+ paddw m0, m16, m4 ; sum of the two outer taps
+ punpckhwd m4, m16
+ mova [t1+r10+416* 6], m1 ; 3-tap results go to the t1 rows at 416*6/8/10
+ mova [t1+r10+416* 8], m2
+ mova [t1+r10+416*10], m3
+ paddw m1, m0 ; sum5
+ vpdpwssd m2, m17, m17 ; sumsq5
+ vpdpwssd m3, m4, m4
+ mova [t1+r10+416* 0], m1 ; 5-tap results go to the t1 rows at 416*0/2/4
+ mova [t1+r10+416* 2], m2
+ mova [t1+r10+416* 4], m3
+ add r10, 64
+ jl .h_loop ; repeat while the offset is still negative
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+ lea r10, [wq-4] ; r10 = column byte offset, starting 4 bytes before wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movq xm16, [leftq+2] ; 8 bytes from the left buffer
+ vmovdqu16 m16{k1}, [lpfq+wq-6] ; words selected by k1 are replaced by row data, the first three words stay
+ add leftq, 8 ; step the left pointer by 8 bytes
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastw xm16, [lpfq+wq] ; replicate the word at lpfq+wq in place of missing left pixels
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv0_main
+.hv0_bottom: ; entry that never reads the left buffer
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop: ; 64 bytes of the row per iteration
+ movu m16, [lpfq+r10- 2]
+.hv0_main:
+ movu m17, [lpfq+r10+14] ; the same row, 16 bytes further on
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -68
+ jl .hv0_have_right ; offsets below -68 skip the right-edge extension
+ vpbroadcastw m0, [lpfq-2] ; the word just before lpfq
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; bitwise select: set mask bits keep the loaded data, clear bits take m0
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv0_have_right:
+ palignr m3, m17, m16, 2 ; row shifted by 1 word
+ palignr m0, m17, m16, 4 ; row shifted by 2 words
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2 ; squares of both taps, summed per dword
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m17, m16, 6 ; row shifted by 3 words
+ paddw m1, m0 ; h sum3
+ punpcklwd m4, m0, m6
+ vpdpwssd m2, m4, m4 ; h sumsq3
+ punpckhwd m0, m6
+ vpdpwssd m3, m0, m0
+ shufpd m17, m16, m17, 0x55 ; row shifted by 4 words (high qword of m16, low qword of m17 per lane)
+ paddw m4, m1, [t1+r10+416* 6] ; m4/m5/m0 = this row's 3-tap sums + the t1 row
+ paddd m5, m2, [t1+r10+416* 8]
+ mova [t1+r10+416* 6], m1 ; the t1 3-tap rows now hold this row alone
+ mova [t1+r10+416* 8], m2
+ paddw m1, m16
+ paddw m1, m17 ; h sum5
+ punpcklwd m0, m17, m16 ; pair the two outer taps for squaring
+ vpdpwssd m2, m0, m0 ; h sumsq5
+ paddd m0, m3, [t1+r10+416*10]
+ mova [t1+r10+416*10], m3
+ punpckhwd m17, m16
+ vpdpwssd m3, m17, m17
+ mova [t3+r10*2+416*8+ 8], m1 ; we need a clean copy of the last row
+ mova [t3+r10*2+416*0+ 8], m2 ; in case height is odd
+ mova [t3+r10*2+416*0+72], m3
+ paddw m1, [t1+r10+416* 0] ; accumulate the 5-tap sums into the t1 rows at 416*0/2/4
+ paddd m2, [t1+r10+416* 2]
+ paddd m3, [t1+r10+416* 4]
+ mova [t1+r10+416* 0], m1
+ mova [t1+r10+416* 2], m2
+ mova [t1+r10+416* 4], m3
+ paddw m17, m4, [t2+r10+416* 6] ; b3/a3 = two-row 3-tap sums + the t2 row
+ paddd m2, m5, [t2+r10+416* 8]
+ paddd m3, m0, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m4 ; the t2 3-tap rows now hold the two-row sums
+ mova [t2+r10+416* 8], m5
+ mova [t2+r10+416*10], m0
+ paddd m2, m8 ; m8 = pd_8
+ paddd m3, m8
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m5, m17, 1 ; b3 >> 1, rounded and halved again by the pavgw below
+ pavgw m5, m6 ; (b3 + 2) >> 2
+ punpcklwd m4, m5, m6 ; zero-extend to dwords
+ vpdpwssd m2, m4, m4 ; -p3
+ punpckhwd m5, m6
+ vpdpwssd m3, m5, m5
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m2, m6 ; keep -p3 at or below 0
+ pminsd m3, m6
+ pmulld m2, m12 ; p3 * s1
+ pmulld m3, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2 ; low word of each dword <- upper word of m2; upper word stays from m3
+ mova m2, m20 ; working copy of m20, because vpermt2b overwrites its first operand
+ paddusw m3, m14 ; unsigned saturating add of pw_61448
+ psraw m3, 4 ; min(z3, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3 ; k3 = top bit of every index byte
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x3
+ pandn m2, m15, m3 ; m2 = m3 & ~m15
+ psrld m3, 16 ; m3 = upper word of each dword
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3 ; narrow both x3 vectors to words for the t4 store
+ mova [t4+r10*1+416*2+4], m2 ; x3 goes to the t4 row at 416*2
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*4+ 8], xm16 ; t3 row at 416*4: the 128-bit lanes of m16 and m17 are stored alternately
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .hv0_loop ; repeat while the offset is still negative
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4] ; r10 = column byte offset, starting 4 bytes before wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movq xm16, [leftq+2] ; 8 bytes from the left buffer
+ vmovdqu16 m16{k1}, [lpfq+wq-6] ; words selected by k1 are replaced by row data, the first three words stay
+ add leftq, 8 ; step the left pointer by 8 bytes
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastw xm16, [lpfq+wq] ; replicate the word at lpfq+wq in place of missing left pixels
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv1_main
+.hv1_bottom: ; entry that never reads the left buffer
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop: ; 64 bytes of the row per iteration
+ movu m16, [lpfq+r10- 2]
+.hv1_main:
+ movu m17, [lpfq+r10+14] ; the same row, 16 bytes further on
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -68
+ jl .hv1_have_right ; offsets below -68 skip the right-edge extension
+ vpbroadcastw m0, [lpfq-2] ; the word just before lpfq
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; bitwise select: set mask bits keep the loaded data, clear bits take m0
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv1_have_right:
+ palignr m1, m17, m16, 2 ; row shifted by 1 word
+ palignr m3, m17, m16, 4 ; row shifted by 2 words
+ paddw m2, m1, m3
+ punpcklwd m0, m1, m3
+ pmaddwd m0, m0 ; squares of both taps, summed per dword
+ punpckhwd m1, m3
+ pmaddwd m1, m1
+ palignr m3, m17, m16, 6 ; row shifted by 3 words
+ paddw m2, m3 ; h sum3
+ punpcklwd m5, m3, m6
+ vpdpwssd m0, m5, m5 ; h sumsq3
+ punpckhwd m3, m6
+ vpdpwssd m1, m3, m3
+ shufpd m3, m16, m17, 0x55 ; row shifted by 4 words (high qword of m16, low qword of m17 per lane)
+ punpcklwd m5, m16, m3 ; pair the two outer taps for squaring
+ paddw m4, m16, m3 ; sum of the two outer taps
+ punpckhwd m16, m3
+ paddw m17, m2, [t2+r10+416* 6] ; b3 = this row's sum3 + the t2 row
+ mova [t2+r10+416* 6], m2 ; the t2 3-tap rows now hold this row alone
+ paddw m4, m2 ; h sum5
+ paddd m2, m0, [t2+r10+416* 8] ; a3 = this row's sumsq3 + the t2 rows
+ paddd m3, m1, [t2+r10+416*10]
+ mova [t2+r10+416* 8], m0
+ mova [t2+r10+416*10], m1
+ vpdpwssd m0, m5, m5 ; h sumsq5
+ vpdpwssd m1, m16, m16
+ paddd m2, m8 ; m8 = pd_8
+ paddd m3, m8
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m16, m17, 1 ; b3 >> 1, rounded and halved again by the pavgw below
+ pavgw m16, m6 ; (b3 + 2) >> 2
+ punpcklwd m5, m16, m6 ; zero-extend to dwords
+ vpdpwssd m2, m5, m5 ; -p3
+ punpckhwd m16, m6
+ vpdpwssd m3, m16, m16
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m2, m6 ; keep -p3 at or below 0
+ pminsd m3, m6
+ pmulld m2, m12 ; p3 * s1
+ pmulld m3, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2 ; low word of each dword <- upper word of m2; upper word stays from m3
+ mova m2, m20 ; working copy of m20, because vpermt2b overwrites its first operand
+ paddusw m3, m14 ; unsigned saturating add of pw_61448
+ psraw m3, 4 ; min(z3, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3 ; k3 = top bit of every index byte
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x3
+ pandn m2, m15, m3 ; m2 = m3 & ~m15
+ psrld m3, 16 ; m3 = upper word of each dword
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3 ; narrow both x3 vectors to words for the t4 store
+ mova [t4+r10*1+416*4+4], m2 ; x3 goes to the t4 row at 416*4
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ paddw m5, m4, [t2+r10+416*0] ; b5/a5 = this row's 5-tap sums + the t2 rows + the t1 rows
+ paddd m2, m0, [t2+r10+416*2]
+ paddd m3, m1, [t2+r10+416*4]
+ paddw m5, [t1+r10+416*0]
+ paddd m2, [t1+r10+416*2]
+ paddd m3, [t1+r10+416*4]
+ mova [t2+r10+416*0], m4 ; the t2 5-tap rows now hold this row alone
+ mova [t2+r10+416*2], m0
+ mova [t2+r10+416*4], m1
+ mova [t3+r10*2+416*8+ 8], xm16 ; t3 row at 416*8: the 128-bit lanes of m16 and m17 are stored alternately
+ mova [t3+r10*2+416*8+ 24], xm17
+ vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*8+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*8+120], m17, 3
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m10 ; -((a5 + 8) >> 4) * 25
+ pmulld m3, m10
+ psrlw m17, m5, 1 ; b5 >> 1, rounded and halved again by the pavgw below
+ pavgw m17, m6 ; (b5 + 2) >> 2
+ punpcklwd m16, m17, m6 ; zero-extend to dwords
+ vpdpwssd m2, m16, m16 ; -p5
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m5, m6 ; b5
+ punpckhwd m17, m5, m6
+ pmulld m2, m11 ; p5 * s0
+ pmulld m3, m11
+ pmaddwd m16, m13 ; b5 * 164
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2 ; low word of each dword <- upper word of m2; upper word stays from m3
+ mova m2, m20 ; working copy of m20, because vpermt2b overwrites its first operand
+ pmaxsw m3, m6 ; signed word maximum against 0
+ paddusw m3, m14 ; unsigned saturating add of pw_61448
+ psraw m3, 4 ; min(z5, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3 ; k3 = top bit of every index byte
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x5
+ pandn m2, m15, m3 ; m2 = m3 & ~m15
+ psrld m3, 16 ; m3 = upper word of each dword
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3 ; narrow both x5 vectors to words for the t4 store
+ mova [t4+r10*1+416*0+4], m2 ; x5 goes to the t4 row at 416*0
+ psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16 ; t3 row at 416*0: the 128-bit lanes of m16 and m17 are stored alternately
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .hv1_loop ; repeat while the offset is still negative
+ mov r10, t2 ; swap the t1 and t2 row pointers before returning
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows); works only on the t1/t2/t3 buffers, no source row is read
+ lea r10, [wq-4] ; r10 = column byte offset, starting 4 bytes before wq
+.v0_loop: ; 64 bytes of the word-sum rows per iteration
+ mova m16, [t1+r10+416* 6] ; t1 3-tap rows: sum3 and sumsq3
+ mova m2, [t1+r10+416* 8]
+ mova m3, [t1+r10+416*10]
+ paddw m16, m16 ; the t1 row is counted twice
+ paddd m2, m2
+ paddd m3, m3
+ paddw m17, m16, [t2+r10+416* 6] ; b3/a3 = 2*t1 + t2
+ paddd m4, m2, [t2+r10+416* 8]
+ paddd m5, m3, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m16 ; the t2 3-tap rows <- doubled t1 rows
+ mova [t2+r10+416* 8], m2
+ mova [t2+r10+416*10], m3
+ paddd m4, m8 ; m8 = pd_8
+ paddd m5, m8
+ psrld m4, 4 ; (a3 + 8) >> 4
+ psrld m5, 4
+ pmulld m4, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m5, m9
+ psrlw m3, m17, 1 ; b3 >> 1, rounded and halved again by the pavgw below
+ pavgw m3, m6 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m6 ; zero-extend to dwords
+ vpdpwssd m4, m2, m2 ; -p3
+ punpckhwd m3, m6
+ vpdpwssd m5, m3, m3
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m4, m6 ; keep -p3 at or below 0
+ pminsd m5, m6
+ pmulld m4, m12 ; p3 * s1
+ pmulld m5, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m5{k2}, m4, m4, 2 ; low word of each dword <- upper word of m4; upper word stays from m5
+ mova m4, m20 ; working copy of m20, because vpermt2b overwrites its first operand
+ paddusw m5, m14 ; unsigned saturating add of pw_61448
+ psraw m5, 4 ; min(z3, 255) - 256
+ vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m5 ; k3 = top bit of every index byte
+ vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m5{k3}, m4 ; x3
+ pandn m4, m15, m5 ; m4 = m5 & ~m15
+ psrld m5, 16 ; m5 = upper word of each dword
+ pmulld m16, m4
+ pmulld m17, m5
+ packssdw m4, m5 ; narrow both x3 vectors to words for the t4 store
+ mova [t4+r10*1+416*2+4], m4 ; x3 goes to the t4 row at 416*2
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova m3, [t1+r10+416*0] ; t1 5-tap rows
+ mova m4, [t1+r10+416*2]
+ mova m5, [t1+r10+416*4]
+ mova [t3+r10*2+416*8+ 8], m3 ; undoubled copy parked in t3, at the same slots .hv0 uses for its clean copy
+ mova [t3+r10*2+416*0+ 8], m4
+ mova [t3+r10*2+416*0+72], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+r10+416*0], m3 ; the t1 5-tap rows now hold the doubled values
+ mova [t1+r10+416*2], m4
+ mova [t1+r10+416*4], m5
+ mova [t3+r10*2+416*4+ 8], xm16 ; t3 row at 416*4: the 128-bit lanes of m16 and m17 are stored alternately
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .v0_loop ; repeat while the offset is still negative
+ ret
+.v1: ; vertical boxsums + ab (odd rows); works only on the t1/t2/t3 buffers, no source row is read
+ lea r10, [wq-4] ; r10 = column byte offset, starting 4 bytes before wq
+.v1_loop: ; 64 bytes of the word-sum rows per iteration
+ mova m16, [t1+r10+416* 6] ; t1 3-tap rows: sum3 and sumsq3
+ mova m2, [t1+r10+416* 8]
+ mova m3, [t1+r10+416*10]
+ paddw m17, m16, [t2+r10+416* 6] ; b3/a3 = t1 + t2
+ paddd m4, m2, [t2+r10+416* 8]
+ paddd m5, m3, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m16 ; the t2 3-tap rows <- copy of the t1 rows
+ mova [t2+r10+416* 8], m2
+ mova [t2+r10+416*10], m3
+ paddd m4, m8 ; m8 = pd_8
+ paddd m5, m8
+ psrld m4, 4 ; (a3 + 8) >> 4
+ psrld m5, 4
+ pmulld m4, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m5, m9
+ psrlw m3, m17, 1 ; b3 >> 1, rounded and halved again by the pavgw below
+ pavgw m3, m6 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m6 ; zero-extend to dwords
+ vpdpwssd m4, m2, m2 ; -p3
+ punpckhwd m3, m6
+ vpdpwssd m5, m3, m3
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m4, m6 ; keep -p3 at or below 0
+ pminsd m5, m6
+ pmulld m4, m12 ; p3 * s1
+ pmulld m5, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m5{k2}, m4, m4, 2 ; low word of each dword <- upper word of m4; upper word stays from m5
+ mova m4, m20 ; working copy of m20, because vpermt2b overwrites its first operand
+ paddusw m5, m14 ; unsigned saturating add of pw_61448
+ psraw m5, 4 ; min(z3, 255) - 256
+ vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m5 ; k3 = top bit of every index byte
+ vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m5{k3}, m4 ; x3
+ pandn m4, m15, m5 ; m4 = m5 & ~m15
+ psrld m5, 16 ; m5 = upper word of each dword
+ pmulld m16, m4
+ pmulld m17, m5
+ packssdw m4, m5 ; narrow both x3 vectors to words for the t4 store
+ mova [t4+r10*1+416*4+4], m4 ; x3 goes to the t4 row at 416*4
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova m0, [t3+r10*2+416*8+ 8] ; reload the row copy that .hv0/.v0 parked in t3
+ mova m4, [t3+r10*2+416*0+ 8]
+ mova m5, [t3+r10*2+416*0+72]
+ paddw m1, m0, [t2+r10+416*0] ; b5/a5 = parked row + the t2 rows + the t1 rows
+ paddd m2, m4, [t2+r10+416*2]
+ paddd m3, m5, [t2+r10+416*4]
+ paddw m1, [t1+r10+416*0]
+ paddd m2, [t1+r10+416*2]
+ paddd m3, [t1+r10+416*4]
+ mova [t2+r10+416*0], m0 ; the t2 5-tap rows <- the parked row
+ mova [t2+r10+416*2], m4
+ mova [t2+r10+416*4], m5
+ mova [t3+r10*2+416*8+ 8], xm16 ; t3 row at 416*8: the 128-bit lanes of m16 and m17 are stored alternately
+ mova [t3+r10*2+416*8+ 24], xm17
+ vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*8+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*8+120], m17, 3
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m10 ; -((a5 + 8) >> 4) * 25
+ pmulld m3, m10
+ psrlw m5, m1, 1 ; b5 >> 1, rounded and halved again by the pavgw below
+ pavgw m5, m6 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m6 ; zero-extend to dwords
+ vpdpwssd m2, m4, m4 ; -p5
+ punpckhwd m5, m6
+ vpdpwssd m3, m5, m5
+ punpcklwd m16, m1, m6 ; b5
+ punpckhwd m17, m1, m6
+ pmulld m2, m11 ; p5 * s0
+ pmulld m3, m11
+ pmaddwd m16, m13 ; b5 * 164
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2 ; low word of each dword <- upper word of m2; upper word stays from m3
+ mova m2, m20 ; working copy of m20, because vpermt2b overwrites its first operand
+ pmaxsw m3, m6 ; signed word maximum against 0
+ paddusw m3, m14 ; unsigned saturating add of pw_61448
+ psraw m3, 4 ; min(z5, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3 ; k3 = top bit of every index byte
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x5
+ pandn m2, m15, m3 ; m2 = m3 & ~m15
+ psrld m3, 16 ; m3 = upper word of each dword
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3 ; narrow both x5 vectors to words for the t4 store
+ mova [t4+r10*1+416*0+4], m2 ; x5 goes to the t4 row at 416*0
+ psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16 ; t3 row at 416*0: the 128-bit lanes of m16 and m17 are stored alternately
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .v1_loop ; repeat while the offset is still negative
+ mov r10, t2 ; swap the t1 and t2 row pointers before returning
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup: precomputes the horizontally weighted rows that .n0/.n1 later add to
+ mov r10, wq ; r10 = column byte offset; the loop runs until it is no longer negative
+.prep_n_loop: ; 32 bytes of the t4 word rows and 64 bytes of the t3 dword rows per iteration
+ movu ym0, [t4+r10*1+416*0+2] ; t4 row at 416*0: center
+ paddw ym2, ym0, [t4+r10*1+416*0+0] ; + left neighbor
+ paddw ym2, [t4+r10*1+416*0+4] ; + right neighbor
+ movu m1, [t3+r10*2+416*0+4] ; same taps on the t3 row at 416*0 (dwords)
+ paddd m3, m1, [t3+r10*2+416*0+0]
+ paddd m3, [t3+r10*2+416*0+8]
+ paddw ym0, ym2 ; center + three-tap sum
+ paddd m1, m3
+ psllw ym2, 2 ; 4 * three-tap sum
+ pslld m3, 2
+ paddw ym0, ym2 ; a5 565
+ paddd m1, m3 ; b5 565
+ mova [t4+r10*1+416* 6], ym0 ; keep the 565 sums for .n0/.n1
+ mova [t3+r10*2+416*12], m1
+ mova ym0, [t4+r10*1+416*2+0] ; t4 row at 416*2 and t3 row at 416*4: left + right, then + center
+ paddw ym0, [t4+r10*1+416*2+4]
+ paddw ym2, ym0, [t4+r10*1+416*2+2]
+ mova m1, [t3+r10*2+416*4+0]
+ paddd m1, [t3+r10*2+416*4+8]
+ paddd m3, m1, [t3+r10*2+416*4+4]
+ psllw ym2, 2 ; a3[-1] 444
+ pslld m3, 2 ; b3[-1] 444
+ psubw ym2, ym0 ; a3[-1] 343
+ psubd m3, m1 ; b3[-1] 343
+ mova [t4+r10*1+416* 8], ym2 ; keep the 343 sums of this row
+ mova [t3+r10*2+416*16], m3
+ mova ym0, [t4+r10*1+416*4+0] ; t4 row at 416*4 and t3 row at 416*8: left + right, then + center
+ paddw ym0, [t4+r10*1+416*4+4]
+ paddw ym2, ym0, [t4+r10*1+416*4+2]
+ mova m1, [t3+r10*2+416*8+0]
+ paddd m1, [t3+r10*2+416*8+8]
+ paddd m3, m1, [t3+r10*2+416*8+4]
+ psllw ym2, 2 ; a3[ 0] 444
+ pslld m3, 2 ; b3[ 0] 444
+ mova [t4+r10*1+416*10], ym2 ; for this row both the 444 sums ...
+ mova [t3+r10*2+416*20], m3
+ psubw ym2, ym0 ; a3[ 0] 343
+ psubd m3, m1 ; b3[ 0] 343
+ mova [t4+r10*1+416*12], ym2 ; ... and the 343 sums are kept
+ mova [t3+r10*2+416*24], m3
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows): combines the 5x5 and 3x3 terms and writes one filtered row to dst
+ mov r10, wq ; r10 = column byte offset; the loop runs until it is no longer negative
+.n0_loop: ; 32 bytes of dst per iteration
+ movu ym2, [t4+r10*1+2] ; t4 row at offset 0: center
+ paddw ym0, ym2, [t4+r10*1+0] ; + left neighbor
+ paddw ym0, [t4+r10*1+4] ; + right neighbor
+ paddw ym2, ym0 ; center + three-tap sum
+ psllw ym0, 2 ; 4 * three-tap sum
+ paddw ym0, ym2 ; a5
+ movu m1, [t3+r10*2+4] ; same taps on the t3 row at offset 0 (dwords)
+ paddd m4, m1, [t3+r10*2+0]
+ paddd m4, [t3+r10*2+8]
+ paddd m1, m4
+ pslld m4, 2
+ paddd m4, m1 ; b5
+ paddw ym2, ym0, [t4+r10*1+416* 6] ; add the a5 row kept at 416*6, then replace it with the new one
+ mova [t4+r10*1+416* 6], ym0
+ paddd m0, m4, [t3+r10*2+416*12] ; likewise for b5 at 416*12
+ mova [t3+r10*2+416*12], m4
+ mova ym3, [t4+r10*1+416*2+0] ; t4 row at 416*2: left + right, then + center
+ paddw ym3, [t4+r10*1+416*2+4]
+ paddw ym5, ym3, [t4+r10*1+416*2+2]
+ psllw ym5, 2 ; a3[ 1] 444
+ psubw ym4, ym5, ym3 ; a3[ 1] 343
+ paddw ym3, ym4, [t4+r10*1+416* 8] ; add the two weighted rows kept at 416*8 and 416*10
+ paddw ym3, [t4+r10*1+416*10]
+ mova [t4+r10*1+416* 8], ym4 ; keep this row's 343 and 444 sums in their place
+ mova [t4+r10*1+416*10], ym5
+ mova m1, [t3+r10*2+416*4+0] ; same scheme for b3 on the t3 dword rows
+ paddd m1, [t3+r10*2+416*4+8]
+ paddd m5, m1, [t3+r10*2+416*4+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+416*16]
+ paddd m1, [t3+r10*2+416*20]
+ mova [t3+r10*2+416*16], m4
+ mova [t3+r10*2+416*20], m5
+ pmovzxwd m4, [dstq+r10] ; src pixels, read from dst and zero-extended to dwords
+ pmovzxwd m2, ym2 ; a5
+ pmovzxwd m3, ym3 ; a3
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ vpshldd m4, m22, 13 ; (src << 13) with the top 13 bits of m22 shifted in below
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ vpblendmb m0{k2}, m1, m0 ; low word of each dword from m0, upper word from m1
+ vpdpwssd m4, m0, m7 ; add the sum of both words times the two weights in m7
+ psrad m4, 7
+ pmaxsd m4, m6 ; negative results become 0
+ vpmovusdw ym16, m4 ; clip
+ psrlw ym16, 6
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq ; advance dst to the next row
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows): combines the 5x5 and 3x3 terms and writes one filtered row to dst
+ mov r10, wq ; r10 = column byte offset; the loop runs until it is no longer negative
+.n1_loop: ; 32 bytes of dst per iteration
+ mova ym3, [t4+r10*1+416*4+0] ; t4 row at 416*4: left + right, then + center
+ paddw ym3, [t4+r10*1+416*4+4]
+ paddw ym5, ym3, [t4+r10*1+416*4+2]
+ psllw ym5, 2 ; a3[ 1] 444
+ psubw ym4, ym5, ym3 ; a3[ 1] 343
+ paddw ym3, ym4, [t4+r10*1+416*12] ; add the two weighted rows kept at 416*12 and 416*10
+ paddw ym3, [t4+r10*1+416*10]
+ mova [t4+r10*1+416*10], ym5 ; keep this row's 444 and 343 sums in their place
+ mova [t4+r10*1+416*12], ym4
+ mova m0, [t3+r10*2+416*8+0] ; same scheme for b3 on the t3 dword rows
+ paddd m0, [t3+r10*2+416*8+8]
+ paddd m5, m0, [t3+r10*2+416*8+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m0 ; b3[ 1] 343
+ paddd m0, m4, [t3+r10*2+416*24]
+ paddd m0, [t3+r10*2+416*20]
+ mova [t3+r10*2+416*20], m5
+ mova [t3+r10*2+416*24], m4
+ pmovzxwd m4, [dstq+r10] ; src pixels, read from dst and zero-extended to dwords
+ pmovzxwd m2, [t4+r10*1+416* 6] ; a5: the single row kept at 416*6, nothing is added to it here
+ pmovzxwd m3, ym3
+ mova m1, [t3+r10*2+416*12] ; b5: the single row kept at 416*12
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ vpshldd m4, m22, 13 ; (src << 13) with the top 13 bits of m22 shifted in below
+ psubd m1, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m0, m3 ; b3 - a3 * src + (1 << 8)
+ pslld m0, 7
+ vpalignr m0{k2}, m1, m1, 1 ; low word of each dword <- m1 rotated right by one byte; upper word stays from m0
+ vpdpwssd m4, m0, m7 ; add the sum of both words times the two weights in m7
+ psrad m4, 7
+ pmaxsd m4, m6 ; negative results become 0
+ vpmovusdw ym16, m4 ; clip
+ psrlw ym16, 6
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq ; advance dst to the next row
+ ret
+
+%endif ; ARCH_X86_64