From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- third_party/dav1d/src/x86/looprestoration_avx2.asm | 2237 ++++++++++++++++++++ 1 file changed, 2237 insertions(+) create mode 100644 third_party/dav1d/src/x86/looprestoration_avx2.asm (limited to 'third_party/dav1d/src/x86/looprestoration_avx2.asm') diff --git a/third_party/dav1d/src/x86/looprestoration_avx2.asm b/third_party/dav1d/src/x86/looprestoration_avx2.asm new file mode 100644 index 0000000000..a73cb21882 --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration_avx2.asm @@ -0,0 +1,2237 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 +wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +sgr_r_ext: times 16 db 1 + times 16 db 9 + +; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of +; cache but eliminates some shifts in the inner sgr loop which is overall a win +const sgr_x_by_x_avx2 + dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 + dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8 + dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5 + dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3 + dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 + dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 + + times 4 db -1 ; needed for 16-bit sgr +pb_m5: times 4 db -5 +pb_3: times 4 db 3 +pw_5_6: dw 5, 6 + +sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 + db 9, -1, 10, -1, 11, -1, 12, -1 + +pw_256: times 2 dw 256 +pw_2056: times 2 dw 2056 +pw_m16380: times 2 dw -16380 +pd_25: dd 25 +pd_34816: dd 34816 +pd_m4096: dd -4096 +pd_0xf00801c7: dd 0xf00801c7 +pd_0xf00800a4: dd 0xf00800a4 + +SECTION .text + +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers + +INIT_YMM avx2 +cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov wd, wm + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastb m11, [fltq+ 0] ; x0 x0 + vbroadcasti128 m7, [wiener_shufB] + vpbroadcastd m12, [fltq+ 2] + vbroadcasti128 m8, [wiener_shufC] + packsswb m12, m12 ; x1 x2 + vpbroadcastw m13, [fltq+ 6] ; x3 + vbroadcasti128 m9, [sgr_shuf+6] + add lpfq, wq + vpbroadcastd m10, [pw_m16380] + vpbroadcastd m14, [fltq+16] ; y0 y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + lea t1, [rsp+wq*2+16] + psllw m14, 5 + neg wq + psllw m15, 5 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov 
[rsp], r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.extend_right: + movd xm2, r10d + vpbroadcastd m0, [pb_3] + vpbroadcastd m1, [pb_m5] + vpbroadcastb m2, xm2 + movu m3, [pb_0to31] + psubb m0, m2 + psubb m1, m2 + pminub m0, m3 + pminub m1, m3 + pshufb m4, m0 + pshufb m5, m1 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + palignr m4, m5, 12 + pshufb m4, [wiener_l_shuf] + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10-4] +.h_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m11 + pshufb m1, m5, m6 + pmaddubsw m1, m11 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + paddw m0, m2 + pshufb m2, m4, m8 + pmaddubsw m2, m12 + paddw m1, m3 + pshufb m3, m5, m8 + pmaddubsw m3, m12 + pshufb m4, m9 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m9 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m10 + paddw m5, m10 + paddw m0, m2 + vpbroadcastd m2, [pw_2056] + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m2 + paddw m1, m2 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+32], m1 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .hv_main +.hv_extend_left: + movu m4, [lpfq+r10-4] + pshufb m4, [wiener_l_shuf] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10-4] +.hv_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m11 + pshufb m1, m5, m6 + pmaddubsw m1, m11 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + paddw m0, m2 + pshufb m2, m4, m8 + pmaddubsw m2, m12 + paddw m1, m3 + pshufb m3, m5, m8 + pmaddubsw m3, m12 + pshufb m4, m9 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m9 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m10 + paddw m5, m10 + paddw m0, m2 + paddw m1, m3 + mova m2, [t4+r10*2] + paddw m2, [t2+r10*2] + mova m3, [t3+r10*2] + paddsw m0, m4 + vpbroadcastd m4, [pw_2056] + paddsw m1, m5 + mova m5, [t5+r10*2] + paddw m5, [t1+r10*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m4 + paddw m1, m4 + paddw m4, m0, [t6+r10*2] + mova [t0+r10*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m5 + pmaddwd m3, m14 + punpckhwd m4, m5 + pmaddwd m4, m14 + paddd m0, m3 + paddd m4, m2 + mova m2, [t4+r10*2+32] + paddw m2, [t2+r10*2+32] + mova m3, [t3+r10*2+32] + mova m5, [t5+r10*2+32] + paddw m5, [t1+r10*2+32] + packuswb m0, m4 + paddw m4, m1, 
[t6+r10*2+32] + mova [t0+r10*2+32], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m5 + pmaddwd m3, m14 + punpckhwd m4, m5 + pmaddwd m4, m14 + paddd m1, m3 + paddd m2, m4 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m2, [t4+r10*2+ 0] + paddw m2, [t2+r10*2+ 0] + mova m4, [t3+r10*2+ 0] + mova m6, [t1+r10*2+ 0] + paddw m8, m6, [t6+r10*2+ 0] + paddw m6, [t5+r10*2+ 0] + mova m3, [t4+r10*2+32] + paddw m3, [t2+r10*2+32] + mova m5, [t3+r10*2+32] + mova m7, [t1+r10*2+32] + paddw m9, m7, [t6+r10*2+32] + paddw m7, [t5+r10*2+32] + punpcklwd m0, m2, m4 + pmaddwd m0, m15 + punpckhwd m2, m4 + pmaddwd m2, m15 + punpcklwd m4, m8, m6 + pmaddwd m4, m14 + punpckhwd m6, m8, m6 + pmaddwd m6, m14 + punpcklwd m1, m3, m5 + pmaddwd m1, m15 + punpckhwd m3, m5 + pmaddwd m3, m15 + punpcklwd m5, m9, m7 + pmaddwd m5, m14 + punpckhwd m7, m9, m7 + pmaddwd m7, m14 + paddd m0, m4 + paddd m2, m6 + paddd m1, m5 + paddd m3, m7 + packuswb m0, m2 + packuswb m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret + +cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov wd, wm + vbroadcasti128 m6, [wiener_shufB] + vpbroadcastd m12, [fltq+ 2] + vbroadcasti128 m7, [wiener_shufC] + packsswb m12, m12 ; x1 x2 + vpbroadcastw m13, [fltq+ 6] ; x3 + vbroadcasti128 m8, [sgr_shuf+6] + add lpfq, wq + vpbroadcastd m9, [pw_m16380] + vpbroadcastd m10, [pw_2056] + mova m11, [wiener_l_shuf] + vpbroadcastd m14, [fltq+16] ; __ y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + lea t1, [rsp+wq*2+16] + psllw m14, 5 + neg wq + psllw m15, 5 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.end: + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq +.v1: + call .v + jmp .end +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + palignr m4, m5, 12 + pshufb m4, m11 + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10-4] +.h_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -33 + jl 
.h_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right +.h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 + pshufb m1, m5, m6 + pmaddubsw m1, m12 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + pshufb m4, m8 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m8 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m9 + paddw m5, m9 + paddw m0, m2 + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m10 + paddw m1, m10 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+32], m1 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .hv_main +.hv_extend_left: + movu m4, [lpfq+r10-4] + pshufb m4, m11 + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10-4] +.hv_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right +.hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 + pshufb m1, m5, m6 + pmaddubsw m1, m12 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + pshufb m4, m8 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m8 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m9 + paddw m5, m9 + paddw m0, m2 + paddw m1, m3 + mova m2, [t3+r10*2] + paddw m2, [t1+r10*2] + mova m3, [t2+r10*2] + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m10 + paddw m1, m10 + paddw m4, m0, [t4+r10*2] + mova [t0+r10*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m4 + pmaddwd m3, m14 + punpckhwd m4, m4 + pmaddwd m4, m14 + paddd m0, m3 + paddd m4, m2 + mova m2, [t3+r10*2+32] + paddw m2, [t1+r10*2+32] + mova m3, [t2+r10*2+32] + packuswb m0, m4 + paddw m4, m1, [t4+r10*2+32] + mova [t0+r10*2+32], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m4 + pmaddwd m3, m14 + punpckhwd m4, m4 + pmaddwd m4, m14 + paddd m1, m3 + paddd m2, m4 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, strideq + ret +.v: + mov r10, wq + psrld m13, m14, 16 ; y1 __ +.v_loop: + mova m6, [t1+r10*2+ 0] + paddw m2, m6, [t3+r10*2+ 0] + mova m4, [t2+r10*2+ 0] + mova m7, [t1+r10*2+32] + paddw m3, m7, [t3+r10*2+32] + mova m5, [t2+r10*2+32] + paddw m6, [t4+r10*2+ 0] + paddw m7, [t4+r10*2+32] + punpcklwd m0, m2, m4 + pmaddwd m0, m15 + punpckhwd m2, m4 + pmaddwd m2, m15 + punpcklwd m1, m3, m5 + pmaddwd m1, m15 + punpckhwd m3, m5 + pmaddwd m3, m15 + punpcklwd m5, m7, m6 + pmaddwd m4, m5, m14 + punpckhwd m7, m6 + pmaddwd m6, m7, m14 + pmaddwd m5, m13 + pmaddwd m7, m13 + paddd m0, m4 + paddd m2, m6 + paddd m1, m5 + paddd m3, m7 + packuswb m0, m2 + packuswb m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + ret + +cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \ + w, h, edge, params +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti128 m8, [base+sgr_shuf+0] + 
vbroadcasti128 m9, [base+sgr_shuf+8] + add lpfq, wq + vbroadcasti128 m10, [base+sgr_shuf+2] + add dstq, wq + vbroadcasti128 m11, [base+sgr_shuf+6] + lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m12, [paramsq+0] ; s0 + pxor m6, m6 + vpbroadcastw m7, [paramsq+8] ; w0 + lea t1, [rsp+wq*2+20] + vpbroadcastd m13, [base+pd_0xf00800a4] + neg wq + vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) + psllw m7, 4 + vpbroadcastd m15, [base+pd_m4096] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + test hd, hd + jz .odd_height + call .h + add lpfq, strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .h_top + add lpfq, strideq + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+400*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + movd xm2, r10d + mova m0, [sgr_r_ext] + vpbroadcastb m2, xm2 + psubb m0, m2 + pminub m0, [pb_0to31] + pshufb m5, m0 + ret +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m3, m5, m8 + pmullw m4, m3, m3 + pshufb m2, m5, m9 + paddw m0, m3, m2 + shufps m3, m2, q2121 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + punpcklwd m3, m4, m6 + paddd m1, m3 + punpckhwd m4, m6 + paddd m2, m4 + pshufb m4, m5, m10 + paddw m0, m4 + pshufb m5, m11 + paddw m0, m5 ; sum + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + punpckhwd m4, m5 + pmaddwd m4, m4 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10*2+400*0] + paddd m1, [t1+r10*2+400*2] + paddd m2, [t1+r10*2+400*4] +.h_loop_end: + paddd m1, m3 ; sumsq + paddd m2, m4 + mova [t1+r10*2+400*0], m0 + mova [t1+r10*2+400*2], m1 + mova [t1+r10*2+400*4], m2 + add r10, 16 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-2] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10*2+400*0] + mova m1, [t1+r10*2+400*2] + mova m2, [t1+r10*2+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10*2+400*0], m0 + mova [t2+r10*2+400*2], m1 + mova [t2+r10*2+400*4], m2 + add r10, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + 
vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv_main +.hv_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu xm5, [lpfq+r10-2] +.hv_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m1, m5, m8 + pmullw m4, m1, m1 + pshufb m3, m5, m9 + paddw m0, m1, m3 + shufps m1, m3, q2121 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + punpcklwd m1, m4, m6 + paddd m2, m1 + punpckhwd m4, m6 + paddd m3, m4 + pshufb m1, m5, m10 + paddw m0, m1 + pshufb m5, m11 + paddw m0, m5 ; h sum + punpcklwd m4, m5, m1 + pmaddwd m4, m4 + punpckhwd m5, m1 + pmaddwd m5, m5 + paddw m1, m0, [t1+r10*2+400*0] + paddd m2, m4 ; h sumsq + paddd m3, m5 + paddd m4, m2, [t1+r10*2+400*2] + paddd m5, m3, [t1+r10*2+400*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10*2+400*0] ; hv sum + paddd m4, [t2+r10*2+400*2] ; hv sumsq + paddd m5, [t2+r10*2+400*4] + mova [t0+r10*2+400*0], m0 + mova [t0+r10*2+400*2], m2 + mova [t0+r10*2+400*4], m3 + vpbroadcastd m2, [pd_25] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m4, m2 ; a * 25 + pmulld m5, m2 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m12 ; p * s + pmulld m5, m12 + pmaddwd m0, m13 ; b * 164 + pmaddwd m1, m13 + paddusw m4, m13 + paddusw m5, m13 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + pand m0, m15 + pand m1, m15 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 ; The neighbor calculations requires + vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b. + mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but + vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way. 
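+; Editor's note: a scalar sketch (not dav1d's reference C) of what the
+; vpgatherdd lookup and the a | (b << 12) packing above compute per pixel.
+; The helper name and the exact rounding of z are assumptions; the asm keeps
+; r12 = sgr_x_by_x_avx2 + 256*4 and indexes with min(z, 255) - 256, which is
+; the same table lookup, with the rounding folded into a saturating word add.
+;
+;   #include <stdint.h>
+;
+;   static inline uint32_t sgr_ab_pack(uint32_t sumsq, uint32_t sum,
+;                                      uint32_t s, const uint32_t *x_by_x)
+;   {
+;       uint32_t p = 25 * sumsq - sum * sum;     /* 9 * sumsq for the 3x3 box */
+;       uint32_t z = (p * s + (1 << 19)) >> 20;  /* approximate rounding      */
+;       uint32_t x = x_by_x[z < 255 ? z : 255];  /* the dword gather above    */
+;       uint32_t b = x * sum * 164 + (1 << 11) + (1 << 15); /* 455 for 3x3    */
+;       return (b & ~0xfffu) | x;                /* a | (b << 12)             */
+;   }
+;
+; Storing sgr_x_by_x_avx2 as dwords lets the gather index it directly, which
+; is the cache-vs-shifts trade-off noted next to the table definition above.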
+ add r10, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10*2+400*0], m1 + paddw m1, m0 + mova [t1+r10*2+400*2], m4 + paddd m4, m2 + mova [t1+r10*2+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m0, [t1+r10*2+400*0] + mova m2, [t1+r10*2+400*2] + mova m3, [t1+r10*2+400*4] + paddw m1, m0, [t2+r10*2+400*0] + paddd m4, m2, [t2+r10*2+400*2] + paddd m5, m3, [t2+r10*2+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + vpbroadcastd m2, [pd_25] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m4, m2 ; a * 25 + pmulld m5, m2 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m12 ; p * s + pmulld m5, m12 + pmaddwd m0, m13 ; b * 164 + pmaddwd m1, m13 + paddusw m4, m13 + paddusw m5, m13 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + pand m0, m15 + pand m1, m15 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+36] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+32] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+40] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 ; ab 565 + paddd m3, m1 + pandn m0, m15, m2 ; a + psrld m2, 12 ; b + pandn m1, m15, m3 + psrld m3, 12 + mova [t3+r10*4+400*4+ 0], m0 + mova [t3+r10*4+400*8+ 0], m2 + mova [t3+r10*4+400*4+32], m1 + mova [t3+r10*4+400*8+32], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+36] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+32] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+40] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 + paddd m3, m1 + pandn m0, m15, m2 + psrld m2, 12 + pandn m1, m15, m3 + psrld m3, 12 + paddd m4, m0, [t3+r10*4+400*4+ 0] ; a + paddd m5, m1, [t3+r10*4+400*4+32] + mova [t3+r10*4+400*4+ 0], m0 + mova [t3+r10*4+400*4+32], m1 + paddd m0, m2, [t3+r10*4+400*8+ 0] ; b + paddd m1, m3, [t3+r10*4+400*8+32] + mova [t3+r10*4+400*8+ 0], m2 + mova [t3+r10*4+400*8+32], m3 + pmovzxbd m2, [dstq+r10+0] + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2 ; a * src + pmaddwd m5, m3 + packssdw m2, m3 + psubd m0, m4 ; b - a * src + (1 << 8) + psubd m1, m5 + psrad m0, 9 + psrad m1, 9 + packssdw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + pmovzxbd m2, [dstq+r10+0] + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src + pmaddwd m5, m3, [t3+r10*4+400*4+32] + mova m0, [t3+r10*4+400*8+ 0] ; b + mova m1, [t3+r10*4+400*8+32] + packssdw m2, m3 + psubd m0, m4 ; b - a * src + (1 << 7) + psubd m1, m5 + psrad m0, 8 + psrad m1, 8 + packssdw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, 
q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \ + w, h, edge, params +%define base r14-sgr_x_by_x_avx2-256*4 + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + lea r14, [sgr_x_by_x_avx2+256*4] + vbroadcasti128 m8, [base+sgr_shuf+2] + add lpfq, wq + vbroadcasti128 m9, [base+sgr_shuf+4] + add dstq, wq + vbroadcasti128 m10, [base+sgr_shuf+6] + lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m11, [paramsq+ 4] ; s1 + pxor m6, m6 + vpbroadcastw m7, [paramsq+10] ; w1 + lea t1, [rsp+wq*2+20] + vpbroadcastd m12, [base+pd_0xf00801c7] + neg wq + vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) + psllw m7, 4 + vpbroadcastd m14, [base+pd_m4096] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + add t4, strideq + mov [rsp], t4 ; below + mov t0, t2 + call .hv +.main: + mov t5, t3 + add t3, 400*4 + dec hd + jz .height1 + add lpfq, strideq + call .hv + call .prep_n + dec hd + jz .extend_bottom +.main_loop: + add lpfq, strideq + call .hv + call .n + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv_bottom + call .n + add lpfq, strideq + call .hv_bottom +.end: + call .n + RET +.height1: + call .v + call .prep_n + mov t2, t1 + call .v + jmp .end +.extend_bottom: + call .v + call .n + mov t2, t1 + call .v + jmp .end +.no_top: + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + lea t4, [t4+strideq*2] + mov [rsp], t4 + call .h + lea t0, [t1+400*6] + mov t2, t1 + call .v + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -17 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.h_have_right: + pshufb m0, m5, m8 + pmullw m2, m0, m0 + pshufb m4, m5, m9 + paddw m0, m4 + pshufb m5, m10 + paddw m0, m5 ; sum + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + punpckhwd m4, m5 + pmaddwd m4, m4 + punpcklwd m1, m2, m6 + punpckhwd m2, m6 + mova [t1+r10*2+400*0], m0 + paddd m1, m3 ; sumsq + paddd m2, m4 + mova [t1+r10*2+400*2], m1 + mova [t1+r10*2+400*4], m2 + add r10, 16 + jl .h_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv_main +.hv_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu xm5, [lpfq+r10-2] +.hv_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -17 + jl .hv_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv_have_right: + pshufb m0, m5, m8 + pmullw m3, m0, m0 + pshufb m1, m5, m9 + paddw m0, m1 + pshufb m5, m10 + paddw m0, m5 ; h sum + punpcklwd m4, m5, m1 + pmaddwd m4, m4 + punpckhwd m5, m1 + pmaddwd m5, m5 + 
paddw m1, m0, [t2+r10*2+400*0] + paddw m1, [t1+r10*2+400*0] ; hv sum + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + paddd m4, m2 ; h sumsq + paddd m5, m3 + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddd m2, [t1+r10*2+400*2] ; hv sumsq + paddd m3, [t1+r10*2+400*4] + mova [t0+r10*2+400*0], m0 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + mova [t0+r10*2+400*2], m4 + pslld m4, m2, 3 + mova [t0+r10*2+400*4], m5 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + pmaddwd m2, m0, m0 ; b * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m11 ; p * s + pmulld m5, m11 + pmaddwd m0, m12 ; b * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r14+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r14+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m13 + pand m0, m14 + pand m1, m14 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m1, [t1+r10*2+400*0] + paddw m1, m1 + paddw m1, [t2+r10*2+400*0] ; hv sum + mova m2, [t1+r10*2+400*2] + mova m3, [t1+r10*2+400*4] + paddd m2, m2 + paddd m3, m3 + paddd m2, [t2+r10*2+400*2] ; hv sumsq + paddd m3, [t2+r10*2+400*4] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + pmaddwd m2, m0, m0 ; b * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m11 ; p * s + pmulld m5, m11 + pmaddwd m0, m12 ; b * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r14+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r14+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m13 + pand m0, m14 + pand m1, m14 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq + mov t4, t3 + add t3, 400*4 +.prep_n_loop: + mova m2, [t5+r10*4+0] + mova m3, [t4+r10*4+0] + paddd m2, [t5+r10*4+8] + paddd m3, [t4+r10*4+8] + paddd m0, m2, [t5+r10*4+4] + paddd m1, m3, [t4+r10*4+4] + pslld m0, 2 + paddd m1, m1 ; ab[ 0] 222 + psubd m0, m2 ; ab[-1] 343 + mova [t3+r10*4+400*4], m1 + paddd m1, m1 + mova [t5+r10*4], m0 + psubd m1, m3 ; ab[ 0] 343 + mova [t4+r10*4], m1 + add r10, 8 + jl .prep_n_loop + ret +; a+b are packed together in a single dword, but we can't do the +; full neighbor calculations before splitting them since we don't +; have sufficient precision. The solution is to do the calculations +; in two equal halves and split a and b before doing the final sum. 
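+; Editor's note: a scalar sketch of the two-halves trick described above
+; (illustrative only; the helper and variable names are not from dav1d).
+; Each packed value keeps a in the low 12 bits and b in the upper 20 bits,
+; so the weighted 3x3 neighbour sum - per the 343/222 labels, the rows above
+; and below contribute a (3,4,3)-weighted horizontal triple and the current
+; row a (2,2,2)-weighted triple counted in both halves - is built as
+; half0 = ab343[-1] + ab222[0] and half1 = ab343[+1] + ab222[0], and a and b
+; are only recombined after unpacking each half:
+;
+;   #include <stdint.h>
+;
+;   static inline void sgr3_unpack_halves(uint32_t half0, uint32_t half1,
+;                                         uint32_t *a_sum, uint32_t *b_sum)
+;   {
+;       *a_sum = (half0 & 0xfff) + (half1 & 0xfff); /* pandn with pd_m4096 */
+;       *b_sum = (half0 >> 12)   + (half1 >> 12);   /* psrld by 12         */
+;   }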
+ALIGN function_align +.n: ; neighbor + output + mov r10, wq +.n_loop: + mova m4, [t3+r10*4+ 0] + paddd m4, [t3+r10*4+ 8] + paddd m5, m4, [t3+r10*4+ 4] + paddd m5, m5 ; ab[+1] 222 + mova m2, [t3+r10*4+400*4+ 0] + paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 + mova m3, [t3+r10*4+400*4+32] + paddd m1, m3, [t5+r10*4+32] + mova [t3+r10*4+400*4+ 0], m5 + paddd m5, m5 + psubd m5, m4 ; ab[+1] 343 + mova [t5+r10*4+ 0], m5 + paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 + mova m4, [t3+r10*4+32] + paddd m4, [t3+r10*4+40] + paddd m5, m4, [t3+r10*4+36] + paddd m5, m5 + mova [t3+r10*4+400*4+32], m5 + paddd m5, m5 + psubd m5, m4 + mova [t5+r10*4+32], m5 + pandn m4, m14, m0 + psrld m0, 12 + paddd m3, m5 + pandn m5, m14, m2 + psrld m2, 12 + paddd m4, m5 ; a + pandn m5, m14, m1 + psrld m1, 12 + paddd m0, m2 ; b + (1 << 8) + pandn m2, m14, m3 + psrld m3, 12 + paddd m5, m2 + pmovzxbd m2, [dstq+r10+0] + paddd m1, m3 + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2 ; a * src + pmaddwd m5, m3 + packssdw m2, m3 + psubd m0, m4 ; b - a * src + (1 << 8) + psubd m1, m5 + psrad m0, 9 + psrad m1, 9 + packssdw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n_loop + mov r10, t5 + mov t5, t4 + mov t4, r10 + add dstq, strideq + ret + +cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \ + w, h, edge, params +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti128 m9, [base+sgr_shuf+0] + vbroadcasti128 m10, [base+sgr_shuf+8] + add lpfq, wq + vbroadcasti128 m11, [base+sgr_shuf+2] + vbroadcasti128 m12, [base+sgr_shuf+6] + add dstq, wq + vpbroadcastd m15, [paramsq+8] ; w0 w1 + lea t3, [rsp+wq*4+400*24+8] + vpbroadcastd m13, [paramsq+0] ; s0 + pxor m7, m7 + vpbroadcastd m14, [paramsq+4] ; s1 + lea t1, [rsp+wq*2+12] + neg wq + psllw m15, 2 ; to reuse existing pd_m4096 register for rounding + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+400*12] + lea r10, [wq-2] +.top_fixup_loop: + mova m0, [t1+r10*2+400* 0] + mova m1, [t1+r10*2+400* 2] + mova m2, [t1+r10*2+400* 4] + paddw m0, m0 + mova m3, [t1+r10*2+400* 6] + paddd m1, m1 + mova m4, [t1+r10*2+400* 8] + paddd m2, m2 + mova m5, [t1+r10*2+400*10] + mova [t2+r10*2+400* 0], m0 + mova [t2+r10*2+400* 2], m1 + mova [t2+r10*2+400* 4], m2 + mova [t2+r10*2+400* 6], m3 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + add r10, 16 + jl 
.top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsums + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.h_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 + paddw m8, m6, m4 + shufps m0, m6, m4, q2121 + pmullw m3, m0, m0 + pshufb m2, m5, m11 + paddw m0, m2 + pshufb m5, m12 + paddw m0, m5 ; sum3 + punpcklwd m1, m2, m5 + pmaddwd m1, m1 + punpckhwd m2, m5 + pmaddwd m2, m2 + punpcklwd m5, m6, m4 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + punpcklwd m4, m3, m7 + paddd m1, m4 ; sumsq3 + punpckhwd m3, m7 + paddd m2, m3 + mova [t1+r10*2+400* 6], m0 + mova [t1+r10*2+400* 8], m1 + mova [t1+r10*2+400*10], m2 + paddw m8, m0 ; sum5 + paddd m5, m1 ; sumsq5 + paddd m6, m2 + mova [t1+r10*2+400* 0], m8 + mova [t1+r10*2+400* 2], m5 + mova [t1+r10*2+400* 4], m6 + add r10, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv0_main +.hv0_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu xm5, [lpfq+r10-2] +.hv0_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -18 + jl .hv0_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv0_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 + paddw m8, m6, m4 + shufps m1, m6, m4, q2121 + pmullw m0, m1, m1 + pshufb m3, m5, m11 + paddw m1, m3 + pshufb m5, m12 + paddw m1, m5 ; sum3 + punpcklwd m2, m3, m5 + pmaddwd m2, m2 + punpckhwd m3, m5 + pmaddwd m3, m3 + punpcklwd m5, m6, m4 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + punpcklwd m4, m0, m7 + paddd m2, m4 ; sumsq3 + punpckhwd m0, m7 + paddd m3, m0 + paddw m8, m1 ; sum5 + paddd m5, m2 ; sumsq5 + paddd m6, m3 + mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row + mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd + mova [t3+r10*4+400*0+40], m6 + paddw m8, [t1+r10*2+400* 0] + paddd m5, [t1+r10*2+400* 2] + paddd m6, [t1+r10*2+400* 4] + mova [t1+r10*2+400* 0], m8 + mova [t1+r10*2+400* 2], m5 + mova [t1+r10*2+400* 4], m6 + paddw m0, m1, [t1+r10*2+400* 6] + paddd m4, m2, [t1+r10*2+400* 8] + paddd m5, m3, [t1+r10*2+400*10] + mova [t1+r10*2+400* 6], m1 + mova [t1+r10*2+400* 8], m2 + mova [t1+r10*2+400*10], m3 + paddw m1, m0, [t2+r10*2+400* 6] + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m0 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd 
m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + vpbroadcastd m4, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m5, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m4 + pand m0, m5 + pand m1, m5 + por m0, m2 ; a3 | (b3 << 12) + por m1, m3 + mova [t3+r10*4+400*4+ 8], xm0 + vextracti128 [t3+r10*4+400*4+40], m0, 1 + mova [t3+r10*4+400*4+24], xm1 + vextracti128 [t3+r10*4+400*4+56], m1, 1 + add r10, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv1_main +.hv1_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu xm5, [lpfq+r10-2] +.hv1_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -18 + jl .hv1_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv1_have_right: + pshufb m6, m5, m9 + pshufb m3, m5, m10 + paddw m8, m6, m3 + shufps m2, m6, m3, q2121 + pmullw m1, m2, m2 + pshufb m0, m5, m11 + paddw m2, m0 + pshufb m5, m12 + paddw m2, m5 ; sum3 + punpcklwd m4, m5, m0 + pmaddwd m4, m4 + punpckhwd m5, m0 + pmaddwd m5, m5 + punpcklwd m0, m6, m3 + pmaddwd m0, m0 + punpckhwd m6, m3 + pmaddwd m6, m6 + punpcklwd m3, m1, m7 + paddd m4, m3 ; sumsq3 + punpckhwd m1, m7 + paddd m5, m1 + paddw m1, m2, [t2+r10*2+400* 6] + mova [t2+r10*2+400* 6], m2 + paddw m8, m2 ; sum5 + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + paddd m4, m0 ; sumsq5 + paddd m5, m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m6, m2, 3 + pslld m7, m3, 3 + paddd m6, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m7, m3 + pmaddwd m3, m1, m1 + psubd m6, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m7, m3 + pmulld m6, m14 ; p3 * s1 + pmulld m7, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m6, m2 + paddusw m7, m2 + psrad m3, m6, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m6 + psrad m6, m7, 20 + vpgatherdd m3, [r12+m6*4], m7 + vpbroadcastd m6, [base+pd_34816] ; x3 + pmulld m0, m2 + vpbroadcastd m7, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m6 + pand m0, m7 + pand m7, m1 + por m0, m2 ; a3 | (b3 << 12) + por m7, m3 + paddw m1, m8, [t2+r10*2+400*0] + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, [t1+r10*2+400*4] + mova [t2+r10*2+400*0], m8 + mova [t2+r10*2+400*2], m4 + mova [t2+r10*2+400*4], m5 + mova [t3+r10*4+400*8+ 8], xm0 + vextracti128 [t3+r10*4+400*8+40], m0, 1 + mova [t3+r10*4+400*8+24], xm7 + vextracti128 [t3+r10*4+400*8+56], m7, 1 + vpbroadcastd m4, [base+pd_25] + pxor m7, m7 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmulld m2, m4 ; a5 * 25 + pmulld m3, m4 + pmaddwd m4, m0, m0 ; b5 * b5 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + vpbroadcastd m4, [base+pd_0xf00800a4] + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd 
m4, [r12+m5*4], m2 ; x5 + psrad m2, m3, 20 + vpgatherdd m5, [r12+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m6 + vpbroadcastd m6, [base+pd_m4096] + pand m0, m6 + pand m1, m6 + por m0, m4 ; a5 | (b5 << 12) + por m1, m5 + mova [t3+r10*4+400*0+ 8], xm0 + vextracti128 [t3+r10*4+400*0+40], m0, 1 + mova [t3+r10*4+400*0+24], xm1 + vextracti128 [t3+r10*4+400*0+56], m1, 1 + add r10, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-2] + vpbroadcastd m6, [base+pd_34816] + vpbroadcastd m8, [base+pd_m4096] +.v0_loop: + mova m0, [t1+r10*2+400* 6] + mova m4, [t1+r10*2+400* 8] + mova m5, [t1+r10*2+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10*2+400* 6] + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m0 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 ; x3 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m6 + pand m0, m8 + pand m1, m8 + por m0, m2 ; a3 | (b3 << 12) + por m1, m3 + mova m2, [t1+r10*2+400*0] + mova m3, [t1+r10*2+400*2] + mova m4, [t1+r10*2+400*4] + mova [t3+r10*4+400*8+ 8], m2 + mova [t3+r10*4+400*0+ 8], m3 + mova [t3+r10*4+400*0+40], m4 + paddw m2, m2 ; cc5 + paddd m3, m3 + paddd m4, m4 + mova [t1+r10*2+400*0], m2 + mova [t1+r10*2+400*2], m3 + mova [t1+r10*2+400*4], m4 + mova [t3+r10*4+400*4+ 8], xm0 + vextracti128 [t3+r10*4+400*4+40], m0, 1 + mova [t3+r10*4+400*4+24], xm1 + vextracti128 [t3+r10*4+400*4+56], m1, 1 + add r10, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-2] +.v1_loop: + mova m4, [t1+r10*2+400* 6] + mova m5, [t1+r10*2+400* 8] + mova m6, [t1+r10*2+400*10] + paddw m1, m4, [t2+r10*2+400* 6] + paddd m2, m5, [t2+r10*2+400* 8] + paddd m3, m6, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m4 + mova [t2+r10*2+400* 8], m5 + mova [t2+r10*2+400*10], m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 ; x3 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + vpbroadcastd m4, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m8, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m4 + pand m0, m8 + pand m8, m1 + por m0, m2 ; a3 | (b3 << 12) + por m8, m3 + mova m4, [t3+r10*4+400*8+ 8] + mova m5, [t3+r10*4+400*0+ 8] + mova m6, [t3+r10*4+400*0+40] + paddw m1, m4, [t2+r10*2+400*0] + paddd m2, m5, [t2+r10*2+400*2] + paddd m3, m6, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, [t1+r10*2+400*4] + mova [t2+r10*2+400*0], m4 + mova 
[t2+r10*2+400*2], m5 + mova [t2+r10*2+400*4], m6 + vpbroadcastd m4, [base+pd_25] + mova [t3+r10*4+400*8+ 8], xm0 + vextracti128 [t3+r10*4+400*8+40], m0, 1 + mova [t3+r10*4+400*8+24], xm8 + vextracti128 [t3+r10*4+400*8+56], m8, 1 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmulld m2, m4 ; a5 * 25 + pmulld m3, m4 + pmaddwd m4, m0, m0 ; b5 * b5 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + vpbroadcastd m4, [base+pd_0xf00800a4] + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r12+m5*4], m2 ; x5 + psrad m2, m3, 20 + vpgatherdd m5, [r12+m2*4], m3 + pmulld m0, m4 + vpbroadcastd m6, [base+pd_34816] + pmulld m1, m5 + paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m6 + vpbroadcastd m6, [base+pd_m4096] + pand m0, m6 + pand m1, m6 + por m0, m4 ; a5 | (b5 << 12) + por m1, m5 + mova [t3+r10*4+400*0+ 8], xm0 + vextracti128 [t3+r10*4+400*0+40], m0, 1 + mova [t3+r10*4+400*0+24], xm1 + vextracti128 [t3+r10*4+400*0+56], m1, 1 + add r10, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+400*0+4] + paddd m1, m0, [t3+r10*4+400*0+0] + mova m4, [t3+r10*4+400*4+0] + paddd m1, [t3+r10*4+400*0+8] + mova m5, [t3+r10*4+400*8+0] + paddd m4, [t3+r10*4+400*4+8] + paddd m5, [t3+r10*4+400*8+8] + paddd m2, m4, [t3+r10*4+400*4+4] + paddd m3, m5, [t3+r10*4+400*8+4] + paddd m0, m1 + pslld m1, 2 + pslld m2, 2 + paddd m1, m0 ; ab5 565 + paddd m3, m3 ; ab3[ 0] 222 + psubd m2, m4 ; ab3[-1] 343 + mova [t3+r10*4+400*20], m3 + pandn m0, m6, m1 ; a5 565 + mova [t3+r10*4+400*24], m2 + psrld m1, 12 ; b5 565 + mova [t3+r10*4+400*12], m0 + paddd m3, m3 + mova [t3+r10*4+400*16], m1 + psubd m3, m5 ; ab3[ 0] 343 + mova [t3+r10*4+400*28], m3 + add r10, 8 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t3+r10*4+4] + paddd m4, m0, [t3+r10*4+0] + paddd m4, [t3+r10*4+8] + paddd m0, m4 + pslld m4, 2 + paddd m4, m0 + pandn m0, m6, m4 + psrld m4, 12 + paddd m2, m0, [t3+r10*4+400*12] ; a5 + mova [t3+r10*4+400*12], m0 + paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) + mova [t3+r10*4+400*16], m4 + mova m3, [t3+r10*4+400*4+0] + paddd m3, [t3+r10*4+400*4+8] + paddd m5, m3, [t3+r10*4+400*4+4] + paddd m5, m5 ; ab3[ 1] 222 + mova m4, [t3+r10*4+400*20] + paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+400*20], m5 + paddd m5, m5 + psubd m5, m3 ; ab3[ 1] 343 + mova [t3+r10*4+400*24], m5 + paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m3, m6, m1 + psrld m1, 12 + pandn m5, m6, m4 + psrld m4, 12 + paddd m3, m5 ; a3 + paddd m1, m4 ; b3 + (1 << 8) + pmovzxbd m4, [dstq+r10] + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) + psrld m0, 9 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + psubd m0, m6 + psrad m0, 13 + paddd m0, m4 + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + packuswb xm0, xm0 + movq [dstq+r10], xm0 + add r10, 8 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t3+r10*4+400*8+0] + paddd m3, [t3+r10*4+400*8+8] + paddd m5, m3, [t3+r10*4+400*8+4] + paddd m5, m5 ; ab3[ 1] 222 + mova m4, [t3+r10*4+400*20] + paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+400*20], m5 + paddd m5, m5 + 
psubd m5, m3 ; ab3[ 1] 343 + mova [t3+r10*4+400*28], m5 + paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m3, m6, m1 + psrld m1, 12 + pandn m5, m6, m4 + psrld m4, 12 + paddd m3, m5 ; -a3 + paddd m1, m4 ; b3 + (1 << 8) + pmovzxbd m4, [dstq+r10] + pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src + mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) + pmaddwd m3, m4 ; -a3 * src + psubd m0, m2 ; a5 * src + b5 + (1 << 7) + psubd m1, m3 ; a3 * src + b3 + (1 << 8) + psrld m0, 8 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + psubd m0, m6 + psrad m0, 13 + paddd m0, m4 + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + packuswb xm0, xm0 + movq [dstq+r10], xm0 + add r10, 8 + jl .n1_loop + add dstq, strideq + ret + +%endif ; ARCH_X86_64 -- cgit v1.2.3