; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_r_ext:     times 16 db 1
               times 16 db 9

; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of
; cache but eliminates some shifts in the inner sgr loop which is overall a win
const sgr_x_by_x_avx2
               dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
               dd  15, 14, 13, 13, 12, 12, 11, 11, 10, 10,  9,  9,  9,  9,  8,  8
               dd   8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5
               dd   5,  5,  5,  5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4
               dd   4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3
               dd   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3
               dd   3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
               dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
               dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
               dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
               dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1
               dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
               dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
               dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
               dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
               dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0
               times 4 db -1 ; needed for 16-bit sgr
pb_m5:         times 4 db -5
pb_3:          times 4 db 3
pw_5_6:        dw 5, 6
sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
               db  9, -1, 10, -1, 11, -1, 12, -1
pw_256:        times 2 dw 256
pw_2056:       times 2 dw 2056
pw_m16380:     times 2 dw -16380
pd_25:         dd 25
pd_34816:      dd 34816
pd_m4096:      dd -4096
pd_0xf00801c7: dd 0xf00801c7
pd_0xf00800a4: dd 0xf00800a4

cextern pb_0to63

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers

INIT_YMM avx2
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt
    mov           fltq, r6mp
    movifnidn       hd, hm
    mov          edged, r7m
    mov             wd, wm
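; Overview (editor's note, hedged summary of the code that follows): the
; Wiener filter is separable.  fltq points at the int16 coefficient pair,
; with the horizontal taps x0..x3 loaded from fltq+0..6 and the vertical
; taps y0..y3 from fltq+16..22 below; t1..t6 form a ring buffer of
; 384-word rows on the stack holding horizontally filtered intermediates,
; which the vertical 7-tap pass then combines.  A loose scalar model of
; one horizontal tap sum, assuming a symmetric kernel built from x0..x3
; (illustrative pseudocode only, not the dav1d reference code; the actual
; rounding and offset handling lives in .h below):
;
;   // tap[] = { x0, x1, x2, x3 }, x3 being the center tap
;   int sum = tap[3] * src[x];
;   for (int i = 1; i <= 3; i++)
;       sum += (src[x - i] + src[x + i]) * tap[3 - i];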
vbroadcasti128 m6, [wiener_shufA] vpbroadcastb m11, [fltq+ 0] ; x0 x0 vbroadcasti128 m7, [wiener_shufB] vpbroadcastd m12, [fltq+ 2] vbroadcasti128 m8, [wiener_shufC] packsswb m12, m12 ; x1 x2 vpbroadcastw m13, [fltq+ 6] ; x3 vbroadcasti128 m9, [sgr_shuf+6] add lpfq, wq vpbroadcastd m10, [pw_m16380] vpbroadcastd m14, [fltq+16] ; y0 y1 add dstq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 lea t1, [rsp+wq*2+16] psllw m14, 5 neg wq psllw m15, 5 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 .main: lea t0, [t1+384*2] .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t6, t1 mov t5, t1 mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 lea t0, [t1+384*2] call .hv dec hd jz .v3 add t0, 384*8 call .hv dec hd jnz .main .v3: call .v .v2: call .v jmp .v1 .extend_right: movd xm2, r10d vpbroadcastd m0, [pb_3] vpbroadcastd m1, [pb_m5] vpbroadcastb m2, xm2 mova m3, [pb_0to63] psubb m0, m2 psubb m1, m2 pminub m0, m3 pminub m1, m3 pshufb m4, m0 pshufb m5, m1 ret .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm4, [leftq] vpblendd m4, [lpfq+r10-4], 0xfe add leftq, 4 jmp .h_main .h_extend_left: vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located mova m4, [lpfq+r10] ; before the start of the buffer palignr m4, m5, 12 pshufb m4, [wiener_l_shuf] jmp .h_main .h_top: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+r10-4] .h_main: movu m5, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -34 jl .h_have_right call .extend_right .h_have_right: pshufb m0, m4, m6 pmaddubsw m0, m11 pshufb m1, m5, m6 pmaddubsw m1, m11 pshufb m2, m4, m7 pmaddubsw m2, m12 pshufb m3, m5, m7 pmaddubsw m3, m12 paddw m0, m2 pshufb m2, m4, m8 pmaddubsw m2, m12 paddw m1, m3 pshufb m3, m5, m8 pmaddubsw m3, m12 pshufb m4, m9 paddw m0, m2 pmullw m2, m4, m13 pshufb m5, m9 paddw m1, m3 pmullw m3, m5, m13 psllw m4, 7 psllw m5, 7 paddw m4, m10 paddw m5, m10 paddw m0, m2 vpbroadcastd m2, [pw_2056] paddw m1, m3 paddsw m0, m4 paddsw m1, m5 psraw m0, 3 psraw m1, 3 paddw m0, m2 paddw m1, m2 mova [t1+r10*2+ 0], m0 mova [t1+r10*2+32], m1 add r10, 32 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm4, [leftq] vpblendd m4, [lpfq+r10-4], 0xfe add leftq, 4 jmp .hv_main .hv_extend_left: movu m4, [lpfq+r10-4] pshufb m4, [wiener_l_shuf] jmp .hv_main .hv_bottom: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m4, [lpfq+r10-4] .hv_main: movu m5, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -34 jl .hv_have_right call .extend_right .hv_have_right: pshufb m0, m4, m6 pmaddubsw m0, m11 pshufb m1, m5, m6 pmaddubsw m1, m11 pshufb m2, m4, m7 pmaddubsw m2, m12 pshufb m3, m5, m7 pmaddubsw m3, m12 paddw m0, m2 pshufb m2, m4, m8 pmaddubsw m2, m12 paddw m1, m3 pshufb m3, m5, m8 pmaddubsw m3, m12 pshufb m4, m9 
paddw m0, m2 pmullw m2, m4, m13 pshufb m5, m9 paddw m1, m3 pmullw m3, m5, m13 psllw m4, 7 psllw m5, 7 paddw m4, m10 paddw m5, m10 paddw m0, m2 paddw m1, m3 mova m2, [t4+r10*2] paddw m2, [t2+r10*2] mova m3, [t3+r10*2] paddsw m0, m4 vpbroadcastd m4, [pw_2056] paddsw m1, m5 mova m5, [t5+r10*2] paddw m5, [t1+r10*2] psraw m0, 3 psraw m1, 3 paddw m0, m4 paddw m1, m4 paddw m4, m0, [t6+r10*2] mova [t0+r10*2], m0 punpcklwd m0, m2, m3 pmaddwd m0, m15 punpckhwd m2, m3 pmaddwd m2, m15 punpcklwd m3, m4, m5 pmaddwd m3, m14 punpckhwd m4, m5 pmaddwd m4, m14 paddd m0, m3 paddd m4, m2 mova m2, [t4+r10*2+32] paddw m2, [t2+r10*2+32] mova m3, [t3+r10*2+32] mova m5, [t5+r10*2+32] paddw m5, [t1+r10*2+32] packuswb m0, m4 paddw m4, m1, [t6+r10*2+32] mova [t0+r10*2+32], m1 punpcklwd m1, m2, m3 pmaddwd m1, m15 punpckhwd m2, m3 pmaddwd m2, m15 punpcklwd m3, m4, m5 pmaddwd m3, m14 punpckhwd m4, m5 pmaddwd m4, m14 paddd m1, m3 paddd m2, m4 packuswb m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .hv_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 add dstq, strideq ret .v: mov r10, wq .v_loop: mova m2, [t4+r10*2+ 0] paddw m2, [t2+r10*2+ 0] mova m4, [t3+r10*2+ 0] mova m6, [t1+r10*2+ 0] paddw m8, m6, [t6+r10*2+ 0] paddw m6, [t5+r10*2+ 0] mova m3, [t4+r10*2+32] paddw m3, [t2+r10*2+32] mova m5, [t3+r10*2+32] mova m7, [t1+r10*2+32] paddw m9, m7, [t6+r10*2+32] paddw m7, [t5+r10*2+32] punpcklwd m0, m2, m4 pmaddwd m0, m15 punpckhwd m2, m4 pmaddwd m2, m15 punpcklwd m4, m8, m6 pmaddwd m4, m14 punpckhwd m6, m8, m6 pmaddwd m6, m14 punpcklwd m1, m3, m5 pmaddwd m1, m15 punpckhwd m3, m5 pmaddwd m3, m15 punpcklwd m5, m9, m7 pmaddwd m5, m14 punpckhwd m7, m9, m7 pmaddwd m7, m14 paddd m0, m4 paddd m2, m6 paddd m1, m5 paddd m3, m7 packuswb m0, m2 packuswb m1, m3 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .v_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq ret cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ w, h, edge, flt mov fltq, r6mp movifnidn hd, hm mov edged, r7m mov wd, wm vbroadcasti128 m6, [wiener_shufB] vpbroadcastd m12, [fltq+ 2] vbroadcasti128 m7, [wiener_shufC] packsswb m12, m12 ; x1 x2 vpbroadcastw m13, [fltq+ 6] ; x3 vbroadcasti128 m8, [sgr_shuf+6] add lpfq, wq vpbroadcastd m9, [pw_m16380] vpbroadcastd m10, [pw_2056] mova m11, [wiener_l_shuf] vpbroadcastd m14, [fltq+16] ; __ y1 add dstq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 lea t1, [rsp+wq*2+16] psllw m14, 5 neg wq psllw m15, 5 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below call .h mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 .main: mov t0, t4 .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .end: RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 lea t0, [t1+384*2] call .hv dec hd jz .v2 add t0, 384*6 call .hv dec hd jnz .main .v2: call .v mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq .v1: call .v jmp .end .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm4, [leftq] vpblendd m4, [lpfq+r10-4], 0xfe add leftq, 4 jmp .h_main .h_extend_left: vbroadcasti128 
m5, [lpfq+r10] ; avoid accessing memory located mova m4, [lpfq+r10] ; before the start of the buffer palignr m4, m5, 12 pshufb m4, m11 jmp .h_main .h_top: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+r10-4] .h_main: movu m5, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -33 jl .h_have_right call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right .h_have_right: pshufb m0, m4, m6 pmaddubsw m0, m12 pshufb m1, m5, m6 pmaddubsw m1, m12 pshufb m2, m4, m7 pmaddubsw m2, m12 pshufb m3, m5, m7 pmaddubsw m3, m12 pshufb m4, m8 paddw m0, m2 pmullw m2, m4, m13 pshufb m5, m8 paddw m1, m3 pmullw m3, m5, m13 psllw m4, 7 psllw m5, 7 paddw m4, m9 paddw m5, m9 paddw m0, m2 paddw m1, m3 paddsw m0, m4 paddsw m1, m5 psraw m0, 3 psraw m1, 3 paddw m0, m10 paddw m1, m10 mova [t1+r10*2+ 0], m0 mova [t1+r10*2+32], m1 add r10, 32 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm4, [leftq] vpblendd m4, [lpfq+r10-4], 0xfe add leftq, 4 jmp .hv_main .hv_extend_left: movu m4, [lpfq+r10-4] pshufb m4, m11 jmp .hv_main .hv_bottom: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m4, [lpfq+r10-4] .hv_main: movu m5, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -33 jl .hv_have_right call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right .hv_have_right: pshufb m0, m4, m6 pmaddubsw m0, m12 pshufb m1, m5, m6 pmaddubsw m1, m12 pshufb m2, m4, m7 pmaddubsw m2, m12 pshufb m3, m5, m7 pmaddubsw m3, m12 pshufb m4, m8 paddw m0, m2 pmullw m2, m4, m13 pshufb m5, m8 paddw m1, m3 pmullw m3, m5, m13 psllw m4, 7 psllw m5, 7 paddw m4, m9 paddw m5, m9 paddw m0, m2 paddw m1, m3 mova m2, [t3+r10*2] paddw m2, [t1+r10*2] mova m3, [t2+r10*2] paddsw m0, m4 paddsw m1, m5 psraw m0, 3 psraw m1, 3 paddw m0, m10 paddw m1, m10 paddw m4, m0, [t4+r10*2] mova [t0+r10*2], m0 punpcklwd m0, m2, m3 pmaddwd m0, m15 punpckhwd m2, m3 pmaddwd m2, m15 punpcklwd m3, m4, m4 pmaddwd m3, m14 punpckhwd m4, m4 pmaddwd m4, m14 paddd m0, m3 paddd m4, m2 mova m2, [t3+r10*2+32] paddw m2, [t1+r10*2+32] mova m3, [t2+r10*2+32] packuswb m0, m4 paddw m4, m1, [t4+r10*2+32] mova [t0+r10*2+32], m1 punpcklwd m1, m2, m3 pmaddwd m1, m15 punpckhwd m2, m3 pmaddwd m2, m15 punpcklwd m3, m4, m4 pmaddwd m3, m14 punpckhwd m4, m4 pmaddwd m4, m14 paddd m1, m3 paddd m2, m4 packuswb m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .hv_loop mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t4 add dstq, strideq ret .v: mov r10, wq psrld m13, m14, 16 ; y1 __ .v_loop: mova m6, [t1+r10*2+ 0] paddw m2, m6, [t3+r10*2+ 0] mova m4, [t2+r10*2+ 0] mova m7, [t1+r10*2+32] paddw m3, m7, [t3+r10*2+32] mova m5, [t2+r10*2+32] paddw m6, [t4+r10*2+ 0] paddw m7, [t4+r10*2+32] punpcklwd m0, m2, m4 pmaddwd m0, m15 punpckhwd m2, m4 pmaddwd m2, m15 punpcklwd m1, m3, m5 pmaddwd m1, m15 punpckhwd m3, m5 pmaddwd m3, m15 punpcklwd m5, m7, m6 pmaddwd m4, m5, m14 punpckhwd m7, m6 pmaddwd m6, m7, m14 pmaddwd m5, m13 pmaddwd m7, m13 paddd m0, m4 paddd m2, m6 paddd m1, m5 paddd m3, m7 packuswb m0, m2 packuswb m1, m3 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .v_loop ret cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \ w, h, edge, params %define base r12-sgr_x_by_x_avx2-256*4 lea r12, [sgr_x_by_x_avx2+256*4] mov paramsq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m vbroadcasti128 m8, [base+sgr_shuf+0] vbroadcasti128 m9, 
[base+sgr_shuf+8] add lpfq, wq vbroadcasti128 m10, [base+sgr_shuf+2] add dstq, wq vbroadcasti128 m11, [base+sgr_shuf+6] lea t3, [rsp+wq*4+16+400*12] vpbroadcastd m12, [paramsq+0] ; s0 pxor m6, m6 vpbroadcastw m7, [paramsq+8] ; w0 lea t1, [rsp+wq*2+20] vpbroadcastd m13, [base+pd_0xf00800a4] neg wq vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) psllw m7, 4 vpbroadcastd m15, [base+pd_m4096] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call .top_fixup add t1, 400*6 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: add lpfq, strideq call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq test hd, hd jz .odd_height call .h add lpfq, strideq call .hv call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .h_top add lpfq, strideq call .hv_bottom .end: call .n0 call .n1 .end2: RET .height1: call .hv call .prep_n jmp .odd_height_end .odd_height: call .hv call .n0 call .n1 .odd_height_end: call .v call .n0 jmp .end2 .extend_bottom: call .v jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+400*6] call .top_fixup dec hd jz .no_top_height1 or edged, 16 mov t0, t1 mov t1, t2 jmp .main .no_top_height1: call .v call .prep_n jmp .odd_height_end .extend_right: movd xm2, r10d mova m0, [sgr_r_ext] vpbroadcastb m2, xm2 psubb m0, m2 pminub m0, [pb_0to63] pshufb m5, m0 ret .h: ; horizontal boxsum lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .h_main .h_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu xm5, [lpfq+r10-2] .h_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -18 jl .h_have_right call .extend_right .h_have_right: pshufb m3, m5, m8 pmullw m4, m3, m3 pshufb m2, m5, m9 paddw m0, m3, m2 shufps m3, m2, q2121 paddw m0, m3 punpcklwd m1, m2, m3 pmaddwd m1, m1 punpckhwd m2, m3 pmaddwd m2, m2 punpcklwd m3, m4, m6 paddd m1, m3 punpckhwd m4, m6 paddd m2, m4 pshufb m4, m5, m10 paddw m0, m4 pshufb m5, m11 paddw m0, m5 ; sum punpcklwd m3, m4, m5 pmaddwd m3, m3 punpckhwd m4, m5 pmaddwd m4, m4 test edgeb, 16 ; y > 0 jz .h_loop_end paddw m0, [t1+r10*2+400*0] paddd m1, [t1+r10*2+400*2] paddd m2, [t1+r10*2+400*4] .h_loop_end: paddd m1, m3 ; sumsq paddd m2, m4 mova [t1+r10*2+400*0], m0 mova [t1+r10*2+400*2], m1 mova [t1+r10*2+400*4], m2 add r10, 16 jl .h_loop ret .top_fixup: lea r10, [wq-2] .top_fixup_loop: ; the sums of the first row needs to be doubled mova m0, [t1+r10*2+400*0] mova m1, [t1+r10*2+400*2] mova m2, [t1+r10*2+400*4] paddw m0, m0 paddd m1, m1 paddd m2, m2 mova [t2+r10*2+400*0], m0 mova [t2+r10*2+400*2], m1 mova [t2+r10*2+400*4], m2 add r10, 16 jl .top_fixup_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .hv_main .hv_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .hv_main .hv_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu xm5, [lpfq+r10-2] .hv_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right 
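; Right-edge handling (editor's note): when LR_HAVE_RIGHT is not set, the
; row load above may include bytes past the last valid pixel once r10
; crosses the threshold checked just below, so .extend_right is called.
; That helper (defined earlier in this function) builds a byte-shuffle
; mask from sgr_r_ext and pb_0to63 so that every lane beyond the end of
; the row is replaced with the last valid pixel, i.e. the edge is padded
; by replication entirely in registers rather than by writing to memory.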
cmp r10d, -18 jl .hv_have_right call .extend_right .hv_have_right: pshufb m1, m5, m8 pmullw m4, m1, m1 pshufb m3, m5, m9 paddw m0, m1, m3 shufps m1, m3, q2121 paddw m0, m1 punpcklwd m2, m3, m1 pmaddwd m2, m2 punpckhwd m3, m1 pmaddwd m3, m3 punpcklwd m1, m4, m6 paddd m2, m1 punpckhwd m4, m6 paddd m3, m4 pshufb m1, m5, m10 paddw m0, m1 pshufb m5, m11 paddw m0, m5 ; h sum punpcklwd m4, m5, m1 pmaddwd m4, m4 punpckhwd m5, m1 pmaddwd m5, m5 paddw m1, m0, [t1+r10*2+400*0] paddd m2, m4 ; h sumsq paddd m3, m5 paddd m4, m2, [t1+r10*2+400*2] paddd m5, m3, [t1+r10*2+400*4] test hd, hd jz .hv_last_row .hv_main2: paddw m1, [t2+r10*2+400*0] ; hv sum paddd m4, [t2+r10*2+400*2] ; hv sumsq paddd m5, [t2+r10*2+400*4] mova [t0+r10*2+400*0], m0 mova [t0+r10*2+400*2], m2 mova [t0+r10*2+400*4], m3 vpbroadcastd m2, [pd_25] punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmulld m4, m2 ; a * 25 pmulld m5, m2 pmaddwd m2, m0, m0 ; b * b pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m12 ; p * s pmulld m5, m12 pmaddwd m0, m13 ; b * 164 pmaddwd m1, m13 paddusw m4, m13 paddusw m5, m13 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m14 pand m0, m15 pand m1, m15 por m0, m2 ; a | (b << 12) por m1, m3 mova [t3+r10*4+ 8], xm0 ; The neighbor calculations requires vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b. mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way. add r10, 16 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 ret .hv_last_row: ; esoteric edge case for odd heights mova [t1+r10*2+400*0], m1 paddw m1, m0 mova [t1+r10*2+400*2], m4 paddd m4, m2 mova [t1+r10*2+400*4], m5 paddd m5, m3 jmp .hv_main2 .v: ; vertical boxsum + ab lea r10, [wq-2] .v_loop: mova m0, [t1+r10*2+400*0] mova m2, [t1+r10*2+400*2] mova m3, [t1+r10*2+400*4] paddw m1, m0, [t2+r10*2+400*0] paddd m4, m2, [t2+r10*2+400*2] paddd m5, m3, [t2+r10*2+400*4] paddw m0, m0 paddd m2, m2 paddd m3, m3 paddw m1, m0 ; hv sum paddd m4, m2 ; hv sumsq paddd m5, m3 vpbroadcastd m2, [pd_25] punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmulld m4, m2 ; a * 25 pmulld m5, m2 pmaddwd m2, m0, m0 ; b * b pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m12 ; p * s pmulld m5, m12 pmaddwd m0, m13 ; b * 164 pmaddwd m1, m13 paddusw m4, m13 paddusw m5, m13 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m14 pand m0, m15 pand m1, m15 por m0, m2 ; a | (b << 12) por m1, m3 mova [t3+r10*4+ 8], xm0 vextracti128 [t3+r10*4+40], m0, 1 mova [t3+r10*4+24], xm1 vextracti128 [t3+r10*4+56], m1, 1 add r10, 16 jl .v_loop ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu m0, [t3+r10*4+ 4] movu m1, [t3+r10*4+36] paddd m2, m0, [t3+r10*4+ 0] paddd m3, m1, [t3+r10*4+32] paddd m2, [t3+r10*4+ 8] paddd m3, [t3+r10*4+40] paddd m0, m2 pslld m2, 2 paddd m1, m3 pslld m3, 2 paddd m2, m0 ; ab 565 paddd m3, m1 pandn m0, m15, m2 ; a psrld m2, 12 ; b pandn m1, m15, m3 psrld m3, 12 mova [t3+r10*4+400*4+ 0], m0 mova [t3+r10*4+400*8+ 0], m2 mova [t3+r10*4+400*4+32], m1 mova [t3+r10*4+400*8+32], m3 add r10, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu m0, [t3+r10*4+ 4] movu m1, [t3+r10*4+36] paddd m2, m0, [t3+r10*4+ 0] 
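; .n0/.n1 (editor's note): the packed a|b values are only computed on
; every other source row, and the "565" weighting from .prep_n is a
; horizontal 5:6:5 mix of each sample with its left and right neighbors.
; Even output rows (this routine) add the freshly computed row of 565
; sums to the stored previous one and use a 9-bit final shift, while odd
; rows (.n1) reuse the stored row alone with an 8-bit shift, matching the
; (1 << 8) and (1 << 7) rounding noted in the comments.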
paddd m3, m1, [t3+r10*4+32] paddd m2, [t3+r10*4+ 8] paddd m3, [t3+r10*4+40] paddd m0, m2 pslld m2, 2 paddd m1, m3 pslld m3, 2 paddd m2, m0 paddd m3, m1 pandn m0, m15, m2 psrld m2, 12 pandn m1, m15, m3 psrld m3, 12 paddd m4, m0, [t3+r10*4+400*4+ 0] ; a paddd m5, m1, [t3+r10*4+400*4+32] mova [t3+r10*4+400*4+ 0], m0 mova [t3+r10*4+400*4+32], m1 paddd m0, m2, [t3+r10*4+400*8+ 0] ; b paddd m1, m3, [t3+r10*4+400*8+32] mova [t3+r10*4+400*8+ 0], m2 mova [t3+r10*4+400*8+32], m3 pmovzxbd m2, [dstq+r10+0] pmovzxbd m3, [dstq+r10+8] pmaddwd m4, m2 ; a * src pmaddwd m5, m3 packssdw m2, m3 psubd m0, m4 ; b - a * src + (1 << 8) psubd m1, m5 psrad m0, 9 psrad m1, 9 packssdw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 pshufd xm0, xm0, q3120 mova [dstq+r10], xm0 add r10, 16 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: pmovzxbd m2, [dstq+r10+0] pmovzxbd m3, [dstq+r10+8] pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src pmaddwd m5, m3, [t3+r10*4+400*4+32] mova m0, [t3+r10*4+400*8+ 0] ; b mova m1, [t3+r10*4+400*8+32] packssdw m2, m3 psubd m0, m4 ; b - a * src + (1 << 7) psubd m1, m5 psrad m0, 8 psrad m1, 8 packssdw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 pshufd xm0, xm0, q3120 mova [dstq+r10], xm0 add r10, 16 jl .n1_loop add dstq, strideq ret cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \ w, h, edge, params %define base r14-sgr_x_by_x_avx2-256*4 mov paramsq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m lea r14, [sgr_x_by_x_avx2+256*4] vbroadcasti128 m8, [base+sgr_shuf+2] add lpfq, wq vbroadcasti128 m9, [base+sgr_shuf+4] add dstq, wq vbroadcasti128 m10, [base+sgr_shuf+6] lea t3, [rsp+wq*4+16+400*12] vpbroadcastd m11, [paramsq+ 4] ; s1 pxor m6, m6 vpbroadcastw m7, [paramsq+10] ; w1 lea t1, [rsp+wq*2+20] vpbroadcastd m12, [base+pd_0xf00801c7] neg wq vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) psllw m7, 4 vpbroadcastd m14, [base+pd_m4096] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 add t1, 400*6 call .h_top lea t4, [lpfq+strideq*4] mov lpfq, dstq add t4, strideq mov [rsp], t4 ; below mov t0, t2 call .hv .main: mov t5, t3 add t3, 400*4 dec hd jz .height1 add lpfq, strideq call .hv call .prep_n dec hd jz .extend_bottom .main_loop: add lpfq, strideq call .hv call .n dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv_bottom call .n add lpfq, strideq call .hv_bottom .end: call .n RET .height1: call .v call .prep_n mov t2, t1 call .v jmp .end .extend_bottom: call .v call .n mov t2, t1 call .v jmp .end .no_top: lea t4, [lpfq+strideq*4] mov lpfq, dstq lea t4, [t4+strideq*2] mov [rsp], t4 call .h lea t0, [t1+400*6] mov t2, t1 call .v jmp .main .h: ; horizontal boxsum lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .h_main .h_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu xm5, [lpfq+r10-2] .h_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -17 jl .h_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .h_have_right: pshufb m0, m5, m8 pmullw m2, m0, m0 pshufb m4, m5, m9 paddw m0, m4 pshufb m5, m10 paddw m0, m5 ; sum punpcklwd m3, m4, m5 pmaddwd m3, m3 punpckhwd m4, m5 pmaddwd m4, m4 
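; 3x3 box sums (editor's note): this variant mirrors the 5x5 filter with
; a 3-wide window.  The shuffles above align each pixel with its two
; horizontal neighbors so a single pass accumulates both their sum and
; their sum of squares; the vertical direction is folded in later by
; adding the per-row results kept in t1/t2.  With n = 9, the a/b step
; scales the squared sums by 9 (a shift by 3 plus an add) and b by 455,
; which is approximately (1 << 12) / 9, just as the 5x5 path uses 25
; and 164.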
punpcklwd m1, m2, m6 punpckhwd m2, m6 mova [t1+r10*2+400*0], m0 paddd m1, m3 ; sumsq paddd m2, m4 mova [t1+r10*2+400*2], m1 mova [t1+r10*2+400*4], m2 add r10, 16 jl .h_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .hv_main .hv_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .hv_main .hv_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu xm5, [lpfq+r10-2] .hv_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -17 jl .hv_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv_have_right: pshufb m0, m5, m8 pmullw m3, m0, m0 pshufb m1, m5, m9 paddw m0, m1 pshufb m5, m10 paddw m0, m5 ; h sum punpcklwd m4, m5, m1 pmaddwd m4, m4 punpckhwd m5, m1 pmaddwd m5, m5 paddw m1, m0, [t2+r10*2+400*0] paddw m1, [t1+r10*2+400*0] ; hv sum punpcklwd m2, m3, m6 punpckhwd m3, m6 paddd m4, m2 ; h sumsq paddd m5, m3 paddd m2, m4, [t2+r10*2+400*2] paddd m3, m5, [t2+r10*2+400*4] paddd m2, [t1+r10*2+400*2] ; hv sumsq paddd m3, [t1+r10*2+400*4] mova [t0+r10*2+400*0], m0 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 mova [t0+r10*2+400*2], m4 pslld m4, m2, 3 mova [t0+r10*2+400*4], m5 pslld m5, m3, 3 paddd m4, m2 ; a * 9 pmaddwd m2, m0, m0 ; b * b paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m11 ; p * s pmulld m5, m11 pmaddwd m0, m12 ; b * 455 pmaddwd m1, m12 paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r14+m3*4], m4 psrad m4, m5, 20 vpgatherdd m3, [r14+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m13 pand m0, m14 pand m1, m14 por m0, m2 ; a | (b << 12) por m1, m3 mova [t3+r10*4+ 8], xm0 vextracti128 [t3+r10*4+40], m0, 1 mova [t3+r10*4+24], xm1 vextracti128 [t3+r10*4+56], m1, 1 add r10, 16 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 ret .v: ; vertical boxsum + ab lea r10, [wq-2] .v_loop: mova m1, [t1+r10*2+400*0] paddw m1, m1 paddw m1, [t2+r10*2+400*0] ; hv sum mova m2, [t1+r10*2+400*2] mova m3, [t1+r10*2+400*4] paddd m2, m2 paddd m3, m3 paddd m2, [t2+r10*2+400*2] ; hv sumsq paddd m3, [t2+r10*2+400*4] punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a * 9 pmaddwd m2, m0, m0 ; b * b paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m11 ; p * s pmulld m5, m11 pmaddwd m0, m12 ; b * 455 pmaddwd m1, m12 paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r14+m3*4], m4 psrad m4, m5, 20 vpgatherdd m3, [r14+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m13 pand m0, m14 pand m1, m14 por m0, m2 ; a | (b << 12) por m1, m3 mova [t3+r10*4+ 8], xm0 vextracti128 [t3+r10*4+40], m0, 1 mova [t3+r10*4+24], xm1 vextracti128 [t3+r10*4+56], m1, 1 add r10, 16 jl .v_loop ret .prep_n: ; initial neighbor setup mov r10, wq mov t4, t3 add t3, 400*4 .prep_n_loop: mova m2, [t5+r10*4+0] mova m3, [t4+r10*4+0] paddd m2, [t5+r10*4+8] paddd m3, [t4+r10*4+8] paddd m0, m2, [t5+r10*4+4] paddd m1, m3, [t4+r10*4+4] pslld m0, 2 paddd m1, m1 ; ab[ 0] 222 psubd m0, m2 ; ab[-1] 343 mova [t3+r10*4+400*4], m1 paddd m1, m1 mova [t5+r10*4], m0 psubd m1, m3 ; ab[ 0] 343 mova [t4+r10*4], m1 add r10, 8 jl .prep_n_loop ret ; a+b are packed together in a single dword, but we can't do the ; full neighbor 
calculations before splitting them since we don't ; have sufficient precision. The solution is to do the calculations ; in two equal halves and split a and b before doing the final sum. ALIGN function_align .n: ; neighbor + output mov r10, wq .n_loop: mova m4, [t3+r10*4+ 0] paddd m4, [t3+r10*4+ 8] paddd m5, m4, [t3+r10*4+ 4] paddd m5, m5 ; ab[+1] 222 mova m2, [t3+r10*4+400*4+ 0] paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 mova m3, [t3+r10*4+400*4+32] paddd m1, m3, [t5+r10*4+32] mova [t3+r10*4+400*4+ 0], m5 paddd m5, m5 psubd m5, m4 ; ab[+1] 343 mova [t5+r10*4+ 0], m5 paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 mova m4, [t3+r10*4+32] paddd m4, [t3+r10*4+40] paddd m5, m4, [t3+r10*4+36] paddd m5, m5 mova [t3+r10*4+400*4+32], m5 paddd m5, m5 psubd m5, m4 mova [t5+r10*4+32], m5 pandn m4, m14, m0 psrld m0, 12 paddd m3, m5 pandn m5, m14, m2 psrld m2, 12 paddd m4, m5 ; a pandn m5, m14, m1 psrld m1, 12 paddd m0, m2 ; b + (1 << 8) pandn m2, m14, m3 psrld m3, 12 paddd m5, m2 pmovzxbd m2, [dstq+r10+0] paddd m1, m3 pmovzxbd m3, [dstq+r10+8] pmaddwd m4, m2 ; a * src pmaddwd m5, m3 packssdw m2, m3 psubd m0, m4 ; b - a * src + (1 << 8) psubd m1, m5 psrad m0, 9 psrad m1, 9 packssdw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 pshufd xm0, xm0, q3120 mova [dstq+r10], xm0 add r10, 16 jl .n_loop mov r10, t5 mov t5, t4 mov t4, r10 add dstq, strideq ret cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \ w, h, edge, params %define base r12-sgr_x_by_x_avx2-256*4 lea r12, [sgr_x_by_x_avx2+256*4] mov paramsq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m vbroadcasti128 m9, [base+sgr_shuf+0] vbroadcasti128 m10, [base+sgr_shuf+8] add lpfq, wq vbroadcasti128 m11, [base+sgr_shuf+2] vbroadcasti128 m12, [base+sgr_shuf+6] add dstq, wq vpbroadcastd m15, [paramsq+8] ; w0 w1 lea t3, [rsp+wq*4+400*24+8] vpbroadcastd m13, [paramsq+0] ; s0 pxor m7, m7 vpbroadcastd m14, [paramsq+4] ; s1 lea t1, [rsp+wq*2+12] neg wq psllw m15, 2 ; to reuse existing pd_m4096 register for rounding test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup add t1, 400*12 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq call .hv0 test hd, hd jz .odd_height add lpfq, strideq call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom add lpfq, strideq call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+400*12] lea r10, [wq-2] .top_fixup_loop: mova m0, [t1+r10*2+400* 0] mova m1, [t1+r10*2+400* 2] mova m2, [t1+r10*2+400* 4] paddw m0, m0 mova m3, [t1+r10*2+400* 6] paddd m1, m1 mova m4, [t1+r10*2+400* 8] paddd m2, m2 mova m5, [t1+r10*2+400*10] mova [t2+r10*2+400* 0], m0 mova [t2+r10*2+400* 2], m1 mova [t2+r10*2+400* 4], m2 mova [t2+r10*2+400* 6], m3 mova [t2+r10*2+400* 8], m4 mova [t2+r10*2+400*10], m5 add r10, 16 jl .top_fixup_loop call .v0 jmp .main .h: ; horizontal boxsums lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastd xm0, [leftq] 
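; Left-edge handling (editor's note, same pattern as the other sgr entry
; points): with LR_HAVE_LEFT set, the four pixels to the left of the
; restoration unit are passed separately through left[]; they are
; broadcast above and rotated in front of the row by the palignr that
; follows.  Without LR_HAVE_LEFT, .h_extend_left instead replicates the
; first pixel of the row via sgr_l_shuf.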
mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .h_main .h_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu xm5, [lpfq+r10-2] .h_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -18 jl .h_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .h_have_right: pshufb m6, m5, m9 pshufb m4, m5, m10 paddw m8, m6, m4 shufps m0, m6, m4, q2121 pmullw m3, m0, m0 pshufb m2, m5, m11 paddw m0, m2 pshufb m5, m12 paddw m0, m5 ; sum3 punpcklwd m1, m2, m5 pmaddwd m1, m1 punpckhwd m2, m5 pmaddwd m2, m2 punpcklwd m5, m6, m4 pmaddwd m5, m5 punpckhwd m6, m4 pmaddwd m6, m6 punpcklwd m4, m3, m7 paddd m1, m4 ; sumsq3 punpckhwd m3, m7 paddd m2, m3 mova [t1+r10*2+400* 6], m0 mova [t1+r10*2+400* 8], m1 mova [t1+r10*2+400*10], m2 paddw m8, m0 ; sum5 paddd m5, m1 ; sumsq5 paddd m6, m2 mova [t1+r10*2+400* 0], m8 mova [t1+r10*2+400* 2], m5 mova [t1+r10*2+400* 4], m6 add r10, 16 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .hv0_main .hv0_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .hv0_main .hv0_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left .hv0_loop: movu xm5, [lpfq+r10-2] .hv0_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp r10d, -18 jl .hv0_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv0_have_right: pshufb m6, m5, m9 pshufb m4, m5, m10 paddw m8, m6, m4 shufps m1, m6, m4, q2121 pmullw m0, m1, m1 pshufb m3, m5, m11 paddw m1, m3 pshufb m5, m12 paddw m1, m5 ; sum3 punpcklwd m2, m3, m5 pmaddwd m2, m2 punpckhwd m3, m5 pmaddwd m3, m3 punpcklwd m5, m6, m4 pmaddwd m5, m5 punpckhwd m6, m4 pmaddwd m6, m6 punpcklwd m4, m0, m7 paddd m2, m4 ; sumsq3 punpckhwd m0, m7 paddd m3, m0 paddw m8, m1 ; sum5 paddd m5, m2 ; sumsq5 paddd m6, m3 mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd mova [t3+r10*4+400*0+40], m6 paddw m8, [t1+r10*2+400* 0] paddd m5, [t1+r10*2+400* 2] paddd m6, [t1+r10*2+400* 4] mova [t1+r10*2+400* 0], m8 mova [t1+r10*2+400* 2], m5 mova [t1+r10*2+400* 4], m6 paddw m0, m1, [t1+r10*2+400* 6] paddd m4, m2, [t1+r10*2+400* 8] paddd m5, m3, [t1+r10*2+400*10] mova [t1+r10*2+400* 6], m1 mova [t1+r10*2+400* 8], m2 mova [t1+r10*2+400*10], m3 paddw m1, m0, [t2+r10*2+400* 6] paddd m2, m4, [t2+r10*2+400* 8] paddd m3, m5, [t2+r10*2+400*10] mova [t2+r10*2+400* 6], m0 mova [t2+r10*2+400* 8], m4 mova [t2+r10*2+400*10], m5 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a3 * 9 pmaddwd m2, m0, m0 ; b3 * b paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p3 vpbroadcastd m2, [base+pd_0xf00801c7] psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m2 ; b3 * 455 pmaddwd m1, m2 paddusw m4, m2 paddusw m5, m2 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 vpbroadcastd m4, [base+pd_34816] pmulld m0, m2 vpbroadcastd m5, [base+pd_m4096] pmulld m1, m3 paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m4 pand m0, m5 pand m1, m5 por m0, m2 ; a3 | (b3 << 12) por m1, m3 mova [t3+r10*4+400*4+ 8], xm0 vextracti128 
[t3+r10*4+400*4+40], m0, 1 mova [t3+r10*4+400*4+24], xm1 vextracti128 [t3+r10*4+400*4+56], m1, 1 add r10, 16 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .hv1_main .hv1_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .hv1_main .hv1_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left .hv1_loop: movu xm5, [lpfq+r10-2] .hv1_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp r10d, -18 jl .hv1_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv1_have_right: pshufb m6, m5, m9 pshufb m3, m5, m10 paddw m8, m6, m3 shufps m2, m6, m3, q2121 pmullw m1, m2, m2 pshufb m0, m5, m11 paddw m2, m0 pshufb m5, m12 paddw m2, m5 ; sum3 punpcklwd m4, m5, m0 pmaddwd m4, m4 punpckhwd m5, m0 pmaddwd m5, m5 punpcklwd m0, m6, m3 pmaddwd m0, m0 punpckhwd m6, m3 pmaddwd m6, m6 punpcklwd m3, m1, m7 paddd m4, m3 ; sumsq3 punpckhwd m1, m7 paddd m5, m1 paddw m1, m2, [t2+r10*2+400* 6] mova [t2+r10*2+400* 6], m2 paddw m8, m2 ; sum5 paddd m2, m4, [t2+r10*2+400* 8] paddd m3, m5, [t2+r10*2+400*10] mova [t2+r10*2+400* 8], m4 mova [t2+r10*2+400*10], m5 paddd m4, m0 ; sumsq5 paddd m5, m6 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pslld m6, m2, 3 pslld m7, m3, 3 paddd m6, m2 ; a3 * 9 pmaddwd m2, m0, m0 ; b3 * b3 paddd m7, m3 pmaddwd m3, m1, m1 psubd m6, m2 ; p3 vpbroadcastd m2, [base+pd_0xf00801c7] psubd m7, m3 pmulld m6, m14 ; p3 * s1 pmulld m7, m14 pmaddwd m0, m2 ; b3 * 455 pmaddwd m1, m2 paddusw m6, m2 paddusw m7, m2 psrad m3, m6, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r12+m3*4], m6 psrad m6, m7, 20 vpgatherdd m3, [r12+m6*4], m7 vpbroadcastd m6, [base+pd_34816] ; x3 pmulld m0, m2 vpbroadcastd m7, [base+pd_m4096] pmulld m1, m3 paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m6 pand m0, m7 pand m7, m1 por m0, m2 ; a3 | (b3 << 12) por m7, m3 paddw m1, m8, [t2+r10*2+400*0] paddd m2, m4, [t2+r10*2+400*2] paddd m3, m5, [t2+r10*2+400*4] paddw m1, [t1+r10*2+400*0] paddd m2, [t1+r10*2+400*2] paddd m3, [t1+r10*2+400*4] mova [t2+r10*2+400*0], m8 mova [t2+r10*2+400*2], m4 mova [t2+r10*2+400*4], m5 mova [t3+r10*4+400*8+ 8], xm0 vextracti128 [t3+r10*4+400*8+40], m0, 1 mova [t3+r10*4+400*8+24], xm7 vextracti128 [t3+r10*4+400*8+56], m7, 1 vpbroadcastd m4, [base+pd_25] pxor m7, m7 punpcklwd m0, m1, m7 ; b5 punpckhwd m1, m7 pmulld m2, m4 ; a5 * 25 pmulld m3, m4 pmaddwd m4, m0, m0 ; b5 * b5 pmaddwd m5, m1, m1 psubd m2, m4 ; p5 vpbroadcastd m4, [base+pd_0xf00800a4] psubd m3, m5 pmulld m2, m13 ; p5 * s0 pmulld m3, m13 pmaddwd m0, m4 ; b5 * 164 pmaddwd m1, m4 paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 vpgatherdd m4, [r12+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r12+m2*4], m3 pmulld m0, m4 pmulld m1, m5 paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m6 vpbroadcastd m6, [base+pd_m4096] pand m0, m6 pand m1, m6 por m0, m4 ; a5 | (b5 << 12) por m1, m5 mova [t3+r10*4+400*0+ 8], xm0 vextracti128 [t3+r10*4+400*0+40], m0, 1 mova [t3+r10*4+400*0+24], xm1 vextracti128 [t3+r10*4+400*0+56], m1, 1 add r10, 16 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab3 (even rows) lea r10, [wq-2] vpbroadcastd m6, [base+pd_34816] vpbroadcastd m8, [base+pd_m4096] .v0_loop: mova m0, [t1+r10*2+400* 6] mova m4, [t1+r10*2+400* 8] mova m5, [t1+r10*2+400*10] paddw m0, m0 paddd 
m4, m4 paddd m5, m5 paddw m1, m0, [t2+r10*2+400* 6] paddd m2, m4, [t2+r10*2+400* 8] paddd m3, m5, [t2+r10*2+400*10] mova [t2+r10*2+400* 6], m0 mova [t2+r10*2+400* 8], m4 mova [t2+r10*2+400*10], m5 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a3 * 9 pmaddwd m2, m0, m0 ; b3 * b3 paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p3 vpbroadcastd m2, [base+pd_0xf00801c7] psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m2 ; b3 * 455 pmaddwd m1, m2 paddusw m4, m2 paddusw m5, m2 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m6 pand m0, m8 pand m1, m8 por m0, m2 ; a3 | (b3 << 12) por m1, m3 mova m2, [t1+r10*2+400*0] mova m3, [t1+r10*2+400*2] mova m4, [t1+r10*2+400*4] mova [t3+r10*4+400*8+ 8], m2 mova [t3+r10*4+400*0+ 8], m3 mova [t3+r10*4+400*0+40], m4 paddw m2, m2 ; cc5 paddd m3, m3 paddd m4, m4 mova [t1+r10*2+400*0], m2 mova [t1+r10*2+400*2], m3 mova [t1+r10*2+400*4], m4 mova [t3+r10*4+400*4+ 8], xm0 vextracti128 [t3+r10*4+400*4+40], m0, 1 mova [t3+r10*4+400*4+24], xm1 vextracti128 [t3+r10*4+400*4+56], m1, 1 add r10, 16 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) lea r10, [wq-2] .v1_loop: mova m4, [t1+r10*2+400* 6] mova m5, [t1+r10*2+400* 8] mova m6, [t1+r10*2+400*10] paddw m1, m4, [t2+r10*2+400* 6] paddd m2, m5, [t2+r10*2+400* 8] paddd m3, m6, [t2+r10*2+400*10] mova [t2+r10*2+400* 6], m4 mova [t2+r10*2+400* 8], m5 mova [t2+r10*2+400*10], m6 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a3 * 9 pmaddwd m2, m0, m0 ; b3 * b3 paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p3 vpbroadcastd m2, [base+pd_0xf00801c7] psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m2 ; b3 * 455 pmaddwd m1, m2 paddusw m4, m2 paddusw m5, m2 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 vpbroadcastd m4, [base+pd_34816] pmulld m0, m2 vpbroadcastd m8, [base+pd_m4096] pmulld m1, m3 paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m4 pand m0, m8 pand m8, m1 por m0, m2 ; a3 | (b3 << 12) por m8, m3 mova m4, [t3+r10*4+400*8+ 8] mova m5, [t3+r10*4+400*0+ 8] mova m6, [t3+r10*4+400*0+40] paddw m1, m4, [t2+r10*2+400*0] paddd m2, m5, [t2+r10*2+400*2] paddd m3, m6, [t2+r10*2+400*4] paddw m1, [t1+r10*2+400*0] paddd m2, [t1+r10*2+400*2] paddd m3, [t1+r10*2+400*4] mova [t2+r10*2+400*0], m4 mova [t2+r10*2+400*2], m5 mova [t2+r10*2+400*4], m6 vpbroadcastd m4, [base+pd_25] mova [t3+r10*4+400*8+ 8], xm0 vextracti128 [t3+r10*4+400*8+40], m0, 1 mova [t3+r10*4+400*8+24], xm8 vextracti128 [t3+r10*4+400*8+56], m8, 1 punpcklwd m0, m1, m7 ; b5 punpckhwd m1, m7 pmulld m2, m4 ; a5 * 25 pmulld m3, m4 pmaddwd m4, m0, m0 ; b5 * b5 pmaddwd m5, m1, m1 psubd m2, m4 ; p5 vpbroadcastd m4, [base+pd_0xf00800a4] psubd m3, m5 pmulld m2, m13 ; p5 * s0 pmulld m3, m13 pmaddwd m0, m4 ; b5 * 164 pmaddwd m1, m4 paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 vpgatherdd m4, [r12+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r12+m2*4], m3 pmulld m0, m4 vpbroadcastd m6, [base+pd_34816] pmulld m1, m5 paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m6 vpbroadcastd m6, [base+pd_m4096] pand m0, m6 pand m1, m6 por m0, m4 ; a5 | (b5 << 12) por m1, m5 mova [t3+r10*4+400*0+ 8], xm0 vextracti128 [t3+r10*4+400*0+40], m0, 1 mova [t3+r10*4+400*0+24], xm1 vextracti128 
[t3+r10*4+400*0+56], m1, 1 add r10, 16 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu m0, [t3+r10*4+400*0+4] paddd m1, m0, [t3+r10*4+400*0+0] mova m4, [t3+r10*4+400*4+0] paddd m1, [t3+r10*4+400*0+8] mova m5, [t3+r10*4+400*8+0] paddd m4, [t3+r10*4+400*4+8] paddd m5, [t3+r10*4+400*8+8] paddd m2, m4, [t3+r10*4+400*4+4] paddd m3, m5, [t3+r10*4+400*8+4] paddd m0, m1 pslld m1, 2 pslld m2, 2 paddd m1, m0 ; ab5 565 paddd m3, m3 ; ab3[ 0] 222 psubd m2, m4 ; ab3[-1] 343 mova [t3+r10*4+400*20], m3 pandn m0, m6, m1 ; a5 565 mova [t3+r10*4+400*24], m2 psrld m1, 12 ; b5 565 mova [t3+r10*4+400*12], m0 paddd m3, m3 mova [t3+r10*4+400*16], m1 psubd m3, m5 ; ab3[ 0] 343 mova [t3+r10*4+400*28], m3 add r10, 8 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu m0, [t3+r10*4+4] paddd m4, m0, [t3+r10*4+0] paddd m4, [t3+r10*4+8] paddd m0, m4 pslld m4, 2 paddd m4, m0 pandn m0, m6, m4 psrld m4, 12 paddd m2, m0, [t3+r10*4+400*12] ; a5 mova [t3+r10*4+400*12], m0 paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) mova [t3+r10*4+400*16], m4 mova m3, [t3+r10*4+400*4+0] paddd m3, [t3+r10*4+400*4+8] paddd m5, m3, [t3+r10*4+400*4+4] paddd m5, m5 ; ab3[ 1] 222 mova m4, [t3+r10*4+400*20] paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343 mova [t3+r10*4+400*20], m5 paddd m5, m5 psubd m5, m3 ; ab3[ 1] 343 mova [t3+r10*4+400*24], m5 paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 pandn m3, m6, m1 psrld m1, 12 pandn m5, m6, m4 psrld m4, 12 paddd m3, m5 ; a3 paddd m1, m4 ; b3 + (1 << 8) pmovzxbd m4, [dstq+r10] pmaddwd m2, m4 ; a5 * src pmaddwd m3, m4 ; a3 * src psubd m0, m2 ; b5 - a5 * src + (1 << 8) psubd m1, m3 ; b3 - a3 * src + (1 << 8) psrld m0, 9 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 psubd m0, m6 psrad m0, 13 paddd m0, m4 vextracti128 xm1, m0, 1 packssdw xm0, xm1 packuswb xm0, xm0 movq [dstq+r10], xm0 add r10, 8 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova m3, [t3+r10*4+400*8+0] paddd m3, [t3+r10*4+400*8+8] paddd m5, m3, [t3+r10*4+400*8+4] paddd m5, m5 ; ab3[ 1] 222 mova m4, [t3+r10*4+400*20] paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343 mova [t3+r10*4+400*20], m5 paddd m5, m5 psubd m5, m3 ; ab3[ 1] 343 mova [t3+r10*4+400*28], m5 paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 pandn m3, m6, m1 psrld m1, 12 pandn m5, m6, m4 psrld m4, 12 paddd m3, m5 ; -a3 paddd m1, m4 ; b3 + (1 << 8) pmovzxbd m4, [dstq+r10] pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) pmaddwd m3, m4 ; -a3 * src psubd m0, m2 ; a5 * src + b5 + (1 << 7) psubd m1, m3 ; a3 * src + b3 + (1 << 8) psrld m0, 8 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 psubd m0, m6 psrad m0, 13 paddd m0, m4 vextracti128 xm1, m0, 1 packssdw xm0, xm1 packuswb xm0, xm0 movq [dstq+r10], xm0 add r10, 8 jl .n1_loop add dstq, strideq ret %endif ; ARCH_X86_64
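; Editor's note on the final mix output (.n0/.n1 of sgr_filter_mix): the
; 5x5 and 3x3 passes each yield a filtered value per pixel, and the blend
; follows the AV1 self-guided projection, roughly
;   dst = clip_pixel(src + ((w0*(flt5 - u) + w1*(flt3 - u) + rnd) >> shift))
; with u the source sample scaled to the intermediate precision.  This is
; a hedged scalar sketch, not the exact SIMD data flow: here w0/w1 are
; pre-shifted left by 2 so that the existing pd_m4096 constant can double
; as the rounding term before the final 13-bit shift (see the comment at
; the top of sgr_filter_mix).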