; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_m14_m13: times 8 db -14,-13 pb_m10_m9: times 8 db -10, -9 pb_m6_m5: times 8 db -6, -5 pb_m2_m1: times 8 db -2, -1 pb_2_3: times 8 db 2, 3 pb_6_7: times 8 db 6, 7 pw_256: times 8 dw 256 pw_1023: times 8 dw 1023 pd_8: times 4 dd 8 pd_4096: times 4 dd 4096 pd_34816: times 4 dd 34816 pd_m262128: times 4 dd -262128 pd_0xffff: times 4 dd 0xffff pd_0xf00800a4: times 4 dd 0xf00800a4 pd_0xf00801c7: times 4 dd 0xf00801c7 pd_0xfffffff0: times 4 dd 0xfffffff0 wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 wiener_round: dd 1049600, 1048832 cextern sgr_x_by_x SECTION .text %macro movif64 2 ; dst, src %if ARCH_X86_64 mov %1, %2 %endif %endmacro %macro movif32 2 ; dst, src %if ARCH_X86_32 mov %1, %2 %endif %endmacro INIT_XMM ssse3 %if ARCH_X86_32 DECLARE_REG_TMP 5, 6 %if STACK_ALIGNMENT < 16 %assign extra_stack 13*16 %else %assign extra_stack 12*16 %endif cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \ dst, stride, left, lpf, w, flt %if STACK_ALIGNMENT < 16 %define lpfm dword [esp+calloff+16*12+ 0] %define wm dword [esp+calloff+16*12+ 4] %define hd dword [esp+calloff+16*12+ 8] %define edgeb byte [esp+calloff+16*12+12] %define edged dword [esp+calloff+16*12+12] %else %define hd dword r5m %define edgeb byte r7m %endif %define PICmem dword [esp+calloff+4*0] %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers %define t1m dword [esp+calloff+4*2] %define t2m dword [esp+calloff+4*3] %define 
t3m dword [esp+calloff+4*4] %define t4m dword [esp+calloff+4*5] %define t5m dword [esp+calloff+4*6] %define t6m dword [esp+calloff+4*7] %define t2 t2m %define t3 t3m %define t4 t4m %define t5 t5m %define t6 t6m %define m8 [esp+calloff+16*2] %define m9 [esp+calloff+16*3] %define m10 [esp+calloff+16*4] %define m11 [esp+calloff+16*5] %define m12 [esp+calloff+16*6] %define m13 [esp+calloff+16*7] %define m14 [esp+calloff+16*8] %define m15 [esp+calloff+16*9] %define r10 r4 %define base t0-wiener_shifts %assign calloff 0 %if STACK_ALIGNMENT < 16 mov wd, [rstk+stack_offset+20] mov wm, wd mov r5, [rstk+stack_offset+24] mov hd, r5 mov r5, [rstk+stack_offset+32] mov edged, r5 ; edge %endif %else DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ w, h, edge, flt %define base %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov fltq, r6mp movifnidn hd, hm mov edged, r7m mov t3d, r8m ; pixel_max movq m13, [fltq] movq m15, [fltq+16] %else %if STACK_ALIGNMENT < 16 mov t0, [rstk+stack_offset+28] mov t1, [rstk+stack_offset+36] ; pixel_max movq m1, [t0] ; fx movq m3, [t0+16] ; fy LEA t0, wiener_shifts %else mov fltq, r6m movq m1, [fltq] movq m3, [fltq+16] LEA t0, wiener_shifts mov t1, r8m ; pixel_max %endif mov PICmem, t0 %endif mova m6, [base+wiener_shufA] mova m7, [base+wiener_shufB] %if ARCH_X86_64 lea t4, [wiener_shifts] add wd, wd pshufd m12, m13, q0000 ; x0 x1 pshufd m13, m13, q1111 ; x2 x3 pshufd m14, m15, q0000 ; y0 y1 pshufd m15, m15, q1111 ; y2 y3 mova m8, [wiener_shufC] mova m9, [wiener_shufD] add lpfq, wq lea t1, [rsp+wq+16] add dstq, wq neg wq shr t3d, 11 %define base t4-wiener_shifts movd m10, [base+wiener_round+t3*4] movq m11, [base+wiener_shifts+t3*8] pshufd m10, m10, q0000 pshufd m0, m11, q0000 pshufd m11, m11, q1111 pmullw m12, m0 ; upshift filter coefs to make the pmullw m13, m0 ; horizontal downshift constant DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %define base %define wiener_lshuf7_mem [wiener_lshuf7] %define pd_m262128_mem [pd_m262128] %else add wd, wd mova m4, [base+wiener_shufC] mova m5, [base+wiener_shufD] pshufd m0, m1, q0000 pshufd m1, m1, q1111 pshufd m2, m3, q0000 pshufd m3, m3, q1111 mova m8, m4 mova m9, m5 mova m14, m2 mova m15, m3 shr t1, 11 add lpfq, wq mova m3, [base+pd_m262128] movd m4, [base+wiener_round+t1*4] movq m5, [base+wiener_shifts+t1*8] lea t1, [esp+extra_stack+wq+16] add dstq, wq neg wq pshufd m4, m4, q0000 pshufd m2, m5, q0000 pshufd m5, m5, q1111 mov wm, wq pmullw m0, m2 pmullw m1, m2 mova m2, [base+wiener_lshuf7] %define pd_m262128_mem [esp+calloff+16*10] mova pd_m262128_mem, m3 mova m10, m4 mova m11, m5 mova m12, m0 mova m13, m1 %define wiener_lshuf7_mem [esp+calloff+16*11] mova wiener_lshuf7_mem, m2 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 add r10, strideq mov lpfm, r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 .main: lea t0, [t1+384*2] .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 mov lpfq, lpfm call .hv_bottom add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h mov t6, t1 mov t5, t1 
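; (no LR_HAVE_TOP: the row above the frame is unavailable, so every ring
;  buffer slot t2-t6 is pointed at the single row produced by .h above,
;  which effectively replicates the top edge for the vertical pass)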
mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 lea t0, [t1+384*2] call .hv dec hd jz .v3 add t0, 384*8 call .hv dec hd jnz .main .v3: call .v movif32 wq, wm .v2: call .v movif32 wq, wm jmp .v1 .extend_right: %assign stack_offset stack_offset+8 %assign calloff 8 movif32 t0, PICmem pxor m0, m0 movd m1, wd mova m2, [base+pb_0to15] pshufb m1, m0 mova m0, [base+pb_6_7] psubb m0, m1 pminub m0, m2 pshufb m3, m0 mova m0, [base+pb_m2_m1] psubb m0, m1 pminub m0, m2 pshufb m4, m0 mova m0, [base+pb_m10_m9] psubb m0, m1 pminub m0, m2 pshufb m5, m0 movif32 t0, t0m ret %assign stack_offset stack_offset-4 %assign calloff 4 .h: movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movq m3, [leftq] movhps m3, [lpfq+wq] add leftq, 8 jmp .h_main .h_extend_left: mova m3, [lpfq+wq] ; avoid accessing memory located pshufb m3, wiener_lshuf7_mem ; before the start of the buffer jmp .h_main .h_top: movif64 wq, r4 test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m3, [lpfq+wq-8] .h_main: mova m4, [lpfq+wq+0] movu m5, [lpfq+wq+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -20 jl .h_have_right call .extend_right .h_have_right: pshufb m0, m3, m6 pshufb m1, m4, m7 paddw m0, m1 pshufb m3, m8 pmaddwd m0, m12 pshufb m1, m4, m9 paddw m3, m1 pshufb m1, m4, m6 pmaddwd m3, m13 pshufb m2, m5, m7 paddw m1, m2 mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) pshufb m4, m8 pmaddwd m1, m12 pshufb m5, m9 paddw m4, m5 pmaddwd m4, m13 paddd m0, m2 paddd m1, m2 paddd m0, m3 paddd m1, m4 psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t1+wq], m0 add wq, 16 jl .h_loop movif32 wq, wm ret ALIGN function_align .hv: add lpfq, strideq movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movq m3, [leftq] movhps m3, [lpfq+wq] add leftq, 8 jmp .hv_main .hv_extend_left: mova m3, [lpfq+wq] pshufb m3, wiener_lshuf7_mem jmp .hv_main .hv_bottom: movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m3, [lpfq+wq-8] .hv_main: mova m4, [lpfq+wq+0] movu m5, [lpfq+wq+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp wd, -20 jl .hv_have_right call .extend_right .hv_have_right: movif32 t1, t4m movif32 t0, t2m pshufb m0, m3, m6 pshufb m1, m4, m7 paddw m0, m1 pshufb m3, m8 pmaddwd m0, m12 pshufb m1, m4, m9 paddw m3, m1 pshufb m1, m4, m6 pmaddwd m3, m13 pshufb m2, m5, m7 paddw m1, m2 mova m2, pd_m262128_mem pshufb m4, m8 pmaddwd m1, m12 pshufb m5, m9 paddw m4, m5 pmaddwd m4, m13 paddd m0, m2 paddd m1, m2 %if ARCH_X86_64 mova m2, [t4+wq] paddw m2, [t2+wq] mova m5, [t3+wq] %else mova m2, [t1+wq] paddw m2, [t0+wq] mov t1, t3m mov t0, t5m mova m5, [t1+wq] mov t1, t1m %endif paddd m0, m3 paddd m1, m4 psrad m0, 4 psrad m1, 4 packssdw m0, m1 %if ARCH_X86_64 mova m4, [t5+wq] paddw m4, [t1+wq] psraw m0, 1 paddw m3, m0, [t6+wq] %else mova m4, [t0+wq] paddw m4, [t1+wq] mov t0, t0m mov t1, t6m psraw m0, 1 paddw m3, m0, [t1+wq] %endif mova [t0+wq], m0 punpcklwd m0, m2, m5 pmaddwd m0, m15 punpckhwd m2, m5 pmaddwd m2, m15 punpcklwd m1, m3, m4 pmaddwd m1, m14 punpckhwd m3, m4 pmaddwd m3, m14 paddd m0, m10 paddd m2, m10 paddd m0, m1 paddd m2, m3 psrad m0, 6 psrad m2, 6 packssdw m0, m2 pmulhw m0, m11 pxor m1, m1 pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 jl .hv_loop %if ARCH_X86_64 mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 %else mov r4, t5m mov t1, t4m mov t6m, r4 mov 
t5m, t1 mov r4, t3m mov t1, t2m mov t4m, r4 mov t3m, t1 mov r4, t1m mov t1, t0 mov t2m, r4 mov t0, t6m mov wq, wm %endif add dstq, strideq ret .v: movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 .v_loop: %if ARCH_X86_64 mova m1, [t4+wq] paddw m1, [t2+wq] mova m2, [t3+wq] mova m4, [t1+wq] paddw m3, m4, [t6+wq] paddw m4, [t5+wq] %else mov t0, t4m mov t1, t2m mova m1, [t0+wq] paddw m1, [t1+wq] mov t0, t3m mov t1, t1m mova m2, [t0+wq] mova m4, [t1+wq] mov t0, t6m mov t1, t5m paddw m3, m4, [t0+wq] paddw m4, [t1+wq] %endif punpcklwd m0, m1, m2 pmaddwd m0, m15 punpckhwd m1, m2 pmaddwd m1, m15 punpcklwd m2, m3, m4 pmaddwd m2, m14 punpckhwd m3, m4 pmaddwd m3, m14 paddd m0, m10 paddd m1, m10 paddd m0, m2 paddd m1, m3 psrad m0, 6 psrad m1, 6 packssdw m0, m1 pmulhw m0, m11 pxor m1, m1 pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 jl .v_loop %if ARCH_X86_64 mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 %else mov t0, t5m mov t1, t4m mov r4, t3m mov t6m, t0 mov t5m, t1 mov t4m, r4 mov r4, t2m mov t1, t1m mov t0, t0m mov t3m, r4 mov t2m, t1 %endif add dstq, strideq ret %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %assign stack_size 12*16+384*8 %else %assign stack_size 11*16+384*8 %endif cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \ lpf, w, flt %if STACK_ALIGNMENT < 16 %define lpfm dword [esp+calloff+4*6] %define wm dword [esp+calloff+4*7] %define hd dword [esp+calloff+16*10+0] %define edgeb byte [esp+calloff+16*10+4] %define edged dword [esp+calloff+16*10+4] %else %define hd dword r5m %define edgeb byte r7m %endif %define PICmem dword [esp+calloff+4*0] %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers %define t1m dword [esp+calloff+4*2] %define t2m dword [esp+calloff+4*3] %define t3m dword [esp+calloff+4*4] %define t4m dword [esp+calloff+4*5] %define t2 t2m %define t3 t3m %define t4 t4m %define m8 [esp+calloff+16*2] %define m9 [esp+calloff+16*3] %define m10 [esp+calloff+16*4] %define m11 [esp+calloff+16*5] %define m12 [esp+calloff+16*6] %define m13 [esp+calloff+16*7] %define m14 [esp+calloff+16*8] %define m15 [esp+calloff+16*9] %define base t0-wiener_shifts %assign calloff 0 %if STACK_ALIGNMENT < 16 mov wd, [rstk+stack_offset+20] mov wm, wd mov r5, [rstk+stack_offset+24] mov hd, r5 mov r5, [rstk+stack_offset+32] mov edged, r5 ; edge %endif %else cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \ w, h, edge, flt %define base %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov fltq, r6mp movifnidn hd, hm mov edged, r7m mov t3d, r8m ; pixel_max movq m12, [fltq] movq m14, [fltq+16] %else %if STACK_ALIGNMENT < 16 mov t0, [rstk+stack_offset+28] mov t1, [rstk+stack_offset+36] ; pixel_max movq m1, [t0] ; fx movq m3, [t0+16] ; fy LEA t0, wiener_shifts %else mov fltq, r6m movq m1, [fltq] movq m3, [fltq+16] LEA t0, wiener_shifts mov t1, r8m ; pixel_max %endif mov PICmem, t0 %endif mova m5, [base+wiener_shufE] mova m6, [base+wiener_shufB] mova m7, [base+wiener_shufD] %if ARCH_X86_64 lea t4, [wiener_shifts] add wd, wd punpcklwd m11, m12, m12 pshufd m11, m11, q1111 ; x1 pshufd m12, m12, q1111 ; x2 x3 punpcklwd m13, m14, m14 pshufd m13, m13, q1111 ; y1 pshufd m14, m14, q1111 ; y2 y3 shr t3d, 11 mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) add lpfq, wq lea t1, [rsp+wq+16] add dstq, wq neg wq %define base t4-wiener_shifts movd m9, [base+wiener_round+t3*4] movq m10, [base+wiener_shifts+t3*8] pshufd m9, m9, q0000 pshufd m0, m10, q0000 pshufd m10, m10, q1111 mova m15, [wiener_lshuf5] pmullw m11, m0 pmullw m12, m0 
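; upshift the x filter coefficients by the bitdepth-dependent factor in m0
; so the horizontal downshift stays constant (same trick as the commented
; pmullw pair in wiener_filter7 above)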
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %define base %else add wd, wd punpcklwd m0, m1, m1 pshufd m0, m0, q1111 ; x1 pshufd m1, m1, q1111 ; x2 x3 punpcklwd m2, m3, m3 pshufd m2, m2, q1111 ; y1 pshufd m3, m3, q1111 ; y2 y3 mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) mova m13, m2 mova m14, m3 mova m8, m4 shr t1, 11 add lpfq, wq movd m2, [base+wiener_round+t1*4] movq m3, [base+wiener_shifts+t1*8] %if STACK_ALIGNMENT < 16 lea t1, [esp+16*11+wq+16] %else lea t1, [esp+16*10+wq+16] %endif add dstq, wq neg wq pshufd m2, m2, q0000 pshufd m4, m3, q0000 pshufd m3, m3, q1111 mov wm, wq pmullw m0, m4 pmullw m1, m4 mova m4, [base+wiener_lshuf5] mova m9, m2 mova m10, m3 mova m11, m0 mova m12, m1 mova m15, m4 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 add r10, strideq mov lpfm, r10 ; below call .h mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 .main: mov t0, t4 .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 mov lpfq, lpfm call .hv_bottom add lpfq, strideq call .hv_bottom .end: RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 lea t0, [t1+384*2] call .hv dec hd jz .v2 add t0, 384*6 call .hv dec hd jnz .main .v2: call .v %if ARCH_X86_64 mov t4, t3 mov t3, t2 mov t2, t1 %else mov t0, t3m mov r4, t2m mov t1, t1m mov t4m, t0 mov t3m, r4 mov t2m, t1 mov wq, wm %endif add dstq, strideq .v1: call .v jmp .end .extend_right: %assign stack_offset stack_offset+8 %assign calloff 8 movif32 t0, PICmem pxor m1, m1 movd m2, wd mova m0, [base+pb_2_3] pshufb m2, m1 mova m1, [base+pb_m6_m5] psubb m0, m2 psubb m1, m2 mova m2, [base+pb_0to15] pminub m0, m2 pminub m1, m2 pshufb m3, m0 pshufb m4, m1 ret %assign stack_offset stack_offset-4 %assign calloff 4 .h: movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left mova m4, [lpfq+wq] movd m3, [leftq+4] pslldq m4, 4 por m3, m4 add leftq, 8 jmp .h_main .h_extend_left: mova m3, [lpfq+wq] ; avoid accessing memory located pshufb m3, m15 ; before the start of the buffer jmp .h_main .h_top: movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m3, [lpfq+wq-4] .h_main: movu m4, [lpfq+wq+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -18 jl .h_have_right call .extend_right .h_have_right: pshufb m0, m3, m5 pmaddwd m0, m11 pshufb m1, m4, m5 pmaddwd m1, m11 pshufb m2, m3, m6 pshufb m3, m7 paddw m2, m3 pshufb m3, m4, m6 pmaddwd m2, m12 pshufb m4, m7 paddw m3, m4 pmaddwd m3, m12 paddd m0, m8 paddd m1, m8 paddd m0, m2 paddd m1, m3 psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t1+wq], m0 add wq, 16 jl .h_loop movif32 wq, wm ret ALIGN function_align .hv: add lpfq, strideq movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left mova m4, [lpfq+wq] movd m3, [leftq+4] pslldq m4, 4 por m3, m4 add leftq, 8 jmp .hv_main .hv_extend_left: mova m3, [lpfq+wq] pshufb m3, m15 jmp .hv_main .hv_bottom: movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m3, [lpfq+wq-4] .hv_main: movu m4, [lpfq+wq+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp wd, -18 jl .hv_have_right call .extend_right .hv_have_right: movif32 t1, t1m movif32 t0, t3m 
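; horizontal 5-tap pass: gather the taps with pshufb (m5/m6/m7 shuffles),
; multiply-accumulate against m11/m12 with pmaddwd, then add the bias in
; m8 ((1 << 4) - (1 << 18)) before the >> 4 downshift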
pshufb m0, m3, m5 pmaddwd m0, m11 pshufb m1, m4, m5 pmaddwd m1, m11 pshufb m2, m3, m6 pshufb m3, m7 paddw m2, m3 pshufb m3, m4, m6 pmaddwd m2, m12 pshufb m4, m7 paddw m3, m4 pmaddwd m3, m12 paddd m0, m8 paddd m1, m8 paddd m0, m2 %if ARCH_X86_64 mova m2, [t3+wq] paddw m2, [t1+wq] paddd m1, m3 mova m4, [t2+wq] %else mova m2, [t0+wq] mov t0, t2m paddw m2, [t1+wq] mov t1, t4m paddd m1, m3 mova m4, [t0+wq] mov t0, t0m %endif punpckhwd m3, m2, m4 pmaddwd m3, m14 punpcklwd m2, m4 %if ARCH_X86_64 mova m4, [t4+wq] %else mova m4, [t1+wq] %endif psrad m0, 4 psrad m1, 4 packssdw m0, m1 pmaddwd m2, m14 psraw m0, 1 mova [t0+wq], m0 punpckhwd m1, m0, m4 pmaddwd m1, m13 punpcklwd m0, m4 pmaddwd m0, m13 paddd m3, m9 paddd m2, m9 paddd m1, m3 paddd m0, m2 psrad m1, 6 psrad m0, 6 packssdw m0, m1 pmulhw m0, m10 pxor m1, m1 pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 jl .hv_loop %if ARCH_X86_64 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t4 %else mov r4, t3m mov t1, t2m mov t4m, r4 mov t3m, t1 mov r4, t1m mov t1, t0 mov t2m, r4 mov t0, t4m mov wq, wm %endif add dstq, strideq ret .v: movif64 wq, r4 movif32 t1m, t1 .v_loop: %if ARCH_X86_64 mova m0, [t1+wq] paddw m2, m0, [t3+wq] mova m1, [t2+wq] mova m4, [t4+wq] %else mov t0, t3m mova m0, [t1+wq] mov t1, t2m paddw m2, m0, [t0+wq] mov t0, t4m mova m1, [t1+wq] mova m4, [t0+wq] %endif punpckhwd m3, m2, m1 pmaddwd m3, m14 punpcklwd m2, m1 pmaddwd m2, m14 punpckhwd m1, m0, m4 pmaddwd m1, m13 punpcklwd m0, m4 pmaddwd m0, m13 paddd m3, m9 paddd m2, m9 paddd m1, m3 paddd m0, m2 psrad m1, 6 psrad m0, 6 packssdw m0, m1 pmulhw m0, m10 pxor m1, m1 pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 %if ARCH_X86_64 jl .v_loop %else jge .v_end mov t1, t1m jmp .v_loop .v_end: %endif ret %macro GATHERDD 3 ; dst, src, tmp movd %3d, %2 %if ARCH_X86_64 movd %1, [r13+%3] pextrw %3d, %2, 2 pinsrw %1, [r13+%3+2], 3 pextrw %3d, %2, 4 pinsrw %1, [r13+%3+2], 5 pextrw %3d, %2, 6 pinsrw %1, [r13+%3+2], 7 %else movd %1, [base+sgr_x_by_x-0xf03+%3] pextrw %3, %2, 2 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 pextrw %3, %2, 4 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 pextrw %3, %2, 6 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 %endif %endmacro %macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore %if ARCH_X86_64 %define tmp r14 %else %define tmp %4 %endif GATHERDD %1, %2, tmp GATHERDD %2, %3, tmp movif32 %4, %5 psrld %1, 24 psrld %2, 24 packssdw %1, %2 %endmacro %macro MAXSD 3-4 0 ; dst, src, restore_tmp pcmpgtd %3, %1, %2 pand %1, %3 pandn %3, %2 por %1, %3 %if %4 == 1 pxor %3, %3 %endif %endmacro %macro MULLD 3 ; dst, src, tmp pmulhuw %3, %1, %2 pmullw %1, %2 pslld %3, 16 paddd %1, %3 %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 0, 1, 2, 3, 5 %if STACK_ALIGNMENT < 16 %assign extra_stack 5*16 %else %assign extra_stack 3*16 %endif cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*0+4*6] %define stridemp dword [esp+calloff+16*0+4*7] %define leftm dword [esp+calloff+16*3+4*0] %define lpfm dword [esp+calloff+16*3+4*1] %define w0m dword [esp+calloff+16*3+4*2] %define hd dword [esp+calloff+16*3+4*3] %define edgeb byte [esp+calloff+16*3+4*4] %define edged dword [esp+calloff+16*3+4*4] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t0m dword [esp+calloff+4*2] %define t2m dword [esp+calloff+4*3] %define t3m dword [esp+calloff+4*4] %define t4m dword 
[esp+calloff+4*5] %define m8 [base+pd_8] %define m9 [base+pd_0xfffffff0] %define m10 [esp+calloff+16*2] %define m11 [base+pd_0xf00800a4] %define m12 [base+sgr_lshuf5] %define m13 [base+pd_34816] %define m14 [base+pw_1023] %define r10 r4 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] movifnidn hd, hm add wd, wd mov edged, r7m movu m10, [paramsq] mova m12, [sgr_lshuf5] add lpfq, wq mova m8, [pd_8] lea t1, [rsp+wq+20] mova m9, [pd_0xfffffff0] add dstq, wq lea t3, [rsp+wq*2+400*12+16] mova m11, [pd_0xf00800a4] lea t4, [rsp+wq+400*20+16] pshufhw m7, m10, q0000 pshufb m10, [pw_256] ; s0 punpckhqdq m7, m7 ; w0 neg wq mova m13, [pd_34816] ; (1 << 11) + (1 << 15) pxor m6, m6 mova m14, [pw_1023] psllw m7, 4 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ add wd, wd movu m1, [r1] add lpfm, wq lea t1, [rsp+extra_stack+wq+20] add dstq, wq lea t3, [rsp+extra_stack+wq*2+400*12+16] mov dstm, dstq lea t4, [rsp+extra_stack+wq+400*20+16] mov t3m, t3 pshufhw m7, m1, q0000 mov t4m, t4 pshufb m1, [base+pw_256] ; s0 punpckhqdq m7, m7 ; w0 psllw m7, 4 neg wq mova m10, m1 pxor m6, m6 mov w1m, wd sub wd, 4 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp movif32 t2m, t1 mov t2, t1 call .top_fixup add t1, 400*6 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov lpfm, r10 ; below movif32 t0m, t2 mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: add lpfq, stridemp movif32 t4, t4m call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp %if ARCH_X86_64 test hb, hb %else mov r4, hd test r4, r4 %endif jz .odd_height call .h add lpfq, stridemp call .hv movif32 dstq, dstm call .n0 call .n1 sub hd, 2 movif32 t0, t0m jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm call .h_top add lpfq, stridemp call .hv_bottom .end: movif32 dstq, dstm call .n0 call .n1 .end2: RET .height1: movif32 t4, t4m call .hv call .prep_n jmp .odd_height_end .odd_height: call .hv movif32 dstq, dstm call .n0 call .n1 .odd_height_end: call .v movif32 dstq, dstm call .n0 jmp .end2 .extend_bottom: call .v jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h lea t2, [t1+400*6] movif32 t2m, t2 call .top_fixup dec hd jz .no_top_height1 or edged, 16 mov t0, t1 mov t1, t2 movif32 t0m, t0 jmp .main .no_top_height1: movif32 t3, t3m movif32 t4, t4m call .v call .prep_n jmp .odd_height_end .extend_right: movd m0, wd movd m1, [lpfq-2] mova m2, [base+pw_256] mova m3, [base+pb_m14_m13] pshufb m0, m6 pshufb m1, m2 psubb m2, m0 psubb m3, m0 mova m0, [base+pb_0to15] pcmpgtb m2, m0 pcmpgtb m3, m0 pand m4, m2 pand m5, m3 pandn m2, m1 pandn m3, m1 por m4, m2 por m5, m3 ret %assign stack_offset stack_offset+4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 lea 
wq, [r4-4] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .h_main .h_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m4, [lpfq+wq- 2] .h_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -20 jl .h_have_right call .extend_right .h_have_right: palignr m2, m5, m4, 2 paddw m0, m4, m2 palignr m3, m5, m4, 6 paddw m0, m3 punpcklwd m1, m2, m3 pmaddwd m1, m1 punpckhwd m2, m3 pmaddwd m2, m2 palignr m5, m4, 8 paddw m0, m5 punpcklwd m3, m4, m5 pmaddwd m3, m3 paddd m1, m3 punpckhwd m3, m4, m5 pmaddwd m3, m3 shufps m4, m5, q2121 paddw m0, m4 ; sum punpcklwd m5, m4, m6 pmaddwd m5, m5 punpckhwd m4, m6 pmaddwd m4, m4 paddd m2, m3 test edgeb, 16 ; y > 0 jz .h_loop_end paddw m0, [t1+wq+400*0] paddd m1, [t1+wq+400*2] paddd m2, [t1+wq+400*4] .h_loop_end: paddd m1, m5 ; sumsq paddd m2, m4 mova [t1+wq+400*0], m0 mova [t1+wq+400*2], m1 mova [t1+wq+400*4], m2 add wq, 16 jl .h_loop ret .top_fixup: %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .top_fixup_loop: ; the sums of the first row needs to be doubled mova m0, [t1+wq+400*0] mova m1, [t1+wq+400*2] mova m2, [t1+wq+400*4] paddw m0, m0 paddd m1, m1 paddd m2, m2 mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m1 mova [t2+wq+400*4], m2 add wq, 16 jl .top_fixup_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .hv_main .hv_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .hv_main .hv_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv_loop_start %endif .hv_loop: movif32 lpfq, hvsrcm .hv_loop_start: movu m4, [lpfq+wq- 2] .hv_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp wd, -20 jl .hv_have_right call .extend_right .hv_have_right: movif32 t3, hd palignr m3, m5, m4, 2 paddw m0, m4, m3 palignr m1, m5, m4, 6 paddw m0, m1 punpcklwd m2, m3, m1 pmaddwd m2, m2 punpckhwd m3, m1 pmaddwd m3, m3 palignr m5, m4, 8 paddw m0, m5 punpcklwd m1, m4, m5 pmaddwd m1, m1 paddd m2, m1 punpckhwd m1, m4, m5 pmaddwd m1, m1 shufps m4, m5, q2121 paddw m0, m4 ; h sum punpcklwd m5, m4, m6 pmaddwd m5, m5 punpckhwd m4, m6 pmaddwd m4, m4 paddd m3, m1 paddd m2, m5 ; h sumsq paddd m3, m4 paddw m1, m0, [t1+wq+400*0] paddd m4, m2, [t1+wq+400*2] paddd m5, m3, [t1+wq+400*4] %if ARCH_X86_64 test hd, hd %else test t3, t3 %endif jz .hv_last_row .hv_main2: paddw m1, [t2+wq+400*0] ; hv sum paddd m4, [t2+wq+400*2] ; hv sumsq paddd m5, [t2+wq+400*4] mova [t0+wq+400*0], m0 mova [t0+wq+400*2], m2 mova [t0+wq+400*4], m3 psrlw m3, m1, 1 paddd m4, m8 pavgw m3, m6 ; (b + 2) >> 2 paddd m5, m8 pand m4, m9 ; ((a + 8) >> 4) << 4 pand m5, m9 psrld m2, m4, 4 psrld m0, m5, 4 paddd m2, m4 psrld m4, 1 paddd m0, m5 psrld m5, 1 paddd m4, m2 ; a * 25 paddd m5, m0 punpcklwd m2, m3, m6 punpckhwd m3, m6 pmaddwd m2, m2 ; b * b pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m6 MAXSD m5, m3, m6, 1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m10, m2 ; p * s MULLD m5, m10, m2 pmaddwd 
m0, m11 ; b * 164 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, t2, t2m punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 mova [t4+wq+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*2+ 8], m0 mova [t3+wq*2+24], m1 add wq, 16 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 movif32 t2m, t2 movif32 t0m, t0 ret .hv_last_row: ; esoteric edge case for odd heights mova [t1+wq+400*0], m1 paddw m1, m0 mova [t1+wq+400*2], m4 paddd m4, m2 mova [t1+wq+400*4], m5 paddd m5, m3 jmp .hv_main2 .v: ; vertical boxsum + ab %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .v_loop: mova m0, [t1+wq+400*0] mova m2, [t1+wq+400*2] mova m3, [t1+wq+400*4] paddw m1, m0, [t2+wq+400*0] paddd m4, m2, [t2+wq+400*2] paddd m5, m3, [t2+wq+400*4] paddw m0, m0 paddd m2, m2 paddd m3, m3 paddw m1, m0 ; hv sum paddd m4, m2 ; hv sumsq paddd m5, m3 psrlw m3, m1, 1 paddd m4, m8 pavgw m3, m6 ; (b + 2) >> 2 paddd m5, m8 pand m4, m9 ; ((a + 8) >> 4) << 4 pand m5, m9 psrld m2, m4, 4 psrld m0, m5, 4 paddd m2, m4 psrld m4, 1 paddd m0, m5 psrld m5, 1 paddd m4, m2 ; a * 25 paddd m5, m0 punpcklwd m2, m3, m6 punpckhwd m3, m6 pmaddwd m2, m2 ; b * b pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m6 MAXSD m5, m3, m6, 1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m10, m2 ; p * s MULLD m5, m10, m2 pmaddwd m0, m11 ; b * 164 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, t2, t2m punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 mova [t4+wq+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*2+ 8], m0 mova [t3+wq*2+24], m1 add wq, 16 jl .v_loop ret .prep_n: ; initial neighbor setup movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*1+ 2] movu m3, [t4+wq*1+ 4] movu m1, [t3+wq*2+ 4] movu m4, [t3+wq*2+ 8] movu m2, [t3+wq*2+20] movu m5, [t3+wq*2+24] paddw m3, m0 paddd m4, m1 paddd m5, m2 paddw m3, [t4+wq*1+ 0] paddd m4, [t3+wq*2+ 0] paddd m5, [t3+wq*2+16] paddw m0, m3 psllw m3, 2 paddd m1, m4 pslld m4, 2 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a 565 paddd m1, m4 ; b 565 paddd m2, m5 mova [t4+wq*1+400*2+ 0], m0 mova [t3+wq*2+400*4+ 0], m1 mova [t3+wq*2+400*4+16], m2 add wq, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m0, [t4+wq*1+ 2] movu m3, [t4+wq*1+ 4] movu m1, [t3+wq*2+ 4] movu m4, [t3+wq*2+ 8] movu m2, [t3+wq*2+20] movu m5, [t3+wq*2+24] paddw m3, m0 paddd m4, m1 paddd m5, m2 paddw m3, [t4+wq*1+ 0] paddd m4, [t3+wq*2+ 0] paddd m5, [t3+wq*2+16] paddw m0, m3 psllw m3, 2 paddd m1, m4 pslld m4, 2 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a 565 paddd m1, m4 ; b 565 paddd m2, m5 paddw m3, m0, [t4+wq*1+400*2+ 0] paddd m4, m1, [t3+wq*2+400*4+ 0] paddd m5, m2, [t3+wq*2+400*4+16] mova [t4+wq*1+400*2+ 0], m0 mova [t3+wq*2+400*4+ 0], m1 mova [t3+wq*2+400*4+16], m2 mova m0, [dstq+wq] punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+wq], m0 add wq, 16 jl .n0_loop add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) movif64 wq, r4 movif32 wd, w1m .n1_loop: mova 
m0, [dstq+wq] mova m3, [t4+wq*1+400*2+ 0] mova m4, [t3+wq*2+400*4+ 0] mova m5, [t3+wq*2+400*4+16] punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 7) psubd m5, m3 psrad m4, 8 psrad m5, 8 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+wq], m0 add wq, 16 jl .n1_loop add dstq, stridemp movif32 dstm, dstq ret %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %assign extra_stack 4*16 %else %assign extra_stack 2*16 %endif cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*2+4*0] %define stridemp dword [esp+calloff+16*2+4*1] %define leftm dword [esp+calloff+16*2+4*2] %define lpfm dword [esp+calloff+16*2+4*3] %define w0m dword [esp+calloff+16*2+4*4] %define hd dword [esp+calloff+16*2+4*5] %define edgeb byte [esp+calloff+16*2+4*6] %define edged dword [esp+calloff+16*2+4*6] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t3m dword [esp+calloff+4*2] %define t4m dword [esp+calloff+4*3] %define m8 [base+pd_8] %define m9 [esp+calloff+16*1] %define m10 [base+pd_0xf00801c7] %define m11 [base+pd_34816] %define m12 [base+sgr_lshuf3] %define m13 [base+pw_1023] %define m14 m6 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] movifnidn hd, hm add wd, wd mov edged, r7m movq m9, [paramsq+4] add lpfq, wq lea t1, [rsp+wq+12] mova m8, [pd_8] add dstq, wq lea t3, [rsp+wq*2+400*12+8] mova m10, [pd_0xf00801c7] lea t4, [rsp+wq+400*32+8] mova m11, [pd_34816] pshuflw m7, m9, q3333 pshufb m9, [pw_256] ; s1 punpcklqdq m7, m7 ; w1 neg wq pxor m6, m6 mova m13, [pw_1023] psllw m7, 4 mova m12, [sgr_lshuf3] DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ add wd, wd movq m1, [r1+4] add lpfm, wq lea t1, [rsp+extra_stack+wq+20] add dstq, wq lea t3, [rsp+extra_stack+wq*2+400*12+16] mov dstm, dstq lea t4, [rsp+extra_stack+wq+400*32+16] mov t3m, t3 pshuflw m7, m1, q3333 mov t4m, t4 pshufb m1, [base+pw_256] ; s1 punpcklqdq m7, m7 ; w1 psllw m7, 4 neg wq mova m9, m1 pxor m6, m6 mov w1m, wd sub wd, 4 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp mov t2, t1 add t1, 400*6 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 .main: dec hd jz .height1 movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hb, hb %else mov r4, hd test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test 
edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm call .hv0_bottom movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 lea wq, [r4-4] %else mov wq, w0m mov hvsrcm, lpfq %endif lea t2, [t1+400*6] .top_fixup_loop: mova m0, [t1+wq+400*0] mova m1, [t1+wq+400*2] mova m2, [t1+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m1 mova [t2+wq+400*4], m2 add wq, 16 jl .top_fixup_loop movif32 t3, t3m movif32 t4, t4m call .v0 jmp .main .extend_right: movd m1, wd movd m5, [lpfq-2] mova m2, [base+pw_256] mova m3, [base+pb_0to15] pshufb m1, m6 pshufb m5, m2 psubb m2, m1 pcmpgtb m2, m3 pand m4, m2 pandn m2, m5 por m4, m2 ret %assign stack_offset stack_offset+4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 lea wq, [r4-4] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 12 jmp .h_main .h_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m4, [lpfq+wq+ 0] .h_main: movu m5, [lpfq+wq+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -18 jl .h_have_right call .extend_right .h_have_right: palignr m0, m5, m4, 2 paddw m1, m4, m0 punpcklwd m2, m4, m0 pmaddwd m2, m2 punpckhwd m3, m4, m0 pmaddwd m3, m3 palignr m5, m4, 4 paddw m1, m5 ; sum punpcklwd m4, m5, m6 pmaddwd m4, m4 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m4 ; sumsq paddd m3, m5 mova [t1+wq+400*0], m1 mova [t1+wq+400*2], m2 mova [t1+wq+400*4], m3 add wq, 16 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 12 jmp .hv0_main .hv0_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv0_loop_start %endif .hv0_loop: movif32 lpfq, hvsrcm .hv0_loop_start: movu m4, [lpfq+wq+ 0] .hv0_main: movu m5, [lpfq+wq+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp wd, -18 jl .hv0_have_right call .extend_right .hv0_have_right: palignr m0, m5, m4, 2 paddw m1, m4, m0 punpcklwd m2, m4, m0 pmaddwd m2, m2 punpckhwd m3, m4, m0 pmaddwd m3, m3 palignr m5, m4, 4 paddw m1, m5 ; sum punpcklwd m4, m5, m6 pmaddwd m4, m4 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m4 ; sumsq paddd m3, m5 paddw m0, m1, [t1+wq+400*0] paddd m4, m2, [t1+wq+400*2] paddd m5, m3, [t1+wq+400*4] mova [t1+wq+400*0], m1 mova [t1+wq+400*2], m2 mova [t1+wq+400*4], m3 paddw m1, m0, [t2+wq+400*0] paddd m2, m4, [t2+wq+400*2] paddd m3, m5, [t2+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m4 mova [t2+wq+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 
2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m14 MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m14 ; p * s MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m14 MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq+4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+ 8], m0 mova [t3+wq*2+24], m1 add wq, 16 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 12 jmp .hv1_main .hv1_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv1_loop_start %endif .hv1_loop: movif32 lpfq, hvsrcm .hv1_loop_start: movu m4, [lpfq+wq+ 0] .hv1_main: movu m5, [lpfq+wq+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp wd, -18 jl .hv1_have_right call .extend_right .hv1_have_right: palignr m1, m5, m4, 2 paddw m0, m4, m1 punpcklwd m2, m4, m1 pmaddwd m2, m2 punpckhwd m3, m4, m1 pmaddwd m3, m3 palignr m5, m4, 4 paddw m0, m5 ; h sum punpcklwd m1, m5, m6 pmaddwd m1, m1 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m1 ; h sumsq paddd m3, m5 paddw m1, m0, [t2+wq+400*0] paddd m4, m2, [t2+wq+400*2] paddd m5, m3, [t2+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m2 mova [t2+wq+400*4], m3 paddd m4, m8 paddd m5, m8 psrld m4, 4 ; (a + 8) >> 4 psrld m5, 4 pslld m2, m4, 3 pslld m3, m5, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m14 MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m14 ; p * s MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m14 MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*1+400*2 +4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 mova [t3+wq*2+400*4+24], m1 add wq, 16 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab (even rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .v0_loop: mova m0, [t1+wq+400*0] mova m4, [t1+wq+400*2] mova m5, [t1+wq+400*4] paddw m0, m0 paddd m4, m4 paddd m5, m5 paddw m1, m0, [t2+wq+400*0] paddd m2, m4, [t2+wq+400*2] paddd m3, m5, [t2+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m4 mova [t2+wq+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m14 MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, 
m3 MULLD m4, m9, m14 ; p * s MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m14 MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*1+400*0+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*0+ 8], m0 mova [t3+wq*2+400*0+24], m1 add wq, 16 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .v1_loop: mova m0, [t1+wq+400*0] mova m4, [t1+wq+400*2] mova m5, [t1+wq+400*4] paddw m1, m0, [t2+wq+400*0] paddd m2, m4, [t2+wq+400*2] paddd m3, m5, [t2+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m4 mova [t2+wq+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m14 MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m14 ; p * s MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m14 MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*1+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 mova [t3+wq*2+400*4+24], m1 add wq, 16 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*1+400*0+ 4] movu m1, [t3+wq*2+400*0+ 8] movu m2, [t3+wq*2+400*0+24] movu m3, [t4+wq*1+400*0+ 2] movu m4, [t3+wq*2+400*0+ 4] movu m5, [t3+wq*2+400*0+20] paddw m0, [t4+wq*1+400*0+ 0] paddd m1, [t3+wq*2+400*0+ 0] paddd m2, [t3+wq*2+400*0+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a[-1] 444 pslld m4, 2 ; b[-1] 444 pslld m5, 2 psubw m3, m0 ; a[-1] 343 psubd m4, m1 ; b[-1] 343 psubd m5, m2 mova [t4+wq*1+400*4], m3 mova [t3+wq*2+400*8+ 0], m4 mova [t3+wq*2+400*8+16], m5 movu m0, [t4+wq*1+400*2+ 4] movu m1, [t3+wq*2+400*4+ 8] movu m2, [t3+wq*2+400*4+24] movu m3, [t4+wq*1+400*2+ 2] movu m4, [t3+wq*2+400*4+ 4] movu m5, [t3+wq*2+400*4+20] paddw m0, [t4+wq*1+400*2+ 0] paddd m1, [t3+wq*2+400*4+ 0] paddd m2, [t3+wq*2+400*4+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a[ 0] 444 pslld m4, 2 ; b[ 0] 444 pslld m5, 2 mova [t4+wq*1+400* 6], m3 mova [t3+wq*2+400*12+ 0], m4 mova [t3+wq*2+400*12+16], m5 psubw m3, m0 ; a[ 0] 343 psubd m4, m1 ; b[ 0] 343 psubd m5, m2 mova [t4+wq*1+400* 8], m3 mova [t3+wq*2+400*16+ 0], m4 mova [t3+wq*2+400*16+16], m5 add wq, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m3, [t4+wq*1+400*0+4] movu m1, [t4+wq*1+400*0+2] paddw m3, [t4+wq*1+400*0+0] paddw m1, m3 psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+wq*1+400*4] paddw m3, [t4+wq*1+400*6] mova [t4+wq*1+400*4], m2 mova [t4+wq*1+400*6], m1 movu m4, [t3+wq*2+400*0+8] movu m1, [t3+wq*2+400*0+4] paddd m4, [t3+wq*2+400*0+0] paddd m1, m4 pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m4 ; b[ 1] 343 paddd m4, m2, [t3+wq*2+400* 8+ 0] paddd m4, [t3+wq*2+400*12+ 0] mova [t3+wq*2+400* 8+ 0], m2 mova 
[t3+wq*2+400*12+ 0], m1 movu m5, [t3+wq*2+400*0+24] movu m1, [t3+wq*2+400*0+20] paddd m5, [t3+wq*2+400*0+16] paddd m1, m5 pslld m1, 2 psubd m2, m1, m5 paddd m5, m2, [t3+wq*2+400* 8+16] paddd m5, [t3+wq*2+400*12+16] mova [t3+wq*2+400* 8+16], m2 mova [t3+wq*2+400*12+16], m1 mova m0, [dstq+wq] punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+wq], m0 add wq, 16 jl .n0_loop add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*1+400*2+4] movu m1, [t4+wq*1+400*2+2] paddw m3, [t4+wq*1+400*2+0] paddw m1, m3 psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+wq*1+400*6] paddw m3, [t4+wq*1+400*8] mova [t4+wq*1+400*6], m1 mova [t4+wq*1+400*8], m2 movu m4, [t3+wq*2+400*4+8] movu m1, [t3+wq*2+400*4+4] paddd m4, [t3+wq*2+400*4+0] paddd m1, m4 pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m4 ; b[ 1] 343 paddd m4, m2, [t3+wq*2+400*12+ 0] paddd m4, [t3+wq*2+400*16+ 0] mova [t3+wq*2+400*12+ 0], m1 mova [t3+wq*2+400*16+ 0], m2 movu m5, [t3+wq*2+400*4+24] movu m1, [t3+wq*2+400*4+20] paddd m5, [t3+wq*2+400*4+16] paddd m1, m5 pslld m1, 2 psubd m2, m1, m5 paddd m5, m2, [t3+wq*2+400*12+16] paddd m5, [t3+wq*2+400*16+16] mova [t3+wq*2+400*12+16], m1 mova [t3+wq*2+400*16+16], m2 mova m0, [dstq+wq] punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+wq], m0 add wq, 16 jl .n1_loop add dstq, stridemp movif32 dstm, dstq ret %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %assign extra_stack 10*16 %else %assign extra_stack 8*16 %endif cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*8+4*0] %define stridemp dword [esp+calloff+16*8+4*1] %define leftm dword [esp+calloff+16*8+4*2] %define lpfm dword [esp+calloff+16*8+4*3] %define w0m dword [esp+calloff+16*8+4*4] %define hd dword [esp+calloff+16*8+4*5] %define edgeb byte [esp+calloff+16*8+4*6] %define edged dword [esp+calloff+16*8+4*6] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t3m dword [esp+calloff+4*2] %define t4m dword [esp+calloff+4*3] %xdefine m8 m6 %define m9 [base+pd_8] %define m10 [base+pd_34816] %define m11 [base+pd_0xf00801c7] %define m12 [base+pd_0xf00800a4] %define m13 [esp+calloff+16*4] %define m14 [esp+calloff+16*5] %define m15 [esp+calloff+16*6] %define m6 [esp+calloff+16*7] %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] movifnidn 
hd, hm add wd, wd mov edged, r7m mova m14, [paramsq] add lpfq, wq mova m9, [pd_8] lea t1, [rsp+wq+44] mova m10, [pd_34816] add dstq, wq mova m11, [pd_0xf00801c7] lea t3, [rsp+wq*2+400*24+40] mova m12, [pd_0xf00800a4] lea t4, [rsp+wq+400*52+40] neg wq pshufd m15, m14, q2222 ; w0 w1 punpcklwd m14, m14 pshufd m13, m14, q0000 ; s0 pshufd m14, m14, q2222 ; s1 pxor m6, m6 psllw m15, 2 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ add wd, wd mova m2, [r1] add lpfm, wq lea t1, [rsp+extra_stack+wq+52] add dstq, wq lea t3, [rsp+extra_stack+wq*2+400*24+48] mov dstm, dstq lea t4, [rsp+extra_stack+wq+400*52+48] mov t3m, t3 mov t4m, t4 neg wq pshuflw m0, m2, q0000 pshuflw m1, m2, q2222 pshufhw m2, m2, q1010 punpcklqdq m0, m0 ; s0 punpcklqdq m1, m1 ; s1 punpckhqdq m2, m2 ; w0 w1 mov w1m, wd pxor m3, m3 psllw m2, 2 mova m13, m0 mova m14, m1 sub wd, 4 mova m15, m2 mova m6, m3 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp mov t2, t1 %if ARCH_X86_64 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup %else mov wq, w0m call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop %endif add t1, 400*12 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 .main: dec hd jz .height1 movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hd, hd %else mov r4, hd test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm call .hv0_bottom movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 lea wq, [r4-4] %else mov wq, w0m mov hvsrcm, lpfq %endif lea t2, [t1+400*12] .top_fixup_loop: mova m0, [t1+wq+400* 0] mova m1, [t1+wq+400* 2] mova m2, [t1+wq+400* 4] paddw m0, m0 mova m3, [t1+wq+400* 6] paddd m1, m1 mova m4, [t1+wq+400* 8] paddd m2, m2 mova m5, [t1+wq+400*10] mova [t2+wq+400* 0], m0 mova [t2+wq+400* 2], m1 mova [t2+wq+400* 4], m2 mova [t2+wq+400* 6], m3 mova [t2+wq+400* 8], m4 mova [t2+wq+400*10], m5 add wq, 16 jl .top_fixup_loop movif32 t3, t3m movif32 t4, t4m call .v0 jmp .main .h: ; horizontal boxsum %assign stack_offset stack_offset+4 %assign calloff 4 %if ARCH_X86_64 lea wq, [r4-4] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .h_main .h_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, [base+sgr_lshuf5] jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m4, [lpfq+wq- 2] .h_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -20 jl .h_have_right %if ARCH_X86_32 pxor m8, m8 %endif call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right 
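; sgr_filter_mix .h computes both box sums in a single pass: the 3x3 sums
; (sum3/sumsq3) go to t1+400*{6,8,10} and are then folded into the 5x5
; sums (sum5/sumsq5) stored at t1+400*{0,2,4}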
.h_have_right: palignr m3, m5, m4, 2 palignr m0, m5, m4, 4 paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m5, m4, 6 paddw m1, m0 ; sum3 punpcklwd m7, m0, m6 pmaddwd m7, m7 punpckhwd m0, m6 pmaddwd m0, m0 paddd m2, m7 ; sumsq3 palignr m5, m4, 8 punpcklwd m7, m5, m4 paddw m8, m4, m5 pmaddwd m7, m7 punpckhwd m5, m4 pmaddwd m5, m5 paddd m3, m0 mova [t1+wq+400* 6], m1 mova [t1+wq+400* 8], m2 mova [t1+wq+400*10], m3 paddw m8, m1 ; sum5 paddd m7, m2 ; sumsq5 paddd m5, m3 mova [t1+wq+400* 0], m8 mova [t1+wq+400* 2], m7 mova [t1+wq+400* 4], m5 add wq, 16 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .hv0_main .hv0_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, [base+sgr_lshuf5] jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv0_loop_start %endif .hv0_loop: movif32 lpfq, hvsrcm .hv0_loop_start: movu m4, [lpfq+wq- 2] .hv0_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp wd, -20 jl .hv0_have_right %if ARCH_X86_32 pxor m8, m8 %endif call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right .hv0_have_right: palignr m3, m5, m4, 2 palignr m0, m5, m4, 4 movif32 t3, t3m paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m5, m4, 6 paddw m1, m0 ; h sum3 punpcklwd m7, m0, m6 pmaddwd m7, m7 punpckhwd m0, m6 pmaddwd m0, m0 paddd m2, m7 ; h sumsq3 palignr m5, m4, 8 punpcklwd m7, m5, m4 paddw m8, m4, m5 pmaddwd m7, m7 punpckhwd m5, m4 pmaddwd m5, m5 paddd m3, m0 paddw m8, m1 ; h sum5 paddd m7, m2 ; h sumsq5 paddd m5, m3 mova [t3+wq*2+400*8+ 8], m8 mova [t3+wq*2+400*0+ 8], m7 mova [t3+wq*2+400*0+24], m5 paddw m8, [t1+wq+400* 0] paddd m7, [t1+wq+400* 2] paddd m5, [t1+wq+400* 4] mova [t1+wq+400* 0], m8 mova [t1+wq+400* 2], m7 mova [t1+wq+400* 4], m5 paddw m0, m1, [t1+wq+400* 6] paddd m4, m2, [t1+wq+400* 8] paddd m5, m3, [t1+wq+400*10] mova [t1+wq+400* 6], m1 mova [t1+wq+400* 8], m2 mova [t1+wq+400*10], m3 paddw m1, m0, [t2+wq+400* 6] paddd m2, m4, [t2+wq+400* 8] paddd m3, m5, [t2+wq+400*10] mova [t2+wq+400* 6], m0 mova [t2+wq+400* 8], m4 mova [t2+wq+400*10], m5 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m7 ; (b3 + 2) >> 2 punpcklwd m2, m3, m7 pmaddwd m2, m2 punpckhwd m3, m7 pmaddwd m3, m3 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 %if ARCH_X86_64 SWAP m7, m6 %endif MAXSD m4, m2, m7 MAXSD m5, m3, m7 psubd m4, m2 ; p3 psubd m5, m3 MULLD m4, m14, m7 ; p3 * s1 MULLD m5, m14, m7 pmaddwd m0, m11 ; b3 * 455 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z3, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*1+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 mova [t3+wq*2+400*4+24], m1 add wq, 16 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if 
ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .hv1_main .hv1_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, [base+sgr_lshuf5] jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv1_loop_start %endif .hv1_loop: movif32 lpfq, hvsrcm .hv1_loop_start: movu m4, [lpfq+wq- 2] .hv1_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp wd, -20 jl .hv1_have_right %if ARCH_X86_32 pxor m8, m8 %endif call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right .hv1_have_right: palignr m7, m5, m4, 2 palignr m3, m5, m4, 4 paddw m2, m7, m3 punpcklwd m0, m7, m3 pmaddwd m0, m0 punpckhwd m7, m3 pmaddwd m7, m7 palignr m3, m5, m4, 6 paddw m2, m3 ; h sum3 punpcklwd m1, m3, m6 pmaddwd m1, m1 punpckhwd m3, m6 pmaddwd m3, m3 paddd m0, m1 ; h sumsq3 palignr m5, m4, 8 punpckhwd m1, m4, m5 paddw m8, m4, m5 pmaddwd m1, m1 punpcklwd m4, m5 pmaddwd m4, m4 paddd m7, m3 paddw m5, m2, [t2+wq+400* 6] mova [t2+wq+400* 6], m2 paddw m8, m2 ; h sum5 paddd m2, m0, [t2+wq+400* 8] paddd m3, m7, [t2+wq+400*10] mova [t2+wq+400* 8], m0 mova [t2+wq+400*10], m7 paddd m4, m0 ; h sumsq5 paddd m1, m7 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 pslld m0, m2, 3 pslld m7, m3, 3 paddd m2, m0 ; ((a3 + 8) >> 4) * 9 paddd m3, m7 psrlw m7, m5, 1 pavgw m7, m6 ; (b3 + 2) >> 2 punpcklwd m0, m7, m6 pmaddwd m0, m0 punpckhwd m7, m6 pmaddwd m7, m7 %if ARCH_X86_32 mova [esp+20], m8 %else SWAP m8, m6 %endif MAXSD m2, m0, m8 MAXSD m3, m7, m8 pxor m8, m8 psubd m2, m0 ; p3 psubd m3, m7 punpcklwd m0, m5, m8 ; b3 punpckhwd m5, m8 MULLD m2, m14, m8 ; p3 * s1 MULLD m3, m14, m8 pmaddwd m0, m11 ; b3 * 455 pmaddwd m5, m11 paddusw m2, m11 paddusw m3, m11 psrld m2, 20 ; min(z3, 255) movif32 t3, t3m psrld m3, 20 GATHER_X_BY_X m8, m2, m3, r0, dstm punpcklwd m2, m8, m8 punpckhwd m3, m8, m8 MULLD m0, m2, m7 MULLD m5, m3, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m5, m10 psrld m0, 12 psrld m5, 12 mova [t4+wq*1+400*4+4], m8 mova [t3+wq*2+400*8+ 8], m0 mova [t3+wq*2+400*8+24], m5 %if ARCH_X86_32 mova m8, [esp+20] %else SWAP m6, m8 pxor m6, m6 %endif paddw m5, m8, [t2+wq+400*0] paddd m2, m4, [t2+wq+400*2] paddd m3, m1, [t2+wq+400*4] paddw m5, [t1+wq+400*0] paddd m2, [t1+wq+400*2] paddd m3, [t1+wq+400*4] mova [t2+wq+400*0], m8 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a5 + 8) >> 4 psrld m3, 4 mova [t2+wq+400*2], m4 pslld m8, m2, 4 mova [t2+wq+400*4], m1 pslld m4, m3, 4 paddd m8, m2 pslld m2, 3 paddd m4, m3 pslld m3, 3 paddd m2, m8 ; ((a5 + 8) >> 4) * 25 paddd m3, m4 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif psrlw m1, m5, 1 pavgw m1, m7 ; (b5 + 2) >> 2 punpcklwd m4, m1, m7 pmaddwd m4, m4 punpckhwd m1, m7 pmaddwd m1, m1 punpcklwd m0, m5, m7 ; b5 punpckhwd m5, m7 %if ARCH_X86_64 SWAP m7, m6 %endif MAXSD m2, m4, m7 psubd m2, m4 ; p5 MAXSD m3, m1, m7 psubd m3, m1 MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 pmaddwd m0, m12 ; b5 * 164 pmaddwd m5, m12 paddusw m2, m12 paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m1, m2, m3, r0, dstm punpcklwd m2, m1, m1 punpckhwd m3, m1, m1 MULLD m0, m2, m7 MULLD m5, m3, m7 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m5, m10 mova [t4+wq*1+400*0+ 4], m1 psrld m0, 12 psrld m5, 12 mova [t3+wq*2+400*0+ 8], m0 mova 
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea          wq, [r4-4]
%else
    mov          wd, w0m
%endif
.v0_loop:
    mova         m0, [t1+wq+400* 6]
    mova         m4, [t1+wq+400* 8]
    mova         m5, [t1+wq+400*10]
    paddw        m0, m0
    paddd        m4, m4
    paddd        m5, m5
    paddw        m1, m0, [t2+wq+400* 6]
    paddd        m2, m4, [t2+wq+400* 8]
    paddd        m3, m5, [t2+wq+400*10]
    mova [t2+wq+400* 6], m0
    mova [t2+wq+400* 8], m4
    mova [t2+wq+400*10], m5
    paddd        m2, m9
    paddd        m3, m9
    psrld        m2, 4 ; (a3 + 8) >> 4
    psrld        m3, 4
%if ARCH_X86_32
    pxor         m7, m7
%else
    SWAP         m7, m6
%endif
    pslld        m4, m2, 3
    pslld        m5, m3, 3
    paddd        m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd        m5, m3
    psrlw        m3, m1, 1
    pavgw        m3, m7 ; (b3 + 2) >> 2
    punpcklwd    m2, m3, m7
    pmaddwd      m2, m2
    punpckhwd    m3, m7
    pmaddwd      m3, m3
    punpcklwd    m0, m1, m7 ; b3
    punpckhwd    m1, m7
%if ARCH_X86_64
    SWAP         m7, m6
%endif
    MAXSD        m4, m2, m7
    MAXSD        m5, m3, m7
    psubd        m4, m2 ; p3
    psubd        m5, m3
    MULLD        m4, m14, m7 ; p3 * s1
    MULLD        m5, m14, m7
    pmaddwd      m0, m11 ; b3 * 455
    pmaddwd      m1, m11
    paddusw      m4, m11
    paddusw      m5, m11
    psrld        m4, 20 ; min(z3, 255)
    psrld        m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm
    punpcklwd    m4, m3, m3
    punpckhwd    m5, m3, m3
    MULLD        m0, m4, m7
    MULLD        m1, m5, m7
    paddd        m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd        m1, m10
    mova [t4+wq*1+400*2+4], m3
    psrld        m0, 12
    psrld        m1, 12
    mova         m3, [t1+wq+400*0]
    mova         m4, [t1+wq+400*2]
    mova         m5, [t1+wq+400*4]
    mova [t3+wq*2+400*8+ 8], m3
    mova [t3+wq*2+400*0+ 8], m4
    mova [t3+wq*2+400*0+24], m5
    paddw        m3, m3 ; cc5
    paddd        m4, m4
    paddd        m5, m5
    mova [t1+wq+400*0], m3
    mova [t1+wq+400*2], m4
    mova [t1+wq+400*4], m5
    mova [t3+wq*2+400*4+ 8], m0
    mova [t3+wq*2+400*4+24], m1
    add          wq, 16
    jl .v0_loop
    ret
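; .v1 below is the odd-row counterpart of .v0: it completes the 3x3 (x, b)
; pair for the current row and, using the row sums that .v0/.hv0 saved in t3,
; also derives the 5x5 pair before swapping the t1/t2 ring-buffer pointers.
; Editorial summary; the arithmetic mirrors the second half of .hv1 above.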
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea          wq, [r4-4]
%else
    mov          wd, w0m
%endif
.v1_loop:
    mova         m4, [t1+wq+400* 6]
    mova         m5, [t1+wq+400* 8]
    mova         m7, [t1+wq+400*10]
    paddw        m1, m4, [t2+wq+400* 6]
    paddd        m2, m5, [t2+wq+400* 8]
    paddd        m3, m7, [t2+wq+400*10]
    mova [t2+wq+400* 6], m4
    mova [t2+wq+400* 8], m5
    mova [t2+wq+400*10], m7
    paddd        m2, m9
    paddd        m3, m9
    psrld        m2, 4 ; (a3 + 8) >> 4
    psrld        m3, 4
%if ARCH_X86_32
    pxor         m7, m7
%else
    SWAP         m7, m6
%endif
    pslld        m4, m2, 3
    pslld        m5, m3, 3
    paddd        m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd        m5, m3
    psrlw        m3, m1, 1
    pavgw        m3, m7 ; (b3 + 2) >> 2
    punpcklwd    m2, m3, m7
    pmaddwd      m2, m2
    punpckhwd    m3, m7
    pmaddwd      m3, m3
    punpcklwd    m0, m1, m7 ; b3
    punpckhwd    m1, m7
%if ARCH_X86_64
    SWAP         m7, m6
%endif
    MAXSD        m4, m2, m7
    MAXSD        m5, m3, m7
    psubd        m4, m2 ; p3
    psubd        m5, m3
    MULLD        m4, m14, m7 ; p3 * s1
    MULLD        m5, m14, m7
    pmaddwd      m0, m11 ; b3 * 455
    pmaddwd      m1, m11
    paddusw      m4, m11
    paddusw      m5, m11
    psrld        m4, 20 ; min(z3, 255)
    psrld        m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm
    punpcklwd    m4, m3, m3
    punpckhwd    m5, m3, m3
    MULLD        m0, m4, m7
    MULLD        m1, m5, m7
    paddd        m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd        m1, m10
    mova [t4+wq*1+400*4+4], m3
    psrld        m0, 12
    psrld        m8, m1, 12
    mova         m4, [t3+wq*2+400*8+ 8]
    mova         m5, [t3+wq*2+400*0+ 8]
    mova         m7, [t3+wq*2+400*0+24]
    paddw        m1, m4, [t2+wq+400*0]
    paddd        m2, m5, [t2+wq+400*2]
    paddd        m3, m7, [t2+wq+400*4]
    paddw        m1, [t1+wq+400*0]
    paddd        m2, [t1+wq+400*2]
    paddd        m3, [t1+wq+400*4]
    mova [t2+wq+400*0], m4
    mova [t2+wq+400*2], m5
    mova [t2+wq+400*4], m7
    paddd        m2, m9
    paddd        m3, m9
    psrld        m2, 4 ; (a5 + 8) >> 4
    psrld        m3, 4
    mova [t3+wq*2+400*8+ 8], m0
    pslld        m4, m2, 4
    mova [t3+wq*2+400*8+24], m8
    pslld        m5, m3, 4
    paddd        m4, m2
    pslld        m2, 3
    paddd        m5, m3
    pslld        m3, 3
    paddd        m2, m4
    paddd        m3, m5
%if ARCH_X86_32
    pxor         m7, m7
%else
    SWAP         m7, m6
%endif
    psrlw        m5, m1, 1
    pavgw        m5, m7 ; (b5 + 2) >> 2
    punpcklwd    m4, m5, m7
    pmaddwd      m4, m4
    punpckhwd    m5, m7
    pmaddwd      m5, m5
    punpcklwd    m0, m1, m7 ; b5
    punpckhwd    m1, m7
%if ARCH_X86_64
    SWAP         m7, m6
%endif
    MAXSD        m2, m4, m7
    psubd        m2, m4 ; p5
    MAXSD        m3, m5, m7
    psubd        m3, m5
    MULLD        m2, m13, m7 ; p5 * s0
    MULLD        m3, m13, m7
    pmaddwd      m0, m12 ; b5 * 164
    pmaddwd      m1, m12
    paddusw      m2, m12
    paddusw      m3, m12
    psrld        m2, 20 ; min(z5, 255)
    psrld        m3, 20
    GATHER_X_BY_X m4, m2, m3, r0, dstm
    punpcklwd    m2, m4, m4
    punpckhwd    m3, m4, m4
    MULLD        m0, m2, m7
    MULLD        m1, m3, m7
    paddd        m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd        m1, m10
    mova [t4+wq*1+400*0+ 4], m4
    psrld        m0, 12
    psrld        m1, 12
    mova [t3+wq*2+400*0+ 8], m0
    mova [t3+wq*2+400*0+24], m1
    add          wq, 16
    jl .v1_loop
    mov          r10, t2
    mov          t2, t1
    mov          t1, r10
    ret
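; The remaining passes turn the per-row (x, b) surfaces into output pixels.
; .prep_n primes the neighbor accumulators: the 5x5 pair gets the 5/6/5
; ("565") weighting and the 3x3 pair the alternating 343/444 weighting noted
; in the inline comments. .n0/.n1 then finish the neighbor sums for even and
; odd rows and apply the final filter; reading the instruction stream, the
; per-pixel math is roughly (editorial sketch, not part of the original
; source; w5/w3 are the weight pair packed into m15):
;   t5  = b5 - a5 * src
;   t3  = b3 - a3 * src
;   dst = clip(((src << 13) + 4096 + w5 * t5' + w3 * t3') >> 8) >> 5
; where t5'/t3' are the rescaled 16-bit halves packed together just before
; the pmaddwd against m15.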
.prep_n: ; initial neighbor setup
    movif64      wq, r4
    movif32      wd, w1m
.prep_n_loop:
    movu         m0, [t4+wq*1+400*0+ 2]
    movu         m1, [t3+wq*2+400*0+ 4]
    movu         m2, [t3+wq*2+400*0+20]
    movu         m7, [t4+wq*1+400*0+ 4]
    movu         m8, [t3+wq*2+400*0+ 8]
    paddw        m3, m0, [t4+wq*1+400*0+ 0]
    paddd        m4, m1, [t3+wq*2+400*0+ 0]
    paddd        m5, m2, [t3+wq*2+400*0+16]
    paddw        m3, m7
    paddd        m4, m8
    movu         m7, [t3+wq*2+400*0+24]
    paddw        m0, m3
    paddd        m1, m4
    psllw        m3, 2
    pslld        m4, 2
    paddd        m5, m7
    paddd        m2, m5
    pslld        m5, 2
    paddw        m0, m3 ; a5 565
    paddd        m1, m4 ; b5 565
    paddd        m2, m5
    mova [t4+wq*1+400* 6+ 0], m0
    mova [t3+wq*2+400*12+ 0], m1
    mova [t3+wq*2+400*12+16], m2
    movu         m0, [t4+wq*1+400*2+ 4]
    movu         m1, [t3+wq*2+400*4+ 8]
    movu         m2, [t3+wq*2+400*4+24]
    movu         m3, [t4+wq*1+400*2+ 2]
    movu         m4, [t3+wq*2+400*4+ 4]
    movu         m5, [t3+wq*2+400*4+20]
    paddw        m0, [t4+wq*1+400*2+ 0]
    paddd        m1, [t3+wq*2+400*4+ 0]
    paddd        m2, [t3+wq*2+400*4+16]
    paddw        m3, m0
    paddd        m4, m1
    paddd        m5, m2
    psllw        m3, 2 ; a3[-1] 444
    pslld        m4, 2 ; b3[-1] 444
    pslld        m5, 2
    psubw        m3, m0 ; a3[-1] 343
    psubd        m4, m1 ; b3[-1] 343
    psubd        m5, m2
    mova [t4+wq*1+400* 8+ 0], m3
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m5
    movu         m0, [t4+wq*1+400*4+ 4]
    movu         m1, [t3+wq*2+400*8+ 8]
    movu         m2, [t3+wq*2+400*8+24]
    movu         m3, [t4+wq*1+400*4+ 2]
    movu         m4, [t3+wq*2+400*8+ 4]
    movu         m5, [t3+wq*2+400*8+20]
    paddw        m0, [t4+wq*1+400*4+ 0]
    paddd        m1, [t3+wq*2+400*8+ 0]
    paddd        m2, [t3+wq*2+400*8+16]
    paddw        m3, m0
    paddd        m4, m1
    paddd        m5, m2
    psllw        m3, 2 ; a3[ 0] 444
    pslld        m4, 2 ; b3[ 0] 444
    pslld        m5, 2
    mova [t4+wq*1+400*10+ 0], m3
    mova [t3+wq*2+400*20+ 0], m4
    mova [t3+wq*2+400*20+16], m5
    psubw        m3, m0 ; a3[ 0] 343
    psubd        m4, m1 ; b3[ 0] 343
    psubd        m5, m2
    mova [t4+wq*1+400*12+ 0], m3
    mova [t3+wq*2+400*24+ 0], m4
    mova [t3+wq*2+400*24+16], m5
    add          wq, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64      wq, r4
    movif32      wd, w1m
.n0_loop:
    movu         m0, [t4+wq*1+ 4]
    movu         m2, [t4+wq*1+ 2]
    paddw        m0, [t4+wq*1+ 0]
    paddw        m0, m2
    paddw        m2, m0
    psllw        m0, 2
    paddw        m0, m2 ; a5
    movu         m4, [t3+wq*2+ 8]
    movu         m5, [t3+wq*2+24]
    movu         m1, [t3+wq*2+ 4]
    movu         m3, [t3+wq*2+20]
    paddd        m4, [t3+wq*2+ 0]
    paddd        m5, [t3+wq*2+16]
    paddd        m4, m1
    paddd        m5, m3
    paddd        m1, m4
    paddd        m3, m5
    pslld        m4, 2
    pslld        m5, 2
    paddd        m4, m1 ; b5
    paddd        m5, m3
    movu         m2, [t4+wq*1+400* 6]
    paddw        m2, m0
    mova [t4+wq*1+400* 6], m0
    paddd        m0, m4, [t3+wq*2+400*12+ 0]
    paddd        m1, m5, [t3+wq*2+400*12+16]
    mova [t3+wq*2+400*12+ 0], m4
    mova [t3+wq*2+400*12+16], m5
    mova [rsp+16+ARCH_X86_32*4], m1
    movu         m3, [t4+wq*1+400*2+4]
    movu         m5, [t4+wq*1+400*2+2]
    paddw        m3, [t4+wq*1+400*2+0]
    paddw        m5, m3
    psllw        m5, 2 ; a3[ 1] 444
    psubw        m4, m5, m3 ; a3[ 1] 343
    movu         m3, [t4+wq*1+400* 8]
    paddw        m3, [t4+wq*1+400*10]
    paddw        m3, m4
    mova [t4+wq*1+400* 8], m4
    mova [t4+wq*1+400*10], m5
    movu         m1, [t3+wq*2+400*4+ 8]
    movu         m5, [t3+wq*2+400*4+ 4]
    movu         m7, [t3+wq*2+400*4+24]
    movu         m8, [t3+wq*2+400*4+20]
    paddd        m1, [t3+wq*2+400*4+ 0]
    paddd        m7, [t3+wq*2+400*4+16]
    paddd        m5, m1
    paddd        m8, m7
    pslld        m5, 2 ; b3[ 1] 444
    pslld        m8, 2
    psubd        m4, m5, m1 ; b3[ 1] 343
%if ARCH_X86_32
    mova [esp+52], m8
    psubd        m8, m7
%else
    psubd        m6, m8, m7
    SWAP         m8, m6
%endif
    paddd        m1, m4, [t3+wq*2+400*16+ 0]
    paddd        m7, m8, [t3+wq*2+400*16+16]
    paddd        m1, [t3+wq*2+400*20+ 0]
    paddd        m7, [t3+wq*2+400*20+16]
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m8
    mova [t3+wq*2+400*20+ 0], m5
%if ARCH_X86_32
    mova         m8, [esp+52]
%else
    SWAP         m8, m6
    pxor         m6, m6
%endif
    mova [t3+wq*2+400*20+16], m8
    mova [rsp+32+ARCH_X86_32*4], m7
    movu         m5, [dstq+wq]
    punpcklwd    m4, m5, m6
    punpcklwd    m7, m2, m6
    pmaddwd      m7, m4 ; a5 * src
    punpcklwd    m8, m3, m6
    pmaddwd      m8, m4 ; a3 * src
    punpckhwd    m5, m6
    punpckhwd    m2, m6
    pmaddwd      m2, m5
    punpckhwd    m3, m6
    pmaddwd      m3, m5
    pslld        m4, 13
    pslld        m5, 13
    psubd        m0, m7 ; b5 - a5 * src + (1 << 8)
    psubd        m1, m8 ; b3 - a3 * src + (1 << 8)
    mova         m7, [base+pd_0xffff]
    psrld        m0, 9
    pslld        m1, 7
    pand         m0, m7
    pandn        m8, m7, m1
    por          m0, m8
    mova         m1, [rsp+16+ARCH_X86_32*4]
    mova         m8, [rsp+32+ARCH_X86_32*4]
    psubd        m1, m2
    psubd        m8, m3
    mova         m2, [base+pd_4096]
    psrld        m1, 9
    pslld        m8, 7
    pand         m1, m7
    pandn        m7, m8
    por          m1, m7
    pmaddwd      m0, m15
    pmaddwd      m1, m15
%if ARCH_X86_32
    pxor         m7, m7
%else
    SWAP         m7, m6
%endif
    paddd        m4, m2
    paddd        m5, m2
    paddd        m0, m4
    paddd        m1, m5
    psrad        m0, 8
    psrad        m1, 8
    packssdw     m0, m1 ; clip
    pmaxsw       m0, m7
    psrlw        m0, 5
    mova [dstq+wq], m0
    add          wq, 16
    jl .n0_loop
    add          dstq, stridemp
    ret
%if ARCH_X86_64
    SWAP         m6, m7
%endif
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64      wq, r4
    movif32      wd, w1m
.n1_loop:
    movu         m3, [t4+wq*1+400*4+4]
    movu         m5, [t4+wq*1+400*4+2]
    paddw        m3, [t4+wq*1+400*4+0]
    paddw        m5, m3
    psllw        m5, 2 ; a3[ 1] 444
    psubw        m4, m5, m3 ; a3[ 1] 343
    paddw        m3, m4, [t4+wq*1+400*12]
    paddw        m3, [t4+wq*1+400*10]
    mova [t4+wq*1+400*10], m5
    mova [t4+wq*1+400*12], m4
    movu         m1, [t3+wq*2+400*8+ 8]
    movu         m5, [t3+wq*2+400*8+ 4]
    movu         m7, [t3+wq*2+400*8+24]
    movu         m8, [t3+wq*2+400*8+20]
    paddd        m1, [t3+wq*2+400*8+ 0]
    paddd        m7, [t3+wq*2+400*8+16]
    paddd        m5, m1
    paddd        m8, m7
    pslld        m5, 2 ; b3[ 1] 444
    pslld        m8, 2
    psubd        m4, m5, m1 ; b3[ 1] 343
    psubd        m0, m8, m7
    paddd        m1, m4, [t3+wq*2+400*24+ 0]
    paddd        m7, m0, [t3+wq*2+400*24+16]
    paddd        m1, [t3+wq*2+400*20+ 0]
    paddd        m7, [t3+wq*2+400*20+16]
    mova [t3+wq*2+400*20+ 0], m5
    mova [t3+wq*2+400*20+16], m8
    mova [t3+wq*2+400*24+ 0], m4
    mova [t3+wq*2+400*24+16], m0
    mova         m5, [dstq+wq]
    mova         m2, [t4+wq*1+400* 6]
    punpcklwd    m4, m5, m6
    punpcklwd    m8, m2, m6
    pmaddwd      m8, m4 ; a5 * src
    punpcklwd    m0, m3, m6
    pmaddwd      m0, m4 ; a3 * src
    punpckhwd    m5, m6
    punpckhwd    m2, m6
    pmaddwd      m2, m5
    punpckhwd    m3, m6
    pmaddwd      m3, m5
    psubd        m1, m0 ; b3 - a3 * src + (1 << 8)
    pslld        m4, 13
    pslld        m5, 13
    mova         m0, [t3+wq*2+400*12+ 0]
    psubd        m0, m8 ; b5 - a5 * src + (1 << 8)
    mova         m8, [t3+wq*2+400*12+16]
    psubd        m8, m2
    psubd        m7, m3
    mova         m2, [base+pd_0xffff]
    pslld        m1, 7
    psrld        m0, 8
    psrld        m8, 8
    pslld        m7, 7
    pand         m0, m2
    pandn        m3, m2, m1
    por          m0, m3
    pand         m8, m2
    pandn        m2, m7
    por          m2, m8
    mova         m1, [base+pd_4096]
    pmaddwd      m0, m15
    pmaddwd      m2, m15
%if ARCH_X86_64
    SWAP         m7, m6
%endif
    pxor         m7, m7
    paddd        m4, m1
    paddd        m5, m1
    paddd        m0, m4
    paddd        m2, m5
    psrad        m0, 8
    psrad        m2, 8
    packssdw     m0, m2 ; clip
    pmaxsw       m0, m7
    psrlw        m0, 5
    mova [dstq+wq], m0
    add          wq, 16
    jl .n1_loop
    add          dstq, stridemp
    movif32      dstm, dstq
    ret