From 43a97878ce14b72f0981164f87f2e35e14151312 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 11:22:09 +0200
Subject: Adding upstream version 110.0.1.

Signed-off-by: Daniel Baumann
---
 .../dav1d/src/x86/looprestoration16_sse.asm | 3723 ++++++++++++++++++++
 1 file changed, 3723 insertions(+)
 create mode 100644 third_party/dav1d/src/x86/looprestoration16_sse.asm

diff --git a/third_party/dav1d/src/x86/looprestoration16_sse.asm b/third_party/dav1d/src/x86/looprestoration16_sse.asm
new file mode 100644
index 0000000000..872e502982
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_sse.asm
@@ -0,0 +1,3723 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
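+;
+; Overview of the routines added below (summary of this file's contents):
+; - wiener_filter7_16bpc / wiener_filter5_16bpc: 7-tap and 5-tap Wiener
+;   filters for 16bpc pixels. Each keeps one ring-buffer pointer per
+;   intermediate row (t0-t6 for the 7-tap variant, t0-t4 for the 5-tap
+;   one) and rotates the pointers after every output row.
+; - sgr_filter_5x5_16bpc / sgr_filter_3x3_16bpc / sgr_filter_mix_16bpc:
+;   self-guided restoration, accumulating box sums and sums of squares
+;   in the stack buffers addressed through t1-t4.
+; - GATHERDD, MAXSD and MULLD emulate dword gather, signed 32-bit max
+;   and 32-bit multiply, which are unavailable in SSSE3.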
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 +sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +pb_m14_m13: times 8 db -14,-13 +pb_m10_m9: times 8 db -10, -9 +pb_m6_m5: times 8 db -6, -5 +pb_m2_m1: times 8 db -2, -1 +pb_2_3: times 8 db 2, 3 +pb_6_7: times 8 db 6, 7 +pw_256: times 8 dw 256 +pw_1023: times 8 dw 1023 +pd_8: times 4 dd 8 +pd_4096: times 4 dd 4096 +pd_34816: times 4 dd 34816 +pd_m262128: times 4 dd -262128 +pd_0xffff: times 4 dd 0xffff +pd_0xf00800a4: times 4 dd 0xf00800a4 +pd_0xf00801c7: times 4 dd 0xf00801c7 +pd_0xfffffff0: times 4 dd 0xfffffff0 + +wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 +wiener_round: dd 1049600, 1048832 + +cextern sgr_x_by_x + +SECTION .text + +%macro movif64 2 ; dst, src + %if ARCH_X86_64 + mov %1, %2 + %endif +%endmacro + +%macro movif32 2 ; dst, src + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_32 +DECLARE_REG_TMP 5, 6 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 13*16 + %else + %assign extra_stack 12*16 + %endif +cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \ + dst, stride, left, lpf, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+16*12+ 0] + %define wm dword [esp+calloff+16*12+ 4] + %define hd dword [esp+calloff+16*12+ 8] + %define edgeb byte [esp+calloff+16*12+12] + %define edged dword [esp+calloff+16*12+12] + %else + %define hd dword r5m + %define edgeb byte r7m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t5m dword [esp+calloff+4*6] + %define t6m dword [esp+calloff+4*7] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define t5 t5m + %define t6 t6m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define r10 r4 + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+20] + mov wm, wd + mov r5, [rstk+stack_offset+24] + mov hd, r5 + mov r5, [rstk+stack_offset+32] + mov edged, r5 ; edge + %endif +%else +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers +cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + movq m13, [fltq] + movq m15, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+28] + mov t1, [rstk+stack_offset+36] ; pixel_max + 
movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + %else + mov fltq, r6m + movq m1, [fltq] + movq m3, [fltq+16] + LEA t0, wiener_shifts + mov t1, r8m ; pixel_max + %endif + mov PICmem, t0 +%endif + mova m6, [base+wiener_shufA] + mova m7, [base+wiener_shufB] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + pshufd m12, m13, q0000 ; x0 x1 + pshufd m13, m13, q1111 ; x2 x3 + pshufd m14, m15, q0000 ; y0 y1 + pshufd m15, m15, q1111 ; y2 y3 + mova m8, [wiener_shufC] + mova m9, [wiener_shufD] + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + shr t3d, 11 + %define base t4-wiener_shifts + movd m10, [base+wiener_round+t3*4] + movq m11, [base+wiener_shifts+t3*8] + pshufd m10, m10, q0000 + pshufd m0, m11, q0000 + pshufd m11, m11, q1111 + pmullw m12, m0 ; upshift filter coefs to make the + pmullw m13, m0 ; horizontal downshift constant + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] + %define base + %define wiener_lshuf7_mem [wiener_lshuf7] + %define pd_m262128_mem [pd_m262128] +%else + add wd, wd + mova m4, [base+wiener_shufC] + mova m5, [base+wiener_shufD] + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pshufd m2, m3, q0000 + pshufd m3, m3, q1111 + mova m8, m4 + mova m9, m5 + mova m14, m2 + mova m15, m3 + shr t1, 11 + add lpfq, wq + mova m3, [base+pd_m262128] + movd m4, [base+wiener_round+t1*4] + movq m5, [base+wiener_shifts+t1*8] + lea t1, [esp+extra_stack+wq+16] + add dstq, wq + neg wq + pshufd m4, m4, q0000 + pshufd m2, m5, q0000 + pshufd m5, m5, q1111 + mov wm, wq + pmullw m0, m2 + pmullw m1, m2 + mova m2, [base+wiener_lshuf7] + %define pd_m262128_mem [esp+calloff+16*10] + mova pd_m262128_mem, m3 + mova m10, m4 + mova m11, m5 + mova m12, m0 + mova m13, m1 + %define wiener_lshuf7_mem [esp+calloff+16*11] + mova wiener_lshuf7_mem, m2 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov lpfm, r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, lpfm + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v + movif32 wq, wm +.v2: + call .v + movif32 wq, wm + jmp .v1 +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movif32 t0, PICmem + pxor m0, m0 + movd m1, wd + mova m2, [base+pb_0to15] + pshufb m1, m0 + mova m0, [base+pb_6_7] + psubb m0, m1 + pminub m0, m2 + pshufb m3, m0 + mova m0, [base+pb_m2_m1] + psubb m0, m1 + pminub m0, m2 + pshufb m4, m0 + mova m0, [base+pb_m10_m9] + psubb m0, m1 + pminub m0, m2 + pshufb m5, m0 + movif32 t0, t0m + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: + movif64 wq, r4 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz 
.h_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, wiener_lshuf7_mem ; before the start of the buffer + jmp .h_main +.h_top: + movif64 wq, r4 + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-8] +.h_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop + movif32 wq, wm + ret +ALIGN function_align +.hv: + add lpfq, strideq + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, wiener_lshuf7_mem + jmp .hv_main +.hv_bottom: + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-8] +.hv_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -20 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t1, t4m + movif32 t0, t2m + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, pd_m262128_mem + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 +%if ARCH_X86_64 + mova m2, [t4+wq] + paddw m2, [t2+wq] + mova m5, [t3+wq] +%else + mova m2, [t1+wq] + paddw m2, [t0+wq] + mov t1, t3m + mov t0, t5m + mova m5, [t1+wq] + mov t1, t1m +%endif + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 +%if ARCH_X86_64 + mova m4, [t5+wq] + paddw m4, [t1+wq] + psraw m0, 1 + paddw m3, m0, [t6+wq] +%else + mova m4, [t0+wq] + paddw m4, [t1+wq] + mov t0, t0m + mov t1, t6m + psraw m0, 1 + paddw m3, m0, [t1+wq] +%endif + mova [t0+wq], m0 + punpcklwd m0, m2, m5 + pmaddwd m0, m15 + punpckhwd m2, m5 + pmaddwd m2, m15 + punpcklwd m1, m3, m4 + pmaddwd m1, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m2, m10 + paddd m0, m1 + paddd m2, m3 + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 +%else + mov r4, t5m + mov t1, t4m + mov t6m, r4 + mov t5m, t1 + mov r4, t3m + mov t1, t2m + mov t4m, r4 + mov t3m, t1 + mov r4, t1m + mov t1, t0 + mov t2m, r4 + mov t0, t6m + mov wq, wm +%endif + add dstq, strideq + ret +.v: + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 +.v_loop: +%if ARCH_X86_64 + mova m1, [t4+wq] + paddw m1, [t2+wq] + mova m2, [t3+wq] + mova m4, [t1+wq] + paddw m3, m4, [t6+wq] + paddw m4, [t5+wq] +%else + mov t0, t4m + mov t1, t2m + mova m1, [t0+wq] + paddw m1, [t1+wq] + mov t0, t3m + mov t1, t1m + 
mova m2, [t0+wq] + mova m4, [t1+wq] + mov t0, t6m + mov t1, t5m + paddw m3, m4, [t0+wq] + paddw m4, [t1+wq] +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m15 + punpckhwd m1, m2 + pmaddwd m1, m15 + punpcklwd m2, m3, m4 + pmaddwd m2, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m1, m10 + paddd m0, m2 + paddd m1, m3 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .v_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t5m + mov t1, t4m + mov r4, t3m + mov t6m, t0 + mov t5m, t1 + mov t4m, r4 + mov r4, t2m + mov t1, t1m + mov t0, t0m + mov t3m, r4 + mov t2m, t1 +%endif + add dstq, strideq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign stack_size 12*16+384*8 + %else + %assign stack_size 11*16+384*8 + %endif +cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \ + lpf, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+4*6] + %define wm dword [esp+calloff+4*7] + %define hd dword [esp+calloff+16*10+0] + %define edgeb byte [esp+calloff+16*10+4] + %define edged dword [esp+calloff+16*10+4] + %else + %define hd dword r5m + %define edgeb byte r7m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+20] + mov wm, wd + mov r5, [rstk+stack_offset+24] + mov hd, r5 + mov r5, [rstk+stack_offset+32] + mov edged, r5 ; edge + %endif +%else +cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + movq m12, [fltq] + movq m14, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+28] + mov t1, [rstk+stack_offset+36] ; pixel_max + movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + %else + mov fltq, r6m + movq m1, [fltq] + movq m3, [fltq+16] + LEA t0, wiener_shifts + mov t1, r8m ; pixel_max + %endif + mov PICmem, t0 +%endif + mova m5, [base+wiener_shufE] + mova m6, [base+wiener_shufB] + mova m7, [base+wiener_shufD] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + punpcklwd m11, m12, m12 + pshufd m11, m11, q1111 ; x1 + pshufd m12, m12, q1111 ; x2 x3 + punpcklwd m13, m14, m14 + pshufd m13, m13, q1111 ; y1 + pshufd m14, m14, q1111 ; y2 y3 + shr t3d, 11 + mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + %define base t4-wiener_shifts + movd m9, [base+wiener_round+t3*4] + movq m10, [base+wiener_shifts+t3*8] + pshufd m9, m9, q0000 + pshufd m0, m10, q0000 + pshufd m10, m10, q1111 + mova m15, [wiener_lshuf5] + pmullw m11, m0 + pmullw m12, m0 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] + %define base +%else + add wd, wd + punpcklwd 
m0, m1, m1 + pshufd m0, m0, q1111 ; x1 + pshufd m1, m1, q1111 ; x2 x3 + punpcklwd m2, m3, m3 + pshufd m2, m2, q1111 ; y1 + pshufd m3, m3, q1111 ; y2 y3 + mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) + mova m13, m2 + mova m14, m3 + mova m8, m4 + shr t1, 11 + add lpfq, wq + movd m2, [base+wiener_round+t1*4] + movq m3, [base+wiener_shifts+t1*8] + %if STACK_ALIGNMENT < 16 + lea t1, [esp+16*11+wq+16] + %else + lea t1, [esp+16*10+wq+16] + %endif + add dstq, wq + neg wq + pshufd m2, m2, q0000 + pshufd m4, m3, q0000 + pshufd m3, m3, q1111 + mov wm, wq + pmullw m0, m4 + pmullw m1, m4 + mova m4, [base+wiener_lshuf5] + mova m9, m2 + mova m10, m3 + mova m11, m0 + mova m12, m1 + mova m15, m4 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + add r10, strideq + mov lpfm, r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, lpfm + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.end: + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t3m + mov r4, t2m + mov t1, t1m + mov t4m, t0 + mov t3m, r4 + mov t2m, t1 + mov wq, wm +%endif + add dstq, strideq +.v1: + call .v + jmp .end +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movif32 t0, PICmem + pxor m1, m1 + movd m2, wd + mova m0, [base+pb_2_3] + pshufb m2, m1 + mova m1, [base+pb_m6_m5] + psubb m0, m2 + psubb m1, m2 + mova m2, [base+pb_0to15] + pminub m0, m2 + pminub m1, m2 + pshufb m3, m0 + pshufb m4, m1 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: + movif64 wq, r4 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, m15 ; before the start of the buffer + jmp .h_main +.h_top: + movif64 wq, r4 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-4] +.h_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop + movif32 wq, wm + ret +ALIGN function_align +.hv: + add lpfq, strideq + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, m15 + jmp .hv_main +.hv_bottom: + movif64 wq, r4 + movif32 t0m, t0 + 
movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-4] +.hv_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t1, t1m + movif32 t0, t3m + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 +%if ARCH_X86_64 + mova m2, [t3+wq] + paddw m2, [t1+wq] + paddd m1, m3 + mova m4, [t2+wq] +%else + mova m2, [t0+wq] + mov t0, t2m + paddw m2, [t1+wq] + mov t1, t4m + paddd m1, m3 + mova m4, [t0+wq] + mov t0, t0m +%endif + punpckhwd m3, m2, m4 + pmaddwd m3, m14 + punpcklwd m2, m4 +%if ARCH_X86_64 + mova m4, [t4+wq] +%else + mova m4, [t1+wq] +%endif + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + pmaddwd m2, m14 + psraw m0, 1 + mova [t0+wq], m0 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 +%else + mov r4, t3m + mov t1, t2m + mov t4m, r4 + mov t3m, t1 + mov r4, t1m + mov t1, t0 + mov t2m, r4 + mov t0, t4m + mov wq, wm +%endif + add dstq, strideq + ret +.v: + movif64 wq, r4 + movif32 t1m, t1 +.v_loop: +%if ARCH_X86_64 + mova m0, [t1+wq] + paddw m2, m0, [t3+wq] + mova m1, [t2+wq] + mova m4, [t4+wq] +%else + mov t0, t3m + mova m0, [t1+wq] + mov t1, t2m + paddw m2, m0, [t0+wq] + mov t0, t4m + mova m1, [t1+wq] + mova m4, [t0+wq] +%endif + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 +%if ARCH_X86_64 + jl .v_loop +%else + jge .v_end + mov t1, t1m + jmp .v_loop +.v_end: +%endif + ret + +%macro GATHERDD 3 ; dst, src, tmp + movd %3d, %2 + %if ARCH_X86_64 + movd %1, [r13+%3] + pextrw %3d, %2, 2 + pinsrw %1, [r13+%3+2], 3 + pextrw %3d, %2, 4 + pinsrw %1, [r13+%3+2], 5 + pextrw %3d, %2, 6 + pinsrw %1, [r13+%3+2], 7 + %else + movd %1, [base+sgr_x_by_x-0xf03+%3] + pextrw %3, %2, 2 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 + pextrw %3, %2, 4 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 + pextrw %3, %2, 6 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 + %endif +%endmacro + +%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore + %if ARCH_X86_64 + %define tmp r14 + %else + %define tmp %4 + %endif + GATHERDD %1, %2, tmp + GATHERDD %2, %3, tmp + movif32 %4, %5 + psrld %1, 24 + psrld %2, 24 + packssdw %1, %2 +%endmacro + +%macro MAXSD 3-4 0 ; dst, src, restore_tmp + pcmpgtd %3, %1, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 + %if %4 == 1 + pxor %3, %3 + %endif +%endmacro + +%macro MULLD 3 ; dst, src, tmp + pmulhuw %3, %1, %2 + pmullw %1, %2 + pslld %3, 16 + paddd %1, %3 +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 0, 1, 2, 3, 5 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 5*16 + %else + %assign extra_stack 3*16 + %endif +cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ + dst, stride, left, lpf, w + %if STACK_ALIGNMENT < 16 + %define 
dstm dword [esp+calloff+16*0+4*6] + %define stridemp dword [esp+calloff+16*0+4*7] + %define leftm dword [esp+calloff+16*3+4*0] + %define lpfm dword [esp+calloff+16*3+4*1] + %define w0m dword [esp+calloff+16*3+4*2] + %define hd dword [esp+calloff+16*3+4*3] + %define edgeb byte [esp+calloff+16*3+4*4] + %define edged dword [esp+calloff+16*3+4*4] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t0m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define m8 [base+pd_8] + %define m9 [base+pd_0xfffffff0] + %define m10 [esp+calloff+16*2] + %define m11 [base+pd_0xf00800a4] + %define m12 [base+sgr_lshuf5] + %define m13 [base+pd_34816] + %define m14 [base+pw_1023] + %define r10 r4 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov wd, [rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + movifnidn hd, hm + add wd, wd + mov edged, r7m + movu m10, [paramsq] + mova m12, [sgr_lshuf5] + add lpfq, wq + mova m8, [pd_8] + lea t1, [rsp+wq+20] + mova m9, [pd_0xfffffff0] + add dstq, wq + lea t3, [rsp+wq*2+400*12+16] + mova m11, [pd_0xf00800a4] + lea t4, [rsp+wq+400*20+16] + pshufhw m7, m10, q0000 + pshufb m10, [pw_256] ; s0 + punpckhqdq m7, m7 ; w0 + neg wq + mova m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + mova m14, [pw_1023] + psllw m7, 4 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + add wd, wd + movu m1, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*20+16] + mov t3m, t3 + pshufhw m7, m1, q0000 + mov t4m, t4 + pshufb m1, [base+pw_256] ; s0 + punpckhqdq m7, m7 ; w0 + psllw m7, 4 + neg wq + mova m10, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + movif32 t2m, t1 + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t0m, t2 + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, stridemp + movif32 t4, t4m + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp +%if ARCH_X86_64 + test hb, hb +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + call .h + add lpfq, stridemp + call .hv + movif32 dstq, dstm + call .n0 + call .n1 + sub hd, 2 + movif32 t0, t0m + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .h_top + add lpfq, stridemp + call .hv_bottom +.end: + movif32 dstq, dstm + call .n0 + call .n1 +.end2: + 
RET +.height1: + movif32 t4, t4m + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + movif32 dstq, dstm + call .n0 + call .n1 +.odd_height_end: + call .v + movif32 dstq, dstm + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h + lea t2, [t1+400*6] + movif32 t2m, t2 + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + movif32 t0m, t0 + jmp .main +.no_top_height1: + movif32 t3, t3m + movif32 t4, t4m + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + movd m0, wd + movd m1, [lpfq-2] + mova m2, [base+pw_256] + mova m3, [base+pb_m14_m13] + pshufb m0, m6 + pshufb m1, m2 + psubb m2, m0 + psubb m3, m0 + mova m0, [base+pb_0to15] + pcmpgtb m2, m0 + pcmpgtb m3, m0 + pand m4, m2 + pand m5, m3 + pandn m2, m1 + pandn m3, m1 + por m4, m2 + por m5, m3 + ret +%assign stack_offset stack_offset+4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r4-4] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+wq+400*0] + paddd m1, [t1+wq+400*2] + paddd m2, [t1+wq+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+wq+400*0], m0 + mova [t1+wq+400*2], m1 + mova [t1+wq+400*4], m2 + add wq, 16 + jl .h_loop + ret +.top_fixup: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .hv_main +.hv_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv_loop_start +%endif +.hv_loop: + movif32 lpfq, hvsrcm +.hv_loop_start: + movu m4, [lpfq+wq- 2] +.hv_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz 
.hv_have_right + cmp wd, -20 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t3, hd + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] +%if ARCH_X86_64 + test hd, hd +%else + test t3, t3 +%endif + jz .hv_last_row +.hv_main2: + paddw m1, [t2+wq+400*0] ; hv sum + paddd m4, [t2+wq+400*2] ; hv sumsq + paddd m5, [t2+wq+400*4] + mova [t0+wq+400*0], m0 + mova [t0+wq+400*2], m2 + mova [t0+wq+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + pand m4, m9 ; ((a + 8) >> 4) << 4 + pand m5, m9 + psrld m2, m4, 4 + psrld m0, m5, 4 + paddd m2, m4 + psrld m4, 1 + paddd m0, m5 + psrld m5, 1 + paddd m4, m2 ; a * 25 + paddd m5, m0 + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, t2, t2m + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m2 + MULLD m1, m5, m2 + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m3 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + movif32 t2m, t2 + movif32 t0m, t0 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+wq+400*0], m1 + paddw m1, m0 + mova [t1+wq+400*2], m4 + paddd m4, m2 + mova [t1+wq+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v_loop: + mova m0, [t1+wq+400*0] + mova m2, [t1+wq+400*2] + mova m3, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + pand m4, m9 ; ((a + 8) >> 4) << 4 + pand m5, m9 + psrld m2, m4, 4 + psrld m0, m5, 4 + paddd m2, m4 + psrld m4, 1 + paddd m0, m5 + psrld m5, 1 + paddd m4, m2 ; a * 25 + paddd m5, m0 + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, t2, t2m + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m2 + MULLD m1, m5, m2 + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m3 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + movif64 
wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+wq*1+400*2+ 0] + paddd m4, m1, [t3+wq*2+400*4+ 0] + paddd m5, m2, [t3+wq*2+400*4+16] + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + mova m0, [dstq+wq] + mova m3, [t4+wq*1+400*2+ 0] + mova m4, [t3+wq*2+400*4+ 0] + mova m5, [t3+wq*2+400*4+16] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 7) + psubd m5, m3 + psrad m4, 8 + psrad m5, 8 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 4*16 + %else + %assign extra_stack 2*16 + %endif +cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ + dst, stride, left, lpf, w + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*2+4*0] + %define stridemp dword [esp+calloff+16*2+4*1] + %define leftm dword [esp+calloff+16*2+4*2] + %define lpfm dword [esp+calloff+16*2+4*3] + %define w0m dword [esp+calloff+16*2+4*4] + %define hd dword [esp+calloff+16*2+4*5] + %define edgeb byte [esp+calloff+16*2+4*6] + %define edged dword [esp+calloff+16*2+4*6] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %define m8 [base+pd_8] + %define m9 [esp+calloff+16*1] + %define m10 [base+pd_0xf00801c7] + %define m11 [base+pd_34816] + %define m12 [base+sgr_lshuf3] + %define m13 [base+pw_1023] + %define m14 m6 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, 
[rstk+stack_offset+16] + mov wd, [rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + movifnidn hd, hm + add wd, wd + mov edged, r7m + movq m9, [paramsq+4] + add lpfq, wq + lea t1, [rsp+wq+12] + mova m8, [pd_8] + add dstq, wq + lea t3, [rsp+wq*2+400*12+8] + mova m10, [pd_0xf00801c7] + lea t4, [rsp+wq+400*32+8] + mova m11, [pd_34816] + pshuflw m7, m9, q3333 + pshufb m9, [pw_256] ; s1 + punpcklqdq m7, m7 ; w1 + neg wq + pxor m6, m6 + mova m13, [pw_1023] + psllw m7, 4 + mova m12, [sgr_lshuf3] + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + add wd, wd + movq m1, [r1+4] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*32+16] + mov t3m, t3 + pshuflw m7, m1, q3333 + mov t4m, t4 + pshufb m1, [base+pw_256] ; s1 + punpcklqdq m7, m7 ; w1 + psllw m7, 4 + neg wq + mova m9, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + mov t2, t1 + add t1, 400*6 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv0 +%if ARCH_X86_64 + test hb, hb +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.extend_right: + movd m1, wd + movd m5, [lpfq-2] + mova m2, [base+pw_256] + mova m3, [base+pb_0to15] + pshufb m1, m6 + pshufb m5, m2 + psubb m2, m1 + pcmpgtb m2, m3 + pand m4, m2 + pandn m2, m5 + por m4, m2 + ret +%assign stack_offset stack_offset+4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r4-4] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, 
[lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq+ 0] +.h_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq+ 0] +.hv0_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -18 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq+4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 
wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq+ 0] +.hv1_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -18 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m2 + mova [t2+wq+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2 +4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*0+ 4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m 
+%endif +.v1_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2+ 4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 4] + movu m1, [t3+wq*2+400*0+ 8] + movu m2, [t3+wq*2+400*0+24] + movu m3, [t4+wq*1+400*0+ 2] + movu m4, [t3+wq*2+400*0+ 4] + movu m5, [t3+wq*2+400*0+20] + paddw m0, [t4+wq*1+400*0+ 0] + paddd m1, [t3+wq*2+400*0+ 0] + paddd m2, [t3+wq*2+400*0+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[-1] 444 + pslld m4, 2 ; b[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a[-1] 343 + psubd m4, m1 ; b[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400*4], m3 + mova [t3+wq*2+400*8+ 0], m4 + mova [t3+wq*2+400*8+16], m5 + movu m0, [t4+wq*1+400*2+ 4] + movu m1, [t3+wq*2+400*4+ 8] + movu m2, [t3+wq*2+400*4+24] + movu m3, [t4+wq*1+400*2+ 2] + movu m4, [t3+wq*2+400*4+ 4] + movu m5, [t3+wq*2+400*4+20] + paddw m0, [t4+wq*1+400*2+ 0] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[ 0] 444 + pslld m4, 2 ; b[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400* 6], m3 + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + psubw m3, m0 ; a[ 0] 343 + psubd m4, m1 ; b[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m3, [t4+wq*1+400*0+4] + movu m1, [t4+wq*1+400*0+2] + paddw m3, [t4+wq*1+400*0+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*4] + paddw m3, [t4+wq*1+400*6] + mova [t4+wq*1+400*4], m2 + mova [t4+wq*1+400*6], m1 + movu m4, [t3+wq*2+400*0+8] + movu m1, [t3+wq*2+400*0+4] + paddd m4, [t3+wq*2+400*0+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400* 8+ 0] + paddd m4, [t3+wq*2+400*12+ 0] + mova [t3+wq*2+400* 8+ 0], m2 + mova [t3+wq*2+400*12+ 0], m1 + movu m5, [t3+wq*2+400*0+24] + movu m1, [t3+wq*2+400*0+20] + paddd m5, [t3+wq*2+400*0+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400* 8+16] + paddd m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400* 8+16], m2 + mova [t3+wq*2+400*12+16], m1 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd 
m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*2+4] + movu m1, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*6] + paddw m3, [t4+wq*1+400*8] + mova [t4+wq*1+400*6], m1 + mova [t4+wq*1+400*8], m2 + movu m4, [t3+wq*2+400*4+8] + movu m1, [t3+wq*2+400*4+4] + paddd m4, [t3+wq*2+400*4+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400*12+ 0] + paddd m4, [t3+wq*2+400*16+ 0] + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*16+ 0], m2 + movu m5, [t3+wq*2+400*4+24] + movu m1, [t3+wq*2+400*4+20] + paddd m5, [t3+wq*2+400*4+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400*12+16] + paddd m5, [t3+wq*2+400*16+16] + mova [t3+wq*2+400*12+16], m1 + mova [t3+wq*2+400*16+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 10*16 + %else + %assign extra_stack 8*16 + %endif +cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ + dst, stride, left, lpf, w + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*8+4*0] + %define stridemp dword [esp+calloff+16*8+4*1] + %define leftm dword [esp+calloff+16*8+4*2] + %define lpfm dword [esp+calloff+16*8+4*3] + %define w0m dword [esp+calloff+16*8+4*4] + %define hd dword [esp+calloff+16*8+4*5] + %define edgeb byte [esp+calloff+16*8+4*6] + %define edged dword [esp+calloff+16*8+4*6] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %xdefine m8 m6 + %define m9 [base+pd_8] + %define m10 [base+pd_34816] + %define m11 [base+pd_0xf00801c7] + %define m12 [base+pd_0xf00800a4] + %define m13 [esp+calloff+16*4] + %define m14 [esp+calloff+16*5] + %define m15 [esp+calloff+16*6] + %define m6 [esp+calloff+16*7] + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov wd, [rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + movifnidn hd, hm + add wd, wd + mov edged, r7m + mova m14, 
[paramsq] + add lpfq, wq + mova m9, [pd_8] + lea t1, [rsp+wq+44] + mova m10, [pd_34816] + add dstq, wq + mova m11, [pd_0xf00801c7] + lea t3, [rsp+wq*2+400*24+40] + mova m12, [pd_0xf00800a4] + lea t4, [rsp+wq+400*52+40] + neg wq + pshufd m15, m14, q2222 ; w0 w1 + punpcklwd m14, m14 + pshufd m13, m14, q0000 ; s0 + pshufd m14, m14, q2222 ; s1 + pxor m6, m6 + psllw m15, 2 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + add wd, wd + mova m2, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+52] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*24+48] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*52+48] + mov t3m, t3 + mov t4m, t4 + neg wq + pshuflw m0, m2, q0000 + pshuflw m1, m2, q2222 + pshufhw m2, m2, q1010 + punpcklqdq m0, m0 ; s0 + punpcklqdq m1, m1 ; s1 + punpckhqdq m2, m2 ; w0 w1 + mov w1m, wd + pxor m3, m3 + psllw m2, 2 + mova m13, m0 + mova m14, m1 + sub wd, 4 + mova m15, m2 + mova m6, m3 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + mov t2, t1 +%if ARCH_X86_64 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup +%else + mov wq, w0m + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop +%endif + add t1, 400*12 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv0 +%if ARCH_X86_64 + test hd, hd +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+wq+400* 0] + mova m1, [t1+wq+400* 2] + mova m2, [t1+wq+400* 4] + paddw m0, m0 + mova m3, [t1+wq+400* 6] + paddd m1, m1 + mova m4, [t1+wq+400* 8] + paddd m2, m2 + mova m5, [t1+wq+400*10] + mova [t2+wq+400* 0], m0 + mova [t2+wq+400* 2], m1 + mova [t2+wq+400* 4], m2 + mova [t2+wq+400* 6], m3 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.h: ; horizontal boxsum +%assign stack_offset stack_offset+4 +%assign calloff 4 +%if ARCH_X86_64 + lea wq, [r4-4] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-4] +%endif + test 
edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m8, m1 ; sum5 + paddd m7, m2 ; sumsq5 + paddd m5, m3 + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq- 2] +.hv0_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -20 + jl .hv0_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + movif32 t3, t3m + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; h sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m7, m2 ; h sumsq5 + paddd m5, m3 + mova [t3+wq*2+400*8+ 8], m8 + mova [t3+wq*2+400*0+ 8], m7 + mova [t3+wq*2+400*0+24], m5 + paddw m8, [t1+wq+400* 0] + paddd m7, [t1+wq+400* 2] + paddd m5, [t1+wq+400* 4] + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + paddw m0, m1, [t1+wq+400* 6] + paddd m4, m2, [t1+wq+400* 8] + paddd m5, m3, [t1+wq+400*10] + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD 
m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+ 4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq- 2] +.hv1_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -20 + jl .hv1_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv1_have_right: + palignr m7, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m7, m3 + punpcklwd m0, m7, m3 + pmaddwd m0, m0 + punpckhwd m7, m3 + pmaddwd m7, m7 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m6 + pmaddwd m1, m1 + punpckhwd m3, m6 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + palignr m5, m4, 8 + punpckhwd m1, m4, m5 + paddw m8, m4, m5 + pmaddwd m1, m1 + punpcklwd m4, m5 + pmaddwd m4, m4 + paddd m7, m3 + paddw m5, m2, [t2+wq+400* 6] + mova [t2+wq+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 8], m0 + mova [t2+wq+400*10], m7 + paddd m4, m0 ; h sumsq5 + paddd m1, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m7, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m7 + psrlw m7, m5, 1 + pavgw m7, m6 ; (b3 + 2) >> 2 + punpcklwd m0, m7, m6 + pmaddwd m0, m0 + punpckhwd m7, m6 + pmaddwd m7, m7 +%if ARCH_X86_32 + mova [esp+20], m8 +%else + SWAP m8, m6 +%endif + MAXSD m2, m0, m8 + MAXSD m3, m7, m8 + pxor m8, m8 + psubd m2, m0 ; p3 + psubd m3, m7 + punpcklwd m0, m5, m8 ; b3 + punpckhwd m5, m8 + MULLD m2, m14, m8 ; p3 * s1 + MULLD m3, m14, m8 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m5, m11 + paddusw m2, m11 + paddusw m3, m11 + psrld m2, 20 ; min(z3, 255) + movif32 t3, t3m + psrld m3, 20 + GATHER_X_BY_X m8, m2, m3, r0, dstm + punpcklwd m2, m8, m8 + punpckhwd m3, m8, m8 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m5, m10 + psrld m0, 12 + psrld m5, 12 + mova [t4+wq*1+400*4+4], m8 + mova [t3+wq*2+400*8+ 8], m0 + mova [t3+wq*2+400*8+24], m5 +%if ARCH_X86_32 + mova m8, [esp+20] +%else + SWAP m6, m8 + pxor m6, m6 +%endif + paddw m5, m8, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m1, [t2+wq+400*4] + paddw m5, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m8 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 
4 + mova [t2+wq+400*2], m4 + pslld m8, m2, 4 + mova [t2+wq+400*4], m1 + pslld m4, m3, 4 + paddd m8, m2 + pslld m2, 3 + paddd m4, m3 + pslld m3, 3 + paddd m2, m8 ; ((a5 + 8) >> 4) * 25 + paddd m3, m4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m1, m5, 1 + pavgw m1, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m1, m7 + pmaddwd m4, m4 + punpckhwd m1, m7 + pmaddwd m1, m1 + punpcklwd m0, m5, m7 ; b5 + punpckhwd m5, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + MAXSD m3, m1, m7 + psubd m3, m1 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m5, m12 + paddusw m2, m12 + paddusw m3, m12 + psrld m2, 20 ; min(z5, 255) + psrld m3, 20 + GATHER_X_BY_X m1, m2, m3, r0, dstm + punpcklwd m2, m1, m1 + punpckhwd m3, m1, m1 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m5, m10 + mova [t4+wq*1+400*0+ 4], m1 + psrld m0, 12 + psrld m5, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m5 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400* 6] + mova m4, [t1+wq+400* 8] + mova m5, [t1+wq+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+4], m3 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + mova [t3+wq*2+400*8+ 8], m3 + mova [t3+wq*2+400*0+ 8], m4 + mova [t3+wq*2+400*0+24], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+wq+400*0], m3 + mova [t1+wq+400*2], m4 + mova [t1+wq+400*4], m5 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m4, [t1+wq+400* 6] + mova m5, [t1+wq+400* 8] + mova m7, [t1+wq+400*10] + paddw m1, m4, [t2+wq+400* 6] + paddd m2, m5, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 6], m4 + mova [t2+wq+400* 8], m5 + mova [t2+wq+400*10], m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + 
punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*4+4], m3 + psrld m0, 12 + psrld m8, m1, 12 + mova m4, [t3+wq*2+400*8+ 8] + mova m5, [t3+wq*2+400*0+ 8] + mova m7, [t3+wq*2+400*0+24] + paddw m1, m4, [t2+wq+400*0] + paddd m2, m5, [t2+wq+400*2] + paddd m3, m7, [t2+wq+400*4] + paddw m1, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m4 + mova [t2+wq+400*2], m5 + mova [t2+wq+400*4], m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + mova [t3+wq*2+400*8+ 8], m0 + pslld m4, m2, 4 + mova [t3+wq*2+400*8+24], m8 + pslld m5, m3, 4 + paddd m4, m2 + pslld m2, 3 + paddd m5, m3 + pslld m3, 3 + paddd m2, m4 + paddd m3, m5 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + MAXSD m3, m5, m7 + psubd m3, m5 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m1, m12 + paddusw m2, m12 + paddusw m3, m12 + psrld m2, 20 ; min(z5, 255) + psrld m3, 20 + GATHER_X_BY_X m4, m2, m3, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + MULLD m0, m2, m7 + MULLD m1, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*0+ 4], m4 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 2] + movu m1, [t3+wq*2+400*0+ 4] + movu m2, [t3+wq*2+400*0+20] + movu m7, [t4+wq*1+400*0+ 4] + movu m8, [t3+wq*2+400*0+ 8] + paddw m3, m0, [t4+wq*1+400*0+ 0] + paddd m4, m1, [t3+wq*2+400*0+ 0] + paddd m5, m2, [t3+wq*2+400*0+16] + paddw m3, m7 + paddd m4, m8 + movu m7, [t3+wq*2+400*0+24] + paddw m0, m3 + paddd m1, m4 + psllw m3, 2 + pslld m4, 2 + paddd m5, m7 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a5 565 + paddd m1, m4 ; b5 565 + paddd m2, m5 + mova [t4+wq*1+400* 6+ 0], m0 + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*12+16], m2 + movu m0, [t4+wq*1+400*2+ 4] + movu m1, [t3+wq*2+400*4+ 8] + movu m2, [t3+wq*2+400*4+24] + movu m3, [t4+wq*1+400*2+ 2] + movu m4, [t3+wq*2+400*4+ 4] + movu m5, [t3+wq*2+400*4+20] + paddw m0, [t4+wq*1+400*2+ 0] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[-1] 444 + pslld m4, 2 ; b3[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a3[-1] 343 + psubd m4, m1 ; b3[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8+ 0], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + movu m0, [t4+wq*1+400*4+ 4] + movu m1, [t3+wq*2+400*8+ 8] + movu m2, [t3+wq*2+400*8+24] + movu m3, [t4+wq*1+400*4+ 2] + movu m4, [t3+wq*2+400*8+ 4] + movu m5, [t3+wq*2+400*8+20] + paddw m0, [t4+wq*1+400*4+ 0] + paddd m1, [t3+wq*2+400*8+ 0] + paddd m2, 
[t3+wq*2+400*8+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[ 0] 444 + pslld m4, 2 ; b3[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400*10+ 0], m3 + mova [t3+wq*2+400*20+ 0], m4 + mova [t3+wq*2+400*20+16], m5 + psubw m3, m0 ; a3[ 0] 343 + psubd m4, m1 ; b3[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400*12+ 0], m3 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 4] + movu m2, [t4+wq*1+ 2] + paddw m0, [t4+wq*1+ 0] + paddw m0, m2 + paddw m2, m0 + psllw m0, 2 + paddw m0, m2 ; a5 + movu m4, [t3+wq*2+ 8] + movu m5, [t3+wq*2+24] + movu m1, [t3+wq*2+ 4] + movu m3, [t3+wq*2+20] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddd m4, m1 + paddd m5, m3 + paddd m1, m4 + paddd m3, m5 + pslld m4, 2 + pslld m5, 2 + paddd m4, m1 ; b5 + paddd m5, m3 + movu m2, [t4+wq*1+400* 6] + paddw m2, m0 + mova [t4+wq*1+400* 6], m0 + paddd m0, m4, [t3+wq*2+400*12+ 0] + paddd m1, m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + mova [rsp+16+ARCH_X86_32*4], m1 + movu m3, [t4+wq*1+400*2+4] + movu m5, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + movu m3, [t4+wq*1+400* 8] + paddw m3, [t4+wq*1+400*10] + paddw m3, m4 + mova [t4+wq*1+400* 8], m4 + mova [t4+wq*1+400*10], m5 + movu m1, [t3+wq*2+400*4+ 8] + movu m5, [t3+wq*2+400*4+ 4] + movu m7, [t3+wq*2+400*4+24] + movu m8, [t3+wq*2+400*4+20] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m7, [t3+wq*2+400*4+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 +%if ARCH_X86_32 + mova [esp+52], m8 + psubd m8, m7 +%else + psubd m6, m8, m7 + SWAP m8, m6 +%endif + paddd m1, m4, [t3+wq*2+400*16+ 0] + paddd m7, m8, [t3+wq*2+400*16+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m8 + mova [t3+wq*2+400*20+ 0], m5 +%if ARCH_X86_32 + mova m8, [esp+52] +%else + SWAP m8, m6 + pxor m6, m6 +%endif + mova [t3+wq*2+400*20+16], m8 + mova [rsp+32+ARCH_X86_32*4], m7 + movu m5, [dstq+wq] + punpcklwd m4, m5, m6 + punpcklwd m7, m2, m6 + pmaddwd m7, m4 ; a5 * src + punpcklwd m8, m3, m6 + pmaddwd m8, m4 ; a3 * src + punpckhwd m5, m6 + punpckhwd m2, m6 + pmaddwd m2, m5 + punpckhwd m3, m6 + pmaddwd m3, m5 + pslld m4, 13 + pslld m5, 13 + psubd m0, m7 ; b5 - a5 * src + (1 << 8) + psubd m1, m8 ; b3 - a3 * src + (1 << 8) + mova m7, [base+pd_0xffff] + psrld m0, 9 + pslld m1, 7 + pand m0, m7 + pandn m8, m7, m1 + por m0, m8 + mova m1, [rsp+16+ARCH_X86_32*4] + mova m8, [rsp+32+ARCH_X86_32*4] + psubd m1, m2 + psubd m8, m3 + mova m2, [base+pd_4096] + psrld m1, 9 + pslld m8, 7 + pand m1, m7 + pandn m7, m8 + por m1, m7 + pmaddwd m0, m15 + pmaddwd m1, m15 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + paddd m4, m2 + paddd m5, m2 + paddd m0, m4 + paddd m1, m5 + psrad m0, 8 + psrad m1, 8 + packssdw m0, m1 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, stridemp + ret +%if ARCH_X86_64 + SWAP m6, m7 +%endif +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*4+4] + movu m5, [t4+wq*1+400*4+2] + paddw m3, [t4+wq*1+400*4+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + paddw m3, m4, [t4+wq*1+400*12] + paddw m3, [t4+wq*1+400*10] 
+ mova [t4+wq*1+400*10], m5 + mova [t4+wq*1+400*12], m4 + movu m1, [t3+wq*2+400*8+ 8] + movu m5, [t3+wq*2+400*8+ 4] + movu m7, [t3+wq*2+400*8+24] + movu m8, [t3+wq*2+400*8+20] + paddd m1, [t3+wq*2+400*8+ 0] + paddd m7, [t3+wq*2+400*8+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 + psubd m0, m8, m7 + paddd m1, m4, [t3+wq*2+400*24+ 0] + paddd m7, m0, [t3+wq*2+400*24+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*20+ 0], m5 + mova [t3+wq*2+400*20+16], m8 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m0 + mova m5, [dstq+wq] + mova m2, [t4+wq*1+400* 6] + punpcklwd m4, m5, m6 + punpcklwd m8, m2, m6 + pmaddwd m8, m4 ; a5 * src + punpcklwd m0, m3, m6 + pmaddwd m0, m4 ; a3 * src + punpckhwd m5, m6 + punpckhwd m2, m6 + pmaddwd m2, m5 + punpckhwd m3, m6 + pmaddwd m3, m5 + psubd m1, m0 ; b3 - a3 * src + (1 << 8) + pslld m4, 13 + pslld m5, 13 + mova m0, [t3+wq*2+400*12+ 0] + psubd m0, m8 ; b5 - a5 * src + (1 << 8) + mova m8, [t3+wq*2+400*12+16] + psubd m8, m2 + psubd m7, m3 + mova m2, [base+pd_0xffff] + pslld m1, 7 + psrld m0, 8 + psrld m8, 8 + pslld m7, 7 + pand m0, m2 + pandn m3, m2, m1 + por m0, m3 + pand m8, m2 + pandn m2, m7 + por m2, m8 + mova m1, [base+pd_4096] + pmaddwd m0, m15 + pmaddwd m2, m15 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + pxor m7, m7 + paddd m4, m1 + paddd m5, m1 + paddd m0, m4 + paddd m2, m5 + psrad m0, 8 + psrad m2, 8 + packssdw m0, m2 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret -- cgit v1.2.3