Diffstat (limited to 'third_party/dav1d/src/x86/looprestoration_avx512.asm')
-rw-r--r-- | third_party/dav1d/src/x86/looprestoration_avx512.asm | 2122
1 file changed, 2122 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/looprestoration_avx512.asm b/third_party/dav1d/src/x86/looprestoration_avx512.asm new file mode 100644 index 0000000000..1e571774ca --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration_avx512.asm @@ -0,0 +1,2122 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12 +wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13 +wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 +wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47 + db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63 +sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8 + db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128 +sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 +r_ext_mask: times 68 db -1 + times 4 db 0 +wiener_x_shuf: db 0, 2, -1, 0 +wiener_x_add: db 0, 1,127, 0 + +pw_61448: times 2 dw 61448 +pw_164_455: dw 164, 455 +pd_m16380: dd -16380 +pd_m4096: dd -4096 +pd_m25 dd -25 +pd_m9: dd -9 +pd_34816: dd 34816 +pd_8421376: dd 8421376 + +cextern sgr_x_by_x + +SECTION .text + +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers + +INIT_ZMM avx512icl +cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti32x4 m6, [wiener_shufA] + vbroadcasti32x4 m7, [wiener_shufB] + mov r10d, 0xfffe + vbroadcasti32x4 m8, [wiener_shufC] + vbroadcasti32x4 m9, [wiener_shufD] + kmovw k1, r10d + vpbroadcastd m0, [wiener_x_shuf] + vpbroadcastd m1, [wiener_x_add] + mov r10, 0xaaaaaaaaaaaaaaaa + vpbroadcastd m11, [fltq+ 0] + vpbroadcastd m12, [fltq+ 4] + kmovq k2, r10 + vpbroadcastd m10, [pd_m16380] + packsswb m11, m11 ; x0 x1 x0 x1 + vpbroadcastd m14, [fltq+16] + pshufb m12, m0 + vpbroadcastd m15, [fltq+20] + paddb m12, m1 ; x2 x3+1 x2 127 + vpbroadcastd m13, [pd_8421376] + psllw m14, 5 ; y0 
y1 + psllw m15, 5 ; y2 y3 + cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32 + jle .w32 ; pixels, so we need a special case for small widths + lea t1, [rsp+wq*2+16] + add lpfq, wq + add dstq, wq + neg wq + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm16, [leftq] + vmovdqu32 m16{k1}, [lpfq+r10-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception + vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10-4] +.h_main: + movu m17, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -66 + jl .h_have_right + push r0 + lea r0, [r_ext_mask+65] + vpbroadcastb m0, [lpfq-1] + vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b + vpternlogd m17, m0, [r0+r10+8], 0xe4 + pop r0 +.h_have_right: + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m2, m10 + vpdpbusd m2, m4, m11 + pshufb m4, m17, m6 + mova m1, m10 + vpdpbusd m1, m4, m11 + pshufb m4, m17, m7 + mova m3, m10 + vpdpbusd m3, m4, m11 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m2, m16, m12 + pshufb m4, m17, m8 + vpdpbusd m1, m4, m12 + pshufb m17, m9 + vpdpbusd m3, m17, m12 + packssdw m0, m2 + packssdw m1, m3 + psraw m0, 3 + psraw m1, 3 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+64], m1 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm16, [leftq] + vmovdqu32 m16{k1}, [lpfq+r10-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm16, [lpfq+r10] + vmovdqu32 m16{k1}, [lpfq+r10-4] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m16, [lpfq+r10-4] +.hv_main: + movu m17, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -66 + jl .hv_have_right + push r0 + lea r0, [r_ext_mask+65] + vpbroadcastb m0, [lpfq-1] + vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? 
a : b + vpternlogd m17, m0, [r0+r10+8], 0xe4 + pop r0 +.hv_have_right: + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m2, m10 + vpdpbusd m2, m4, m11 + pshufb m4, m17, m6 + mova m1, m10 + vpdpbusd m1, m4, m11 + pshufb m4, m17, m7 + mova m3, m10 + vpdpbusd m3, m4, m11 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m2, m16, m12 + pshufb m4, m17, m8 + vpdpbusd m1, m4, m12 + pshufb m17, m9 + vpdpbusd m3, m17, m12 + packssdw m0, m2 + packssdw m1, m3 + psraw m0, 3 + psraw m1, 3 + mova m16, [t4+r10*2] + paddw m16, [t2+r10*2] + mova m3, [t3+r10*2] + mova m17, [t4+r10*2+64] + paddw m17, [t2+r10*2+64] + mova m5, [t3+r10*2+64] + punpcklwd m4, m16, m3 + mova m2, m13 + vpdpwssd m2, m4, m15 + punpcklwd m18, m17, m5 + mova m4, m13 + vpdpwssd m4, m18, m15 + punpckhwd m16, m3 + mova m3, m13 + vpdpwssd m3, m16, m15 + punpckhwd m17, m5 + mova m5, m13 + vpdpwssd m5, m17, m15 + mova m17, [t5+r10*2] + paddw m17, [t1+r10*2] + paddw m16, m0, [t6+r10*2] + mova m19, [t5+r10*2+64] + paddw m19, [t1+r10*2+64] + paddw m18, m1, [t6+r10*2+64] + mova [t0+r10*2+ 0], m0 + mova [t0+r10*2+64], m1 + punpcklwd m0, m16, m17 + vpdpwssd m2, m0, m14 + punpcklwd m1, m18, m19 + vpdpwssd m4, m1, m14 + punpckhwd m16, m17 + vpdpwssd m3, m16, m14 + punpckhwd m18, m19 + vpdpwssd m5, m18, m14 + packuswb m2, m4 + psrlw m2, 8 + vpackuswb m2{k2}, m3, m5 + movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap + add r10, 64 ; function is used for chroma as well, and in some + jl .hv_loop ; esoteric edge cases chroma dst pointers may only + mov t6, t5 ; have a 32-byte alignment despite having a width + mov t5, t4 ; larger than 32, so use an unaligned store here. + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m4, [t4+r10*2+ 0] + paddw m4, [t2+r10*2+ 0] + mova m1, [t3+r10*2+ 0] + mova m5, [t4+r10*2+64] + paddw m5, [t2+r10*2+64] + mova m3, [t3+r10*2+64] + punpcklwd m6, m4, m1 + mova m0, m13 + vpdpwssd m0, m6, m15 + punpcklwd m6, m5, m3 + mova m2, m13 + vpdpwssd m2, m6, m15 + punpckhwd m4, m1 + mova m1, m13 + vpdpwssd m1, m4, m15 + punpckhwd m5, m3 + mova m3, m13 + vpdpwssd m3, m5, m15 + mova m5, [t1+r10*2+ 0] + paddw m4, m5, [t6+r10*2+ 0] + paddw m5, [t5+r10*2+ 0] + mova m7, [t1+r10*2+64] + paddw m6, m7, [t6+r10*2+64] + paddw m7, [t5+r10*2+64] + punpcklwd m8, m4, m5 + vpdpwssd m0, m8, m14 + punpcklwd m8, m6, m7 + vpdpwssd m2, m8, m14 + punpckhwd m4, m5 + vpdpwssd m1, m4, m14 + punpckhwd m6, m7 + vpdpwssd m3, m6, m14 + packuswb m0, m2 + psrlw m0, 8 + vpackuswb m0{k2}, m1, m3 + movu [dstq+r10], m0 + add r10, 64 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret +.w32: + lea r10, [r_ext_mask+73] + mova ym18, [wiener_perm32] + lea t1, [rsp+16] + sub r10, wq + test edgeb, 4 ; LR_HAVE_TOP + jz .w32_no_top + call .w32_h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 32*2 + call .w32_h_top + lea r9, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 32*2 + add r9, strideq + mov [rsp], r9 ; below + call .w32_h + mov t3, t1 + mov t2, t1 + dec hd + jz .w32_v1 + add lpfq, strideq + add t1, 32*2 + call .w32_h + mov t2, t1 + dec hd + jz .w32_v2 + add lpfq, strideq + add t1, 32*2 + call .w32_h + dec hd + jz .w32_v3 +.w32_main: + lea t0, [t1+32*2] +.w32_main_loop: + call .w32_hv + dec hd + jnz .w32_main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .w32_v3 + mov lpfq, [rsp] + call .w32_hv_bottom + add lpfq, strideq + call 
.w32_hv_bottom +.w32_v1: + call .w32_v + RET +.w32_no_top: + lea r9, [lpfq+strideq*4] + mov lpfq, dstq + lea r9, [r9+strideq*2] + mov [rsp], r9 + call .w32_h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .w32_v1 + add lpfq, strideq + add t1, 32*2 + call .w32_h + mov t2, t1 + dec hd + jz .w32_v2 + add lpfq, strideq + add t1, 32*2 + call .w32_h + dec hd + jz .w32_v3 + lea t0, [t1+32*2] + call .w32_hv + dec hd + jz .w32_v3 + add t0, 32*8 + call .w32_hv + dec hd + jnz .w32_main +.w32_v3: + call .w32_v +.w32_v2: + call .w32_v + jmp .w32_v1 +.w32_h: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_h_extend_left + movd xm16, [leftq] + vmovdqu32 ym16{k1}, [lpfq-4] + add leftq, 4 + jmp .w32_h_main +.w32_h_extend_left: + vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception + vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory + jmp .w32_h_main +.w32_h_top: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_h_extend_left + movu ym16, [lpfq-4] +.w32_h_main: + vinserti32x8 m16, [lpfq+4], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .w32_h_have_right + vpbroadcastb m0, [lpfq+wq-1] + movu ym17, [r10-8] + vinserti32x8 m17, [r10+0], 1 + vpternlogd m16, m0, m17, 0xe4 ; c ? a : b +.w32_h_have_right: + pshufb m2, m16, m6 + mova m0, m10 + vpdpbusd m0, m2, m11 + pshufb m2, m16, m7 + mova m1, m10 + vpdpbusd m1, m2, m11 + pshufb m2, m16, m8 + vpdpbusd m0, m2, m12 + pshufb m16, m9 + vpdpbusd m1, m16, m12 + packssdw m0, m1 + psraw m0, 3 + mova [t1], m0 + ret +.w32_hv: + add lpfq, strideq + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_hv_extend_left + movd xm16, [leftq] + vmovdqu32 ym16{k1}, [lpfq-4] + add leftq, 4 + jmp .w32_hv_main +.w32_hv_extend_left: + vpbroadcastb xm16, [lpfq] + vmovdqu32 ym16{k1}, [lpfq-4] + jmp .w32_hv_main +.w32_hv_bottom: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_hv_extend_left + movu ym16, [lpfq-4] +.w32_hv_main: + vinserti32x8 m16, [lpfq+4], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .w32_hv_have_right + vpbroadcastb m0, [lpfq+wq-1] + movu ym17, [r10-8] + vinserti32x8 m17, [r10+0], 1 + vpternlogd m16, m0, m17, 0xe4 +.w32_hv_have_right: + mova m3, [t4] + paddw m3, [t2] + mova m2, [t3] + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m5, m10 + vpdpbusd m5, m4, m11 + punpcklwd m4, m3, m2 + mova m1, m13 + vpdpwssd m1, m4, m15 + punpckhwd m3, m2 + mova m2, m13 + vpdpwssd m2, m3, m15 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m5, m16, m12 + packssdw m0, m5 + psraw m0, 3 + mova m4, [t5] + paddw m4, [t1] + paddw m3, m0, [t6] + mova [t0], m0 + punpcklwd m0, m3, m4 + vpdpwssd m1, m0, m14 + punpckhwd m3, m4 + vpdpwssd m2, m3, m14 + packuswb m1, m2 + vpermb m16, m18, m1 + mova [dstq], ym16 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.w32_v: + mova m2, [t4] + paddw m2, [t2] + mova m1, [t3] + mova m4, [t1] + paddw m3, m4, [t6] + paddw m4, [t5] + punpcklwd m5, m2, m1 + mova m0, m13 + vpdpwssd m0, m5, m15 + punpckhwd m2, m1 + mova m1, m13 + vpdpwssd m1, m2, m15 + punpcklwd m2, m3, m4 + vpdpwssd m0, m2, m14 + punpckhwd m3, m4 + vpdpwssd m1, m3, m14 + packuswb m0, m1 + vpermb m16, m18, m0 + mova [dstq], ym16 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret + +cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + mov hd, hm + mov edged, r7m + vbroadcasti32x4 m5, [sgr_shuf+1] + add lpfq, wq + vbroadcasti32x4 m6, 
[sgr_shuf+9] + add dstq, wq + vbroadcasti32x4 m7, [sgr_shuf+3] + lea t3, [rsp+wq*4+16+416*12] + vbroadcasti32x4 m8, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m9, [pd_m25] + vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 + vpbroadcastw m15, [paramsq+8] ; w0 + lea t1, [rsp+wq*2+20] + vpbroadcastd m10, [pw_164_455] + neg wq + vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) + mov r10d, 0xfe + vpbroadcastd m13, [pd_m4096] + kmovb k1, r10d + vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) + mov r10, 0x3333333333333333 + mova m18, [sgr_x_by_x+64*0] + kmovq k2, r10 + mova m19, [sgr_x_by_x+64*1] + lea r12, [r_ext_mask+75] + mova m20, [sgr_x_by_x+64*2] + psllw m15, 4 + mova m21, [sgr_x_by_x+64*3] + lea r10, [lpfq+strideq*4] + mova ym22, [sgr_shuf] + add r10, strideq + mov [rsp], r10 ; below + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call .top_fixup + add t1, 416*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + test hd, hd + jz .odd_height + call .h + add lpfq, strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .h_top + add lpfq, strideq + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+416*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m3, m17, m5 + pmullw m2, m3, m3 + pshufb m1, m17, m6 + paddw m0, m3, m1 + shufps m3, m1, q2121 + paddw m0, m3 + punpcklwd m16, m3, m1 + punpckhwd m3, m1 + punpcklwd m1, m2, m4 + vpdpwssd m1, m16, m16 + punpckhwd m2, m4 + vpdpwssd m2, m3, m3 + pshufb m16, m17, m7 + paddw m0, m16 + pshufb m17, m8 + paddw m0, m17 ; sum + punpcklwd m3, m16, m17 + vpdpwssd m1, m3, m3 ; sumsq + punpckhwd m16, m17 + vpdpwssd m2, m16, m16 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10*2+416*0] + paddd m1, [t1+r10*2+416*2] + paddd m2, [t1+r10*2+416*4] +.h_loop_end: + mova [t1+r10*2+416*0], m0 + mova [t1+r10*2+416*2], m1 + mova [t1+r10*2+416*4], m2 + add r10, 32 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-2] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10*2+416*0] + mova m1, [t1+r10*2+416*2] + mova m2, [t1+r10*2+416*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 
+ mova [t2+r10*2+416*0], m0 + mova [t2+r10*2+416*2], m1 + mova [t2+r10*2+416*4], m2 + add r10, 32 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu ym17, [lpfq+r10-2] +.hv_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv_have_right: + pshufb m1, m17, m5 + pmullw m3, m1, m1 + pshufb m2, m17, m6 + paddw m0, m1, m2 + shufps m1, m2, q2121 + paddw m0, m1 + punpcklwd m16, m1, m2 + punpckhwd m1, m2 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 + punpckhwd m3, m4 + vpdpwssd m3, m1, m1 + pshufb m16, m17, m7 + paddw m0, m16 + pshufb m17, m8 + paddw m0, m17 ; h sum + punpcklwd m1, m16, m17 + vpdpwssd m2, m1, m1 ; h sumsq + punpckhwd m16, m17 + vpdpwssd m3, m16, m16 + paddw m1, m0, [t1+r10*2+416*0] + paddd m16, m2, [t1+r10*2+416*2] + paddd m17, m3, [t1+r10*2+416*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddd m16, [t2+r10*2+416*2] ; hv sumsq + paddd m17, [t2+r10*2+416*4] + paddw m1, [t2+r10*2+416*0] ; hv sum + mova [t0+r10*2+416*2], m2 + mova [t0+r10*2+416*4], m3 + mova [t0+r10*2+416*0], m0 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + punpcklwd m0, m1, m4 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 164 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 ; The neighbor calculations requires + mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b. + vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but + mova [t3+r10*4+ 72], m17 ; that gets us most of the way. 
+ vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10*2+416*0], m1 + paddw m1, m0 + mova [t1+r10*2+416*2], m16 + paddd m16, m2 + mova [t1+r10*2+416*4], m17 + paddd m17, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m2, [t1+r10*2+416*2] + paddd m16, m2, [t2+r10*2+416*2] + mova m3, [t1+r10*2+416*4] + paddd m17, m3, [t2+r10*2+416*4] + paddd m2, m2 + paddd m3, m3 + paddd m16, m2 ; hv sumsq + paddd m17, m3 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + mova m0, [t1+r10*2+416*0] + paddw m1, m0, [t2+r10*2+416*0] + paddw m0, m0 + paddw m1, m0 ; hv sum + punpcklwd m0, m1, m4 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 164 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+68] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+64] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+72] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 ; ab 565 + paddd m3, m1 + pandn m0, m13, m2 ; a + psrld m2, 12 ; b + pandn m1, m13, m3 + psrld m3, 12 + mova [t3+r10*4+416*4+ 0], m0 + mova [t3+r10*4+416*8+ 0], m2 + mova [t3+r10*4+416*4+64], m1 + mova [t3+r10*4+416*8+64], m3 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m16, [t3+r10*4+ 4] + movu m17, [t3+r10*4+68] + paddd m0, m16, [t3+r10*4+ 0] + paddd m1, m17, [t3+r10*4+64] + paddd m0, [t3+r10*4+ 8] + paddd m1, [t3+r10*4+72] + paddd m16, m0 + pslld m0, 2 + paddd m17, m1 + pslld m1, 2 + paddd m0, m16 + paddd m1, m17 + pandn m16, m13, m0 + psrld m0, 12 + pandn m17, m13, m1 + psrld m1, 12 + paddd m2, m16, [t3+r10*4+416*4+ 0] ; a + paddd m3, m17, [t3+r10*4+416*4+64] + mova [t3+r10*4+416*4+ 0], m16 + mova [t3+r10*4+416*4+64], m17 + paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8) + paddd m17, m1, [t3+r10*4+416*8+64] + mova [t3+r10*4+416*8+ 0], m0 + mova [t3+r10*4+416*8+64], m1 + pmovzxbd m0, [dstq+r10+ 0] + pmovzxbd m1, [dstq+r10+16] + pmaddwd m2, m0 ; a * src + pmaddwd m3, m1 + packssdw m0, m1 + psubd m16, m2 ; b - a * src + (1 << 8) + psubd m17, m3 + psrad m16, 9 + psrad m17, 9 + packssdw m16, m17 + pmulhrsw m16, m15 + paddw m16, m0 + packuswb m16, m16 + vpermd m16, m22, m16 + mova [dstq+r10], ym16 + add r10, 32 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + pmovzxbd m0, [dstq+r10+ 0] + pmovzxbd m1, [dstq+r10+16] + pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src + pmaddwd m3, m1, [t3+r10*4+416*4+64] + mova m16, [t3+r10*4+416*8+ 0] ; b + 
(1 << 7) + mova m17, [t3+r10*4+416*8+64] + packssdw m0, m1 + psubd m16, m2 ; b - a * src + (1 << 7) + psubd m17, m3 + psrad m16, 8 + psrad m17, 8 + packssdw m16, m17 + pmulhrsw m16, m15 + paddw m16, m0 + packuswb m16, m16 + vpermd m16, m22, m16 + mova [dstq+r10], ym16 + add r10, 32 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti32x4 m5, [sgr_shuf+3] + add lpfq, wq + vbroadcasti32x4 m6, [sgr_shuf+5] + add dstq, wq + vbroadcasti32x4 m7, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m8, [pd_m9] + vpsubd m11, m4, [paramsq+4] {1to16} ; -s1 + vpbroadcastw m15, [paramsq+10] ; w1 + lea t1, [rsp+wq*2+20] + vpbroadcastd m10, [pw_164_455] + lea t3, [rsp+wq*4+16+416*12] + vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) + neg wq + vpbroadcastd m13, [pd_m4096] + mov r10d, 0xfe + vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) + kmovb k1, r10d + mova m18, [sgr_x_by_x+64*0] + mov r10, 0x3333333333333333 + mova m19, [sgr_x_by_x+64*1] + kmovq k2, r10 + mova m20, [sgr_x_by_x+64*2] + psllw m15, 4 + mova m21, [sgr_x_by_x+64*3] + lea r14, [r_ext_mask+75] + mova ym9, [sgr_shuf] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + add t1, 416*6 + call .h_top + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + add t4, strideq + mov [rsp], t4 ; below + mov t0, t2 + call .hv +.main: + mov t5, t3 + add t3, 416*4 + dec hd + jz .height1 + add lpfq, strideq + call .hv + call .prep_n + dec hd + jz .extend_bottom +.main_loop: + add lpfq, strideq + call .hv + call .n + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv_bottom + call .n + add lpfq, strideq + call .hv_bottom +.end: + call .n + RET +.height1: + call .v + call .prep_n + mov t2, t1 + call .v + jmp .end +.extend_bottom: + call .v + call .n + mov t2, t1 + call .v + jmp .end +.no_top: + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + lea t4, [t4+strideq*2] + mov [rsp], t4 + call .h + lea t0, [t1+416*6] + mov t2, t1 + call .v + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -33 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r14+r10-8] + vinserti32x8 m16, [r14+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m0, m17, m5 + pmullw m2, m0, m0 + pshufb m16, m17, m6 + paddw m0, m16 + pshufb m17, m7 + paddw m0, m17 ; sum + punpcklwd m3, m16, m17 + punpcklwd m1, m2, m4 + vpdpwssd m1, m3, m3 ; sumsq + punpckhwd m16, m17 + punpckhwd m2, m4 + vpdpwssd m2, m16, m16 + mova [t1+r10*2+416*0], m0 + mova [t1+r10*2+416*2], m1 + mova [t1+r10*2+416*4], m2 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + 
test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu ym17, [lpfq+r10-2] +.hv_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r14+r10-8] + vinserti32x8 m16, [r14+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv_have_right: + pshufb m0, m17, m5 + pmullw m3, m0, m0 + pshufb m1, m17, m6 + paddw m0, m1 + pshufb m17, m7 + paddw m0, m17 ; h sum + punpcklwd m16, m17, m1 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 ; h sumsq + punpckhwd m17, m1 + punpckhwd m3, m4 + vpdpwssd m3, m17, m17 + paddw m1, m0, [t2+r10*2+416*0] + paddw m1, [t1+r10*2+416*0] ; hv sum + paddd m16, m2, [t2+r10*2+416*2] + paddd m17, m3, [t2+r10*2+416*4] + paddd m16, [t1+r10*2+416*2] ; hv sumsq + paddd m17, [t1+r10*2+416*4] + mova [t0+r10*2+416*0], m0 + mova [t0+r10*2+416*2], m2 + mova [t0+r10*2+416*4], m3 + pmulld m16, m8 ; -a * 9 + pmulld m17, m8 + punpcklwd m0, m4, m1 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m16, [t1+r10*2+416*2] + mova m17, [t1+r10*2+416*4] + paddd m16, m16 + paddd m17, m17 + paddd m16, [t2+r10*2+416*2] ; hv sumsq + paddd m17, [t2+r10*2+416*4] + pmulld m16, m8 ; -a * 9 + pmulld m17, m8 + mova m1, [t1+r10*2+416*0] + paddw m1, m1 + paddw m1, [t2+r10*2+416*0] ; hv sum + punpcklwd m0, m4, m1 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq + mov t4, t3 + add t3, 416*4 +.prep_n_loop: + mova m2, [t5+r10*4+0] + mova m3, [t4+r10*4+0] + paddd m2, [t5+r10*4+8] + paddd m3, [t4+r10*4+8] + paddd m0, m2, [t5+r10*4+4] + paddd m1, m3, [t4+r10*4+4] + pslld m0, 2 + paddd m1, m1 ; ab[ 0] 222 + psubd m0, m2 ; ab[-1] 343 + mova [t3+r10*4+416*4], m1 + paddd m1, m1 + mova [t5+r10*4], m0 + psubd m1, m3 ; ab[ 0] 343 + mova [t4+r10*4], m1 + 
add r10, 16 + jl .prep_n_loop + ret +; a+b are packed together in a single dword, but we can't do the +; full neighbor calculations before splitting them since we don't +; have sufficient precision. The solution is to do the calculations +; in two equal halves and split a and b before doing the final sum. +ALIGN function_align +.n: ; neighbor + output + mov r10, wq +.n_loop: + mova m16, [t3+r10*4+ 0] + paddd m16, [t3+r10*4+ 8] + paddd m17, m16, [t3+r10*4+ 4] + paddd m17, m17 ; ab[+1] 222 + mova m2, [t3+r10*4+416*4+ 0] + paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 + mova m3, [t3+r10*4+416*4+64] + paddd m1, m3, [t5+r10*4+64] + mova [t3+r10*4+416*4+ 0], m17 + paddd m17, m17 + psubd m17, m16 ; ab[+1] 343 + mova [t5+r10*4+ 0], m17 + paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343 + mova m16, [t3+r10*4+64] + paddd m16, [t3+r10*4+72] + paddd m17, m16, [t3+r10*4+68] + paddd m17, m17 + mova [t3+r10*4+416*4+64], m17 + paddd m17, m17 + psubd m17, m16 + mova [t5+r10*4+64], m17 + pandn m16, m13, m0 + psrld m0, 12 + paddd m3, m17 + pandn m17, m13, m2 + psrld m2, 12 + paddd m16, m17 ; a + pandn m17, m13, m1 + psrld m1, 12 + paddd m0, m2 ; b + (1 << 8) + pandn m2, m13, m3 + psrld m3, 12 + paddd m17, m2 + pmovzxbd m2, [dstq+r10+ 0] + paddd m1, m3 + pmovzxbd m3, [dstq+r10+16] + pmaddwd m16, m2 ; a * src + pmaddwd m17, m3 + packssdw m2, m3 + psubd m0, m16 ; b - a * src + (1 << 8) + psubd m1, m17 + psrad m0, 9 + psrad m1, 9 + packssdw m0, m1 + pmulhrsw m0, m15 + paddw m0, m2 + packuswb m0, m0 + vpermd m16, m9, m0 + mova [dstq+r10], ym16 + add r10, 32 + jl .n_loop + mov r10, t5 + mov t5, t4 + mov t4, r10 + add dstq, strideq + ret + +cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti128 m5, [sgr_shuf+1] + add lpfq, wq + vbroadcasti128 m6, [sgr_shuf+9] + add dstq, wq + vbroadcasti128 m7, [sgr_shuf+3] + lea t3, [rsp+wq*4+416*24+8] + vbroadcasti128 m8, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m9, [pd_m9] + vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 + vpbroadcastd m14, [pw_61448] + vpsubd m12, m4, [paramsq+4] {1to16} ; -s1 + vpbroadcastd m26, [paramsq+8] ; w0 w1 + lea t1, [rsp+wq*2+12] + vpbroadcastd m10, [pd_m25] + neg wq + vpbroadcastd m13, [pw_164_455] + mov r10d, 0xfe + vpbroadcastd m15, [pd_34816] + kmovb k1, r10d + mova m20, [sgr_x_by_x+64*0] + mov r10, 0x3333333333333333 + mova m21, [sgr_x_by_x+64*1] + kmovq k2, r10 + mova m22, [sgr_x_by_x+64*2] + lea r12, [r_ext_mask+75] + mova m23, [sgr_x_by_x+64*3] + vpbroadcastd m24, [pd_m4096] + vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____ + psllw m26, 5 + mova xm27, [sgr_mix_perm] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup + add t1, 416*12 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + 
call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+416*12] + lea r10, [wq-2] +.top_fixup_loop: + mova m0, [t1+r10*2+416* 0] + mova m1, [t1+r10*2+416* 2] + mova m2, [t1+r10*2+416* 4] + paddw m0, m0 + mova m3, [t1+r10*2+416* 6] + paddd m1, m1 + mova m16, [t1+r10*2+416* 8] + paddd m2, m2 + mova m17, [t1+r10*2+416*10] + mova [t2+r10*2+416* 0], m0 + mova [t2+r10*2+416* 2], m1 + mova [t2+r10*2+416* 4], m2 + mova [t2+r10*2+416* 6], m3 + mova [t2+r10*2+416* 8], m16 + mova [t2+r10*2+416*10], m17 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsums + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m3, m17, m5 + pshufb m18, m17, m6 + shufps m0, m3, m18, q2121 + pmullw m2, m0, m0 + pshufb m19, m17, m7 + paddw m0, m19 + pshufb m17, m8 + paddw m0, m17 ; sum3 + punpcklwd m16, m19, m17 + punpcklwd m1, m2, m4 + vpdpwssd m1, m16, m16 ; sumsq3 + punpckhwd m19, m17 + punpckhwd m2, m4 + vpdpwssd m2, m19, m19 + mova [t1+r10*2+416* 6], m0 + mova [t1+r10*2+416* 8], m1 + mova [t1+r10*2+416*10], m2 + punpcklwd m19, m3, m18 + paddw m0, m3 + vpdpwssd m1, m19, m19 ; sumsq5 + punpckhwd m3, m18 + paddw m0, m18 ; sum5 + vpdpwssd m2, m3, m3 + mova [t1+r10*2+416* 0], m0 + mova [t1+r10*2+416* 2], m1 + mova [t1+r10*2+416* 4], m2 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv0_main +.hv0_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu ym17, [lpfq+r10-2] +.hv0_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -34 + jl .hv0_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv0_have_right: + pshufb m18, m17, m5 + pshufb m19, m17, m6 + shufps m1, m18, m19, q2121 + pmullw m3, m1, m1 + pshufb m0, m17, m7 + paddw m1, m0 + pshufb m17, m8 + paddw m1, m17 ; sum3 + punpcklwd m16, m0, m17 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 ; sumsq3 + punpckhwd m0, m17 + punpckhwd m3, m4 + vpdpwssd m3, m0, m0 + paddw m0, m1, [t1+r10*2+416* 6] + paddd m16, m2, [t1+r10*2+416* 8] + paddd m17, m3, [t1+r10*2+416*10] + mova [t1+r10*2+416* 6], m1 + mova [t1+r10*2+416* 8], m2 + mova [t1+r10*2+416*10], m3 + paddw m1, m18 + paddw m1, m19 ; sum5 + mova [t3+r10*4+416*8+ 8], m1 + paddw m1, [t1+r10*2+416* 0] + mova [t1+r10*2+416* 0], m1 + punpcklwd m1, m18, m19 + vpdpwssd m2, m1, m1 ; sumsq5 + punpckhwd m18, m19 + vpdpwssd m3, m18, m18 + mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of 
the last row + mova [t3+r10*4+416*0+72], m3 ; in case height is odd + paddd m2, [t1+r10*2+416* 2] + paddd m3, [t1+r10*2+416* 4] + mova [t1+r10*2+416* 2], m2 + mova [t1+r10*2+416* 4], m3 + paddw m1, m0, [t2+r10*2+416* 6] + paddd m2, m16, [t2+r10*2+416* 8] + paddd m3, m17, [t2+r10*2+416*10] + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416* 8], m16 + mova [t2+r10*2+416*10], m17 + pmulld m16, m2, m9 ; -a3 * 9 + pmulld m17, m3, m9 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m22 + paddusw m17, m14 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*4+ 8], m16 + mova [t3+r10*4+416*4+ 24], xm17 + vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2 + mova [t3+r10*4+416*4+ 72], m17 + vextracti128 [t3+r10*4+416*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*4+104], m16, 3 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv1_main +.hv1_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu ym17, [lpfq+r10-2] +.hv1_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -34 + jl .hv1_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv1_have_right: + pshufb m3, m17, m5 + pshufb m19, m17, m6 + shufps m2, m3, m19, q2121 + pmullw m1, m2, m2 + pshufb m18, m17, m7 + paddw m2, m18 + pshufb m17, m8 + paddw m2, m17 ; sum3 + punpcklwd m16, m17, m18 + punpcklwd m0, m1, m4 + vpdpwssd m0, m16, m16 ; sumsq3 + punpckhwd m17, m18 + punpckhwd m1, m4 + vpdpwssd m1, m17, m17 + paddd m16, m0, [t2+r10*2+416* 8] + paddd m17, m1, [t2+r10*2+416*10] + mova [t2+r10*2+416* 8], m0 + mova [t2+r10*2+416*10], m1 + punpcklwd m18, m3, m19 + vpdpwssd m0, m18, m18 ; sumsq5 + punpckhwd m18, m3, m19 + vpdpwssd m1, m18, m18 + paddw m3, m19 + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + paddd m18, m0, [t2+r10*2+416*2] + paddd m19, m1, [t2+r10*2+416*4] + paddd m18, [t1+r10*2+416*2] + paddd m19, [t1+r10*2+416*4] + mova [t2+r10*2+416*2], m0 + mova [t2+r10*2+416*4], m1 + pmulld m18, m10 ; -a5 * 25 + pmulld m19, m10 + paddw m1, m2, [t2+r10*2+416* 6] + mova [t2+r10*2+416* 6], m2 + paddw m2, m3 ; sum5 + paddw m3, m2, [t2+r10*2+416*0] + paddw m3, [t1+r10*2+416*0] + mova [t2+r10*2+416*0], m2 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + punpcklwd m2, m3, m4 ; b5 + vpdpwssd m18, m2, m2 ; -p5 + punpckhwd m3, m4 + vpdpwssd m19, m3, m3 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmulld m18, m11 ; p5 * s0 + pmulld m19, m11 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + pmaddwd m2, m13 ; b5 * 164 + pmaddwd m3, m13 + vpalignr m17{k2}, m16, m16, 2 + 
vpalignr m19{k2}, m18, m18, 2 + paddusw m17, m14 + mova m16, m22 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + paddusw m19, m14 + mova m18, m22 + psraw m19, 4 ; min(z5, 255) - 256 + vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255] + vpmovb2m k4, m19 + vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + vmovdqu8 m19{k4}, m18 ; x5 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + pandn m18, m24, m19 + psrld m19, 16 + pmulld m2, m18 + pmulld m3, m19 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*8+ 8], m16 + mova [t3+r10*4+416*8+ 24], xm17 + vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2 + paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m3, m15 + mova [t3+r10*4+416*8+ 72], m17 + vextracti128 [t3+r10*4+416*8+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*8+104], m16, 3 + vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12) + vpternlogd m19, m3, m24, 0xd8 + mova [t3+r10*4+416*0+ 8], m18 + mova [t3+r10*4+416*0+ 24], xm19 + vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2 + mova [t3+r10*4+416*0+ 72], m19 + vextracti128 [t3+r10*4+416*0+ 72], ym18, 1 + vextracti32x4 [t3+r10*4+416*0+104], m18, 3 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-2] +.v0_loop: + mova m2, [t1+r10*2+416* 8] + mova m3, [t1+r10*2+416*10] + paddd m2, m2 + paddd m3, m3 + paddd m16, m2, [t2+r10*2+416* 8] + paddd m17, m3, [t2+r10*2+416*10] + mova m0, [t1+r10*2+416* 6] + paddw m0, m0 + paddw m1, m0, [t2+r10*2+416* 6] + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416* 8], m2 + mova [t2+r10*2+416*10], m3 + mova m2, [t1+r10*2+416*0] + mova m3, [t1+r10*2+416*2] + mova m18, [t1+r10*2+416*4] + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + mova [t3+r10*4+416*8+ 8], m2 + mova [t3+r10*4+416*0+ 8], m3 + mova [t3+r10*4+416*0+72], m18 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m22 + paddusw m17, m14 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddw m2, m2 ; cc5 + paddd m3, m3 + paddd m18, m18 + mova [t1+r10*2+416*0], m2 + mova [t1+r10*2+416*2], m3 + mova [t1+r10*2+416*4], m18 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*4+ 8], m16 + mova [t3+r10*4+416*4+ 24], xm17 + vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2 + mova [t3+r10*4+416*4+ 72], m17 + vextracti128 [t3+r10*4+416*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*4+104], m16, 3 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-2] +.v1_loop: + mova m0, [t1+r10*2+416* 8] + paddd m16, m0, [t2+r10*2+416* 8] + mova m1, [t1+r10*2+416*10] + paddd m17, m1, [t2+r10*2+416*10] + mova m2, [t3+r10*4+416*0+ 8] + paddd m18, m2, [t2+r10*2+416* 2] + mova m3, [t3+r10*4+416*0+72] + paddd m19, m3, [t2+r10*2+416* 4] + paddd m18, [t1+r10*2+416* 2] + paddd m19, [t1+r10*2+416* 4] + mova 
[t2+r10*2+416* 8], m0 + mova [t2+r10*2+416*10], m1 + mova [t2+r10*2+416* 2], m2 + mova [t2+r10*2+416* 4], m3 + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + pmulld m18, m10 ; -a5 * 25 + pmulld m19, m10 + mova m0, [t1+r10*2+416* 6] + paddw m1, m0, [t2+r10*2+416* 6] + mova m2, [t3+r10*4+416*8+ 8] + paddw m3, m2, [t2+r10*2+416*0] + paddw m3, [t1+r10*2+416*0] + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416*0], m2 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + punpcklwd m2, m3, m4 ; b5 + vpdpwssd m18, m2, m2 ; -p5 + punpckhwd m3, m4 + vpdpwssd m19, m3, m3 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmulld m18, m11 ; p5 * s0 + pmulld m19, m11 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + pmaddwd m2, m13 ; b5 * 164 + pmaddwd m3, m13 + vpalignr m17{k2}, m16, m16, 2 + vpalignr m19{k2}, m18, m18, 2 + paddusw m17, m14 + mova m16, m22 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + paddusw m19, m14 + mova m18, m22 + psraw m19, 4 ; min(z5, 255) - 256 + vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255] + vpmovb2m k4, m19 + vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + vmovdqu8 m19{k4}, m18 ; x5 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + pandn m18, m24, m19 + psrld m19, m19, 16 + pmulld m2, m18 + pmulld m3, m19 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*8+ 8], m16 + mova [t3+r10*4+416*8+ 24], xm17 + vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2 + paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m3, m15 + mova [t3+r10*4+416*8+ 72], m17 + vextracti128 [t3+r10*4+416*8+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*8+104], m16, 3 + vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12) + vpternlogd m19, m3, m24, 0xd8 + mova [t3+r10*4+416*0+ 8], m18 + mova [t3+r10*4+416*0+ 24], xm19 + vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2 + mova [t3+r10*4+416*0+ 72], m19 + vextracti128 [t3+r10*4+416*0+ 72], ym18, 1 + vextracti32x4 [t3+r10*4+416*0+104], m18, 3 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+416*0+4] + paddd m1, m0, [t3+r10*4+416*0+0] + mova m16, [t3+r10*4+416*4+0] + paddd m1, [t3+r10*4+416*0+8] + mova m17, [t3+r10*4+416*8+0] + paddd m16, [t3+r10*4+416*4+8] + paddd m17, [t3+r10*4+416*8+8] + paddd m2, m16, [t3+r10*4+416*4+4] + paddd m3, m17, [t3+r10*4+416*8+4] + paddd m0, m1 + pslld m1, 2 + pslld m2, 2 + paddd m1, m0 ; ab5 565 + paddd m3, m3 ; ab3[ 0] 222 + psubd m2, m16 ; ab3[-1] 343 + mova [t3+r10*4+416*20], m3 + pandn m0, m24, m1 ; a5 565 + mova [t3+r10*4+416*24], m2 + psrld m1, 12 ; b5 565 + mova [t3+r10*4+416*12], m0 + paddd m3, m3 + mova [t3+r10*4+416*16], m1 + psubd m3, m17 ; ab3[ 0] 343 + mova [t3+r10*4+416*28], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m2, [t3+r10*4+4] + paddd m3, m2, [t3+r10*4+0] + paddd m3, [t3+r10*4+8] + mova m1, [t3+r10*4+416*4+0] + paddd m2, m3 + pslld m3, 2 + paddd m1, [t3+r10*4+416*4+8] + paddd m3, m2 + pandn m2, m24, m3 + psrld m3, 12 + paddd m0, m2, [t3+r10*4+416*12] ; a5 + paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8) + mova [t3+r10*4+416*12], m2 + mova [t3+r10*4+416*16], m3 + paddd m2, m1, [t3+r10*4+416*4+4] + paddd 
m2, m2 ; ab3[ 1] 222 + mova m3, [t3+r10*4+416*20] + paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+416*20], m2 + paddd m2, m2 + psubd m2, m1 ; ab3[ 1] 343 + mova [t3+r10*4+416*24], m2 + paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m1, m24, m17 + psrld m17, 12 + pandn m3, m24, m2 + psrld m2, 12 + paddd m1, m3 ; a3 + pmovzxbd m3, [dstq+r10] + paddd m17, m2 ; b3 + (1 << 8) + pmaddwd m0, m3 ; a5 * src + pmaddwd m1, m3 ; a3 * src + vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15) + psubd m16, m0 ; b5 - a5 * src + (1 << 8) + psubd m17, m1 ; b3 - a3 * src + (1 << 8) + psrld m16, 9 + pslld m17, 7 + vmovdqu8 m17{k2}, m16 + vpdpwssd m3, m17, m26 + packuswb m3, m2 + vpermb m16, m27, m3 + mova [dstq+r10], xm16 + add r10, 16 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m1, [t3+r10*4+416*8+0] + paddd m1, [t3+r10*4+416*8+8] + paddd m2, m1, [t3+r10*4+416*8+4] + paddd m2, m2 ; ab3[ 1] 222 + mova m0, [t3+r10*4+416*20] + paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343 + pmovzxbd m3, [dstq+r10] + mova [t3+r10*4+416*20], m2 + paddd m2, m2 + psubd m2, m1 ; ab3[ 1] 343 + mova [t3+r10*4+416*28], m2 + paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m1, m24, m17 + psrld m17, 12 + pandn m2, m24, m0 + psrld m0, 12 + paddd m1, m2 ; a3 + paddd m17, m0 ; b3 + (1 << 8) + mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7) + pmaddwd m1, m3 ; a3 * src + pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src + vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15) + psubd m17, m1 ; b3 - a3 * src + (1 << 8) + psubd m16, m0 ; b5 - a5 * src + (1 << 7) + pslld m17, 7 + palignr m17{k2}, m16, m16, 1 + vpdpwssd m3, m17, m26 + packuswb m3, m3 + vpermb m16, m27, m3 + mova [dstq+r10], xm16 + add r10, 16 + jl .n1_loop + add dstq, strideq + ret + +%endif ; ARCH_X86_64 |
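
Note on the packed a/b layout used by the SGR code above: the comment before the 3x3 filter's .n loop explains that a and b are kept packed in a single dword as a | (b << 12) (split apart with the pd_m4096 mask and a 12-bit shift), and that, because the fully summed neighbor values need 13 bits for a and 21 bits for b, the neighbor sums are computed in two halves that are split before the final addition. The following is a minimal scalar sketch of that scheme, written in C purely for illustration; the helper names are hypothetical and not part of dav1d.

#include <stdint.h>
#include <stdio.h>

#define AB_MASK 0xfffff000u            /* pd_m4096: selects the b field */

/* Pack a (12 bits) and b (20 bits) into one dword, mirroring a | (b << 12). */
static uint32_t pack_ab(uint32_t a, uint32_t b)
{
    return a | (b << 12);
}

/* Combine two partial neighbor sums that were accumulated on packed words:
 * a and b are split out of each half first (pandn with pd_m4096 / psrld 12
 * in the asm), then added, so the 13-bit a and 21-bit b totals never have
 * to fit back into the 12+20 packed layout. */
static void neighbor_sum(uint32_t half0, uint32_t half1,
                         uint32_t *a, uint32_t *b)
{
    *a = (half0 & ~AB_MASK) + (half1 & ~AB_MASK);
    *b = (half0 >> 12) + (half1 >> 12);
}

int main(void)
{
    /* Each half may itself be a sum of packed words, as long as its a and b
     * fields stay within 12 and 20 bits respectively. */
    uint32_t h0 = pack_ab(100, 2000) + pack_ab(120, 2100);
    uint32_t h1 = pack_ab(90, 1900);
    uint32_t a, b;
    neighbor_sum(h0, h1, &a, &b);
    printf("a = %u, b = %u\n", a, b);  /* prints a = 310, b = 6000 */
    return 0;
}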