Diffstat (limited to 'third_party/dav1d/src/x86/looprestoration.asm')
-rw-r--r-- | third_party/dav1d/src/x86/looprestoration.asm | 1158
1 file changed, 1158 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/looprestoration.asm b/third_party/dav1d/src/x86/looprestoration.asm
new file mode 100644
index 0000000000..fc6e9f124e
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration.asm
@@ -0,0 +1,1158 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_right_ext_mask: times 32 db 0xff
+                   times 32 db 0
+pb_14x0_1_2: times 14 db 0
+             db 1, 2
+pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
+                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_15: times 16 db 15
+pw_16: times 2 dw 16
+pw_256: times 2 dw 256
+pw_2048: times 2 dw 2048
+pw_16380: times 2 dw 16380
+pw_0_128: dw 0, 128
+pw_5_6: dw 5, 6
+pd_6: dd 6
+pd_1024: dd 1024
+pd_0xf0080029: dd 0xf0080029
+pd_0xf00801c7: dd 0xf00801c7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge
+    mov edged, edgem
+    vpbroadcastb m15, [fhq+0]
+    movifnidn wd, wm
+    vpbroadcastb m14, [fhq+2]
+    mov hd, hm
+    vpbroadcastb m13, [fhq+4]
+    vpbroadcastw m12, [fhq+6]
+    vpbroadcastd m11, [pw_2048]
+    vpbroadcastd m10, [pw_16380]
+    lea r11, [pb_right_ext_mask]
+
+    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+
+    ; if (edge & has_right) align_w_to_32
+    ; else w -= 32, and use that as limit in x loop
+    test edgeb, 2 ; has_right
+    jnz .align
+    mov xlimq, -3
+    jmp .loop
+.align:
+    add wd, 31
+    and wd, ~31
+    xor xlimd, xlimd
+
+    ; main y loop for horizontal filter
+.loop:
+    mov srcptrq, srcq
+    mov dstptrq, dstq
+    lea xq, [wq+xlimq]
+
+    ; load left edge pixels
+    test edgeb, 1 ; have_left
+    jz .emu_left
+    test leftq, leftq ; left == NULL for the edge-extended bottom/top
+    jz .load_left_combined
+    movd xm0, [leftq]
+    add leftq, 4
+    pinsrd xm0, [srcq], 1
+    pslldq xm0, 9
+    jmp .left_load_done
+.load_left_combined:
+    movq xm0, [srcq-3]
+    pslldq xm0, 10
+    jmp .left_load_done
+.emu_left:
+    movd xm0, [srcq]
+    pshufb xm0, [pb_14x0_1_2]
+
+    ; load right edge pixels
+.left_load_done:
+    cmp xd, 32
+    jg .main_load
+    test xd, xd
+    jg .load_and_splat
+    je .splat_right
+
+    ; for very small images (w=[1-2]), edge-extend the original cache,
+    ; ugly, but only runs in very odd cases
+    add wd, wd
+    pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
+    shr wd, 1
+
+    ; main x loop, mostly this starts in .main_load
+.splat_right:
+    ; no need to load new pixels, just extend them from the (possibly previously
+    ; extended) previous load into m0
+    pshufb xm1, xm0, [pb_15]
+    jmp .main_loop
+.load_and_splat:
+    ; load new pixels and extend edge for right-most
+    movu m1, [srcptrq+3]
+    sub r11, xq
+    movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
+    add r11, xq
+    vpbroadcastb m3, [srcptrq+2+xq]
+    pand m1, m2
+    pandn m3, m2, m3
+    por m1, m3
+    jmp .main_loop
+.main_load:
+    ; load subsequent line
+    movu m1, [srcptrq+3]
+.main_loop:
+    vinserti128 m0, xm1, 1
+
+    palignr m2, m1, m0, 10
+    palignr m3, m1, m0, 11
+    palignr m4, m1, m0, 12
+    palignr m5, m1, m0, 13
+    palignr m6, m1, m0, 14
+    palignr m7, m1, m0, 15
+
+    punpcklbw m0, m2, m1
+    punpckhbw m2, m1
+    punpcklbw m8, m3, m7
+    punpckhbw m3, m7
+    punpcklbw m7, m4, m6
+    punpckhbw m4, m6
+    pxor m9, m9
+    punpcklbw m6, m5, m9
+    punpckhbw m5, m9
+
+    pmaddubsw m0, m15
+    pmaddubsw m2, m15
+    pmaddubsw m8, m14
+    pmaddubsw m3, m14
+    pmaddubsw m7, m13
+    pmaddubsw m4, m13
+    paddw m0, m8
+    paddw m2, m3
+    psllw m8, m6, 7
+    psllw m3, m5, 7
+    psubw m8, m10
+    psubw m3, m10
+    pmullw m6, m12
+    pmullw m5, m12
+    paddw m0, m7
+    paddw m2, m4
+    paddw m0, m6
+    paddw m2, m5
+    ; for a signed overflow to happen we need filter and pixels as follows:
+    ; filter => -5,-23,-17,90,-17,-23,-5
+    ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0
+    ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6]
+    ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84]
+    ; 32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A]
+    ; => signed 16-bit overflow occurs
+    paddsw m0, m8 ; paddsw clips this range to [-8000;+7FFF]
+    paddsw m2, m3
+    psraw m0, 3 ; shift changes the range to [-1000;+FFF]
+    psraw m2, 3
+    paddw m0, m11 ; adding back 800 (removed in m8) changes the
+    paddw m2, m11 ; range to [-800;+17FF] as defined in the spec
+    mova [dstptrq], xm0 ; (note that adding another 800 would give us
+    mova [dstptrq+16], xm2 ; the same range as in the C code => [0;1FFF])
+    vextracti128 [dstptrq+32], m0, 1
+    vextracti128 [dstptrq+48], m2, 1
+    vextracti128 xm0, m1, 1
+    add srcptrq, 32
+    add dstptrq, 64
+    sub xq, 32
+    cmp xd, 32
+    jg .main_load
+    test xd, xd
+    jg .load_and_splat
+    cmp xd, xlimd
+    jg .splat_right
+
+    add srcq, strideq
+    add dstq, 384*2
+    dec hd
+    jg .loop
+    RET
+
+cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge
+    movifnidn fvq, fvmp
+    mov edged, edgem
+    movifnidn hd, hm
+    vpbroadcastd m10, [fvq]
+    vpbroadcastd m11, [fvq+4]
+    vpbroadcastd m0, [pw_0_128]
+    vpbroadcastd m12, [pd_1024]
+
+    DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
+    rorx ylimd, edged, 2
+    paddw m11, m0
+    and ylimd, 2 ; have_bottom
+    sub ylimd, 3
+
+    ; main x loop for vertical filter, does one column of 16 pixels
+.loop_x:
+    mova m3, [midq] ; middle line
+
+    ; load top pixels
+    test edgeb, 4 ; have_top
+    jz .emu_top
+    mova m0, [midq-384*4]
+    mova m2, [midq-384*2]
+    mova m1, m0
+    jmp .load_bottom_pixels
+.emu_top:
+    mova m0, m3
+    mova m1, m3
+    mova m2, m3
+
+    ; load bottom pixels
+.load_bottom_pixels:
+    mov yd, hd
+    mov mptrq, midq
+    mov dstptrq, dstq
+    add yd, ylimd
+    jg .load_threelines
+
+    ; the remainder here is somewhat messy but only runs in very weird
+    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
+    ; so performance is not terribly important here...
+    je .load_twolines
+    cmp yd, -1
+    je .load_oneline
+    ; h == 1 case
+    mova m5, m3
+    mova m4, m3
+    mova m6, m3
+    jmp .loop
+.load_oneline:
+    ; h == 2 case
+    mova m4, [midq+384*2]
+    mova m5, m4
+    mova m6, m4
+    jmp .loop
+.load_twolines:
+    ; h == 3 case
+    mova m4, [midq+384*2]
+    mova m5, [midq+384*4]
+    mova m6, m5
+    jmp .loop
+.load_threelines:
+    ; h > 3 case
+    mova m4, [midq+384*2]
+    mova m5, [midq+384*4]
+    ; third line loaded in main loop below
+
+    ; main y loop for vertical filter
+.loop_load:
+    ; load one line into m6. if that line is no longer available, do
+    ; nothing, since m6 still has the data from the previous line in it. We
+    ; try to structure the loop so that the common case is evaluated fastest
+    mova m6, [mptrq+384*6]
+.loop:
+    paddw m0, m6
+    paddw m7, m1, m5
+    paddw m8, m2, m4
+    punpcklwd m9, m0, m7
+    punpckhwd m0, m7
+    punpcklwd m7, m8, m3
+    punpckhwd m8, m3
+    pmaddwd m9, m10
+    pmaddwd m0, m10
+    pmaddwd m7, m11
+    pmaddwd m8, m11
+    add mptrq, 384*2
+    paddd m7, m9
+    paddd m0, m8
+    paddd m7, m12
+    paddd m0, m12
+    psrad m7, 11
+    psrad m0, 11
+    packssdw m7, m0
+    vextracti128 xm0, m7, 1
+    packuswb xm7, xm0
+    mova [dstptrq], xm7
+    ; shift pixels one position
+    mova m0, m1
+    mova m1, m2
+    mova m2, m3
+    mova m3, m4
+    mova m4, m5
+    mova m5, m6
+    add dstptrq, strideq
+    dec yd
+    jg .loop_load
+    ; for the bottom pixels, continue using m6 (as extended edge)
+    cmp yd, ylimd
+    jg .loop
+    add midq, 32
+    add dstq, 16
+    sub wd, 16
+    jg .loop_x
+    RET
+
+INIT_YMM avx2
+cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov xlimd, edgem
+    movifnidn wd, wm
+    mov hd, hm
+    mov edged, xlimd
+    and xlimd, 2 ; have_right
+    jz .no_right
+    add wd, 2+15
+    and wd, ~15
+.no_right:
+    lea r10, [pb_right_ext_mask+32]
+    xor xlimd, 2 ; 2*!have_right
+    pxor m1, m1
+    add srcq, wq
+    lea sumq, [sumq+wq*2-2]
+    lea sumsqq, [sumsqq+wq*4-4]
+    neg wq
+.loop_y:
+    mov xq, wq
+
+    ; load left
+    test edgeb, 1 ; have_left
+    jz .no_left
+    test leftq, leftq
+    jz .load_left_from_main
+    vpbroadcastw xm0, [leftq+2]
+    add leftq, 4
+    jmp .expand_x
+.no_left:
+    vpbroadcastb xm0, [srcq+xq]
+    jmp .expand_x
+.load_left_from_main:
+    vpbroadcastw xm0, [srcq+xq-2]
+.expand_x:
+    punpckhbw xm0, xm1
+
+    ; when we reach this, xm0 contains left two px in highest words
+    cmp xd, -16
+    jle .loop_x
+.partial_load_and_extend:
+    vpbroadcastb m3, [srcq-1]
+    pmovzxbw m2, [srcq+xq]
+    movu m4, [r10+xq*2]
+    punpcklbw m3, m1
+    pand m2, m4
+    pandn m4, m3
+    por m2, m4
+    jmp .loop_x_noload
+.right_extend:
+    psrldq xm2, xm0, 14
+    vpbroadcastw m2, xm2
+    jmp .loop_x_noload
+
+.loop_x:
+    pmovzxbw m2, [srcq+xq]
+.loop_x_noload:
+    vinserti128 m0, xm2, 1
+    palignr m3, m2, m0, 12
+    palignr m4, m2, m0, 14
+
+    punpcklwd m5, m3, m2
+    punpckhwd m6, m3, m2
+    paddw m3, m4
+    punpcklwd m0, m4, m1
+    punpckhwd m4, m1
+    pmaddwd m5, m5
+    pmaddwd m6, m6
+    pmaddwd m0, m0
+    pmaddwd m4, m4
+    paddw m3, m2
+    paddd m5, m0
+    vextracti128 xm0, m2, 1
+    paddd m6, m4
+    movu [sumq+xq*2], m3
+    movu [sumsqq+xq*4+ 0], xm5
+    movu [sumsqq+xq*4+16], xm6
+    vextracti128 [sumsqq+xq*4+32], m5, 1
+    vextracti128 [sumsqq+xq*4+48], m6, 1
+    add xq, 16
+
+    ; if x <= -16 we can reload more pixels
+    ; else if x < 0 we reload and extend (this implies have_right=0)
+    ; else if x < xlimd we extend from previous load (this implies have_right=0)
+    ; else we are done
+
+    cmp xd, -16
+    jle .loop_x
+    test xd, xd
+    jl .partial_load_and_extend
+    cmp xd, xlimd
+    jl .right_extend
+
+    add sumsqq, (384+16)*4
+    add sumq, (384+16)*2
+    add srcq, strideq
+    dec hd
+    jg .loop_y
+    RET
+
+INIT_YMM avx2
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn edged, edgem
+    mov xq, -2
+    rorx ylimd, edged, 2
+    and ylimd, 2 ; have_bottom
+    sub ylimd, 2 ; -2 if have_bottom=0, else 0
+.loop_x:
+    lea yd, [hq+ylimq+2]
+    lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+    lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
+    test edgeb, 4 ; have_top
+    jnz .load_top
+    movu m0, [sumsq_ptrq+(384+16)*4*1]
+    movu m1, [sumsq_ptrq+(384+16)*4*1+32]
+    movu m6, [sum_ptrq+(384+16)*2*1]
+    mova m2, m0
+    mova m3, m1
+    mova m4, m0
+    mova m5, m1
+    mova m7, m6
+    mova m8, m6
+    jmp .loop_y_noload
+.load_top:
+    movu m0, [sumsq_ptrq-(384+16)*4*1] ; l2sq [left]
+    movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l2sq [right]
+    movu m2, [sumsq_ptrq-(384+16)*4*0] ; l1sq [left]
+    movu m3, [sumsq_ptrq-(384+16)*4*0+32] ; l1sq [right]
+    movu m6, [sum_ptrq-(384+16)*2*1] ; l2
+    movu m7, [sum_ptrq-(384+16)*2*0] ; l1
+.loop_y:
+    movu m4, [sumsq_ptrq+(384+16)*4*1] ; l0sq [left]
+    movu m5, [sumsq_ptrq+(384+16)*4*1+32] ; l0sq [right]
+    movu m8, [sum_ptrq+(384+16)*2*1] ; l0
+.loop_y_noload:
+    paddd m0, m2
+    paddd m1, m3
+    paddw m6, m7
+    paddd m0, m4
+    paddd m1, m5
+    paddw m6, m8
+    movu [sumsq_ptrq+ 0], m0
+    movu [sumsq_ptrq+32], m1
+    movu [sum_ptrq], m6
+
+    ; shift position down by one
+    mova m0, m2
+    mova m1, m3
+    mova m2, m4
+    mova m3, m5
+    mova m6, m7
+    mova m7, m8
+    add sumsq_ptrq, (384+16)*4
+    add sum_ptrq, (384+16)*2
+    dec yd
+    jg .loop_y
+    cmp yd, ylimd
+    jg .loop_y_noload
+    add xd, 16
+    cmp xd, wd
+    jl .loop_x
+    RET
+
+INIT_YMM avx2
+cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
+    sub aq, (384+16-1)*4
+    sub bq, (384+16-1)*2
+    add hd, 2
+    lea r5, [sgr_x_by_x-0xf03]
+%ifidn sd, sm
+    movd xm6, sd
+    vpbroadcastd m6, xm6
+%else
+    vpbroadcastd m6, sm
+%endif
+    vpbroadcastd m8, [pd_0xf00801c7]
+    vpbroadcastd m9, [pw_256]
+    pcmpeqb m7, m7
+    psrld m10, m9, 13 ; pd_2048
+    DEFINE_ARGS a, b, w, h, x
+
+.loop_y:
+    mov xq, -2
+.loop_x:
+    pmovzxwd m0, [bq+xq*2]
+    pmovzxwd m1, [bq+xq*2+(384+16)*2]
+    movu m2, [aq+xq*4]
+    movu m3, [aq+xq*4+(384+16)*4]
+    pslld m4, m2, 3
+    pslld m5, m3, 3
+    paddd m2, m4 ; aa * 9
+    paddd m3, m5
+    pmaddwd m4, m0, m0
+    pmaddwd m5, m1, m1
+    pmaddwd m0, m8
+    pmaddwd m1, m8
+    psubd m2, m4 ; p = aa * 9 - bb * bb
+    psubd m3, m5
+    pmulld m2, m6
+    pmulld m3, m6
+    paddusw m2, m8
+    paddusw m3, m8
+    psrld m2, 20 ; z
+    psrld m3, 20
+    mova m5, m7
+    vpgatherdd m4, [r5+m2], m5 ; xx
+    mova m5, m7
+    vpgatherdd m2, [r5+m3], m5
+    psrld m4, 24
+    psrld m2, 24
+    pmulld m0, m4
+    pmulld m1, m2
+    packssdw m4, m2
+    psubw m4, m9, m4
+    vpermq m4, m4, q3120
+    paddd m0, m10
+    paddd m1, m10
+    psrld m0, 12
+    psrld m1, 12
+    movu [bq+xq*2], xm4
+    vextracti128 [bq+xq*2+(384+16)*2], m4, 1
+    movu [aq+xq*4], m0
+    movu [aq+xq*4+(384+16)*4], m1
+    add xd, 8
+    cmp xd, wd
+    jl .loop_x
+    add aq, (384+16)*4*2
+    add bq, (384+16)*2*2
+    sub hd, 2
+    jg .loop_y
+    RET
+
+INIT_YMM avx2
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+                                       tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+    movifnidn wd, wm
+    mov hd, hm
+    vpbroadcastd m15, [pw_16]
+    xor xd, xd
+.loop_x:
+    lea tmp_ptrq, [tq+xq*2]
+    lea src_ptrq, [srcq+xq*1]
+    lea a_ptrq, [aq+xq*4+(384+16)*4]
+    lea b_ptrq, [bq+xq*2+(384+16)*2]
+    movu m0, [aq+xq*4-(384+16)*4-4]
+    movu m2, [aq+xq*4-(384+16)*4+4]
+    mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half]
+    paddd m0, m2 ; a:tl+tr [first half]
+    movu m2, [aq+xq*4-(384+16)*4-4+32]
+    movu m4, [aq+xq*4-(384+16)*4+4+32]
+    mova m3, [aq+xq*4-(384+16)*4+32] ; a:top [second half]
+    paddd m2, m4 ; a:tl+tr [second half]
+    movu m4, [aq+xq*4-4]
+    movu m5, [aq+xq*4+4]
+    paddd m1, [aq+xq*4] ; a:top+ctr [first half]
+    paddd m4, m5 ; a:l+r [first half]
+    movu m5, [aq+xq*4+32-4]
+    movu m6, [aq+xq*4+32+4]
+    paddd m3, [aq+xq*4+32] ; a:top+ctr [second half]
+    paddd m5, m6 ; a:l+r [second half]
+
+    movu m6, [bq+xq*2-(384+16)*2-2]
+    movu m8, [bq+xq*2-(384+16)*2+2]
+    mova m7, [bq+xq*2-(384+16)*2] ; b:top
+    paddw m6, m8 ; b:tl+tr
+    movu m8, [bq+xq*2-2]
+    movu m9, [bq+xq*2+2]
+    paddw m7, [bq+xq*2] ; b:top+ctr
+    paddw m8, m9 ; b:l+r
+    mov yd, hd
+.loop_y:
+    movu m9, [b_ptrq-2]
+    movu m10, [b_ptrq+2]
+    paddw m7, [b_ptrq] ; b:top+ctr+bottom
+    paddw m9, m10 ; b:bl+br
+    paddw m10, m7, m8 ; b:top+ctr+bottom+l+r
+    paddw m6, m9 ; b:tl+tr+bl+br
+    psubw m7, [b_ptrq-(384+16)*2*2] ; b:ctr+bottom
+    paddw m10, m6
+    psllw m10, 2
+    psubw m10, m6 ; aa
+    pmovzxbw m12, [src_ptrq]
+    punpcklwd m6, m10, m15
+    punpckhwd m10, m15
+    punpcklwd m13, m12, m15
+    punpckhwd m12, m15
+    pmaddwd m6, m13 ; aa*src[x]+256 [first half]
+    pmaddwd m10, m12 ; aa*src[x]+256 [second half]
+
+    movu m11, [a_ptrq-4]
+    movu m12, [a_ptrq+4]
+    paddd m1, [a_ptrq] ; a:top+ctr+bottom [first half]
+    paddd m11, m12 ; a:bl+br [first half]
+    movu m12, [a_ptrq+32-4]
+    movu m13, [a_ptrq+32+4]
+    paddd m3, [a_ptrq+32] ; a:top+ctr+bottom [second half]
+    paddd m12, m13 ; a:bl+br [second half]
+    paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half]
+    paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half]
+    paddd m0, m11 ; a:tl+tr+bl+br [first half]
+    paddd m2, m12 ; a:tl+tr+bl+br [second half]
+    paddd m13, m0
+    paddd m14, m2
+    pslld m13, 2
+    pslld m14, 2
+    psubd m13, m0 ; bb [first half]
+    psubd m14, m2 ; bb [second half]
+    vperm2i128 m0, m13, m14, 0x31
+    vinserti128 m13, xm14, 1
+    psubd m1, [a_ptrq-(384+16)*4*2] ; a:ctr+bottom [first half]
+    psubd m3, [a_ptrq-(384+16)*4*2+32] ; a:ctr+bottom [second half]
+
+    paddd m6, m13
+    paddd m10, m0
+    psrad m6, 9
+    psrad m10, 9
+    packssdw m6, m10
+    mova [tmp_ptrq], m6
+
+    ; shift to next row
+    mova m0, m4
+    mova m2, m5
+    mova m4, m11
+    mova m5, m12
+    mova m6, m8
+    mova m8, m9
+
+    add a_ptrq, (384+16)*4
+    add b_ptrq, (384+16)*2
+    add tmp_ptrq, 384*2
+    add src_ptrq, strideq
+    dec yd
+    jg .loop_y
+    add xd, 16
+    cmp xd, wd
+    jl .loop_x
+    RET
+
+INIT_YMM avx2
+cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
+%ifidn wtd, wtm
+    shl wtd, 4
+    movd xm5, wtd
+    vpbroadcastw m5, xm5
+%else
+    vpbroadcastw m5, wtm
+    mov hd, hm
+    psllw m5, 4
+%endif
+    DEFINE_ARGS dst, stride, t, w, h, idx
+.loop_y:
+    xor idxd, idxd
+.loop_x:
+    mova m0, [tq+idxq*2+ 0]
+    mova m1, [tq+idxq*2+32]
+    pmovzxbw m2, [dstq+idxq+ 0]
+    pmovzxbw m3, [dstq+idxq+16]
+    psllw m4, m2, 4
+    psubw m0, m4
+    psllw m4, m3, 4
+    psubw m1, m4
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    paddw m0, m2
+    paddw m1, m3
+    packuswb m0, m1
+    vpermq m0, m0, q3120
+    mova [dstq+idxq], m0
+    add idxd, 32
+    cmp idxd, wd
+    jl .loop_x
+    add tq, 384*2
+    add dstq, strideq
+    dec hd
+    jg .loop_y
+    RET
+
+INIT_YMM avx2
+cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov edged, edgem
+    movifnidn wd, wm
+    mov hd, hm
+    test edgeb, 2 ; have_right
+    jz .no_right
+    xor xlimd, xlimd
+    add wd, 2+15
+    and wd, ~15
+    jmp .right_done
+.no_right:
+    mov xlimd, 3
+    sub wd, 1
+.right_done:
+    lea r10, [pb_right_ext_mask+32]
+    pxor m1, m1
+    lea srcq, [srcq+wq+1]
+    lea sumq, [sumq+wq*2-2]
+    lea sumsqq, [sumsqq+wq*4-4]
+    neg wq
+.loop_y:
+    mov xq, wq
+
+    ; load left
+    test edgeb, 1 ; have_left
+    jz .no_left
+    test leftq, leftq
+    jz .load_left_from_main
+    vpbroadcastd xm2, [leftq]
+    movd xm0, [srcq+xq-1]
+    add leftq, 4
+    palignr xm0, xm2, 1
+    jmp .expand_x
+.no_left:
+    vpbroadcastb xm0, [srcq+xq-1]
+    jmp .expand_x
+.load_left_from_main:
+    vpbroadcastd xm0, [srcq+xq-4]
+.expand_x:
+    punpckhbw xm0, xm1
+
+    ; when we reach this, xm0 contains left two px in highest words
+    cmp xd, -16
+    jle .loop_x
+    test xd, xd
+    jge .right_extend
+.partial_load_and_extend:
+    vpbroadcastb m3, [srcq-1]
+    pmovzxbw m2, [srcq+xq]
+    movu m4, [r10+xq*2]
+    punpcklbw m3, m1
+    pand m2, m4
+    pandn m4, m3
+    por m2, m4
+    jmp .loop_x_noload
+.right_extend:
+    psrldq xm2, xm0, 14
+    vpbroadcastw m2, xm2
+    jmp .loop_x_noload
+
+.loop_x:
+    pmovzxbw m2, [srcq+xq]
+.loop_x_noload:
+    vinserti128 m0, xm2, 1
+    palignr m3, m2, m0, 8
+    palignr m4, m2, m0, 10
+    palignr m5, m2, m0, 12
+    palignr m6, m2, m0, 14
+
+    paddw m0, m3, m2
+    punpcklwd m7, m3, m2
+    punpckhwd m3, m2
+    paddw m0, m4
+    punpcklwd m8, m4, m5
+    punpckhwd m4, m5
+    paddw m0, m5
+    punpcklwd m9, m6, m1
+    punpckhwd m5, m6, m1
+    paddw m0, m6
+    pmaddwd m7, m7
+    pmaddwd m3, m3
+    pmaddwd m8, m8
+    pmaddwd m4, m4
+    pmaddwd m9, m9
+    pmaddwd m5, m5
+    paddd m7, m8
+    paddd m3, m4
+    paddd m7, m9
+    paddd m3, m5
+    movu [sumq+xq*2], m0
+    movu [sumsqq+xq*4+ 0], xm7
+    movu [sumsqq+xq*4+16], xm3
+    vextracti128 [sumsqq+xq*4+32], m7, 1
+    vextracti128 [sumsqq+xq*4+48], m3, 1
+
+    vextracti128 xm0, m2, 1
+    add xq, 16
+
+    ; if x <= -16 we can reload more pixels
+    ; else if x < 0 we reload and extend (this implies have_right=0)
+    ; else if x < xlimd we extend from previous load (this implies have_right=0)
+    ; else we are done
+
+    cmp xd, -16
+    jle .loop_x
+    test xd, xd
+    jl .partial_load_and_extend
+    cmp xd, xlimd
+    jl .right_extend
+
+    add srcq, strideq
+    add sumsqq, (384+16)*4
+    add sumq, (384+16)*2
+    dec hd
+    jg .loop_y
+    RET
+
+INIT_YMM avx2
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn edged, edgem
+    mov xq, -2
+    rorx ylimd, edged, 2
+    and ylimd, 2 ; have_bottom
+    sub ylimd, 3 ; -3 if have_bottom=0, else -1
+.loop_x:
+    lea yd, [hq+ylimq+2]
+    lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+    lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
+    test edgeb, 4 ; have_top
+    jnz .load_top
+    movu m0, [sumsq_ptrq+(384+16)*4*1]
+    movu m1, [sumsq_ptrq+(384+16)*4*1+32]
+    movu m10, [sum_ptrq+(384+16)*2*1]
+    mova m2, m0
+    mova m3, m1
+    mova m4, m0
+    mova m5, m1
+    mova m6, m0
+    mova m7, m1
+    mova m11, m10
+    mova m12, m10
+    mova m13, m10
+    jmp .loop_y_second_load
+.load_top:
+    movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
+    movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l3/4sq [right]
+    movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
+    movu m5, [sumsq_ptrq-(384+16)*4*0+32] ; l2sq [right]
+    movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4
+    movu m12, [sum_ptrq-(384+16)*2*0] ; l2
+    mova m2, m0
+    mova m3, m1
+    mova m11, m10
+.loop_y:
+    movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
+    movu m7, [sumsq_ptrq+(384+16)*4*1+32] ; l1sq [right]
+    movu m13, [sum_ptrq+(384+16)*2*1] ; l1
+.loop_y_second_load:
+    test yd, yd
+    jle .emulate_second_load
+    movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
+    movu m9, [sumsq_ptrq+(384+16)*4*2+32] ; l0sq [right]
+    movu m14, [sum_ptrq+(384+16)*2*2] ; l0
+.loop_y_noload:
+    paddd m0, m2
+    paddd m1, m3
+    paddw m10, m11
+    paddd m0, m4
+    paddd m1, m5
+    paddw m10, m12
+    paddd m0, m6
+    paddd m1, m7
+    paddw m10, m13
+    paddd m0, m8
+    paddd m1, m9
+    paddw m10, m14
+    movu [sumsq_ptrq+ 0], m0
+    movu [sumsq_ptrq+32], m1
+    movu [sum_ptrq], m10
+
+    ; shift position down by one
+    mova m0, m4
+    mova m1, m5
+    mova m2, m6
+    mova m3, m7
+    mova m4, m8
+    mova m5, m9
+    mova m10, m12
+    mova m11, m13
+    mova m12, m14
+    add sumsq_ptrq, (384+16)*4*2
+    add sum_ptrq, (384+16)*2*2
+    sub yd, 2
+    jge .loop_y
+    ; l1 = l0
+    mova m6, m8
+    mova m7, m9
+    mova m13, m14
+    cmp yd, ylimd
+    jg .loop_y_noload
+    add xd, 16
+    cmp xd, wd
+    jl .loop_x
+    RET
+.emulate_second_load:
+    mova m8, m6
+    mova m9, m7
+    mova m14, m13
+    jmp .loop_y_noload
+
+INIT_YMM avx2
+cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
+    sub aq, (384+16-1)*4
+    sub bq, (384+16-1)*2
+    add hd, 2
+    lea r5, [sgr_x_by_x-0xf03]
+%ifidn sd, sm
+    movd xm6, sd
+    vpbroadcastd m6, xm6
+%else
+    vpbroadcastd m6, sm
+%endif
+    vpbroadcastd m8, [pd_0xf0080029]
+    vpbroadcastd m9, [pw_256]
+    pcmpeqb m7, m7
+    psrld m10, m9, 15 ; pd_512
+    DEFINE_ARGS a, b, w, h, x
+.loop_y:
+    mov xq, -2
+.loop_x:
+    pmovzxwd m0, [bq+xq*2+ 0]
+    pmovzxwd m1, [bq+xq*2+16]
+    movu m2, [aq+xq*4+ 0]
+    movu m3, [aq+xq*4+32]
+    pslld m4, m2, 3 ; aa * 8
+    pslld m5, m3, 3
+    paddd m2, m4 ; aa * 9
+    paddd m3, m5
+    paddd m4, m4 ; aa * 16
+    paddd m5, m5
+    paddd m2, m4 ; aa * 25
+    paddd m3, m5
+    pmaddwd m4, m0, m0
+    pmaddwd m5, m1, m1
+    psubd m2, m4 ; p = aa * 25 - bb * bb
+    psubd m3, m5
+    pmulld m2, m6
+    pmulld m3, m6
+    paddusw m2, m8
+    paddusw m3, m8
+    psrld m2, 20 ; z
+    psrld m3, 20
+    mova m5, m7
+    vpgatherdd m4, [r5+m2], m5 ; xx
+    mova m5, m7
+    vpgatherdd m2, [r5+m3], m5
+    psrld m4, 24
+    psrld m2, 24
+    packssdw m3, m4, m2
+    pmullw m4, m8
+    pmullw m2, m8
+    psubw m3, m9, m3
+    vpermq m3, m3, q3120
+    pmaddwd m0, m4
+    pmaddwd m1, m2
+    paddd m0, m10
+    paddd m1, m10
+    psrld m0, 10
+    psrld m1, 10
+    movu [bq+xq*2], m3
+    movu [aq+xq*4+ 0], m0
+    movu [aq+xq*4+32], m1
+    add xd, 16
+    cmp xd, wd
+    jl .loop_x
+    add aq, (384+16)*4*2
+    add bq, (384+16)*2*2
+    sub hd, 2
+    jg .loop_y
+    RET
+
+INIT_YMM avx2
+cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
+                                       tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+    movifnidn wd, wm
+    mov hd, hm
+    vpbroadcastd m9, [pw_5_6]
+    vpbroadcastd m12, [pw_256]
+    psrlw m11, m12, 1 ; pw_128
+    psrlw m10, m12, 8 ; pw_1
+    xor xd, xd
+.loop_x:
+    lea tmp_ptrq, [tq+xq*2]
+    lea src_ptrq, [srcq+xq*1]
+    lea a_ptrq, [aq+xq*4+(384+16)*4]
+    lea b_ptrq, [bq+xq*2+(384+16)*2]
+    movu m0, [aq+xq*4-(384+16)*4-4]
+    mova m1, [aq+xq*4-(384+16)*4]
+    movu m2, [aq+xq*4-(384+16)*4+4]
+    movu m3, [aq+xq*4-(384+16)*4-4+32]
+    mova m4, [aq+xq*4-(384+16)*4+32]
+    movu m5, [aq+xq*4-(384+16)*4+4+32]
+    paddd m0, m2
+    paddd m3, m5
+    paddd m0, m1
+    paddd m3, m4
+    pslld m2, m0, 2
+    pslld m5, m3, 2
+    paddd m2, m0
+    paddd m5, m3
+    paddd m0, m2, m1 ; prev_odd_b [first half]
+    paddd m1, m5, m4 ; prev_odd_b [second half]
+    movu m3, [bq+xq*2-(384+16)*2-2]
+    mova m4, [bq+xq*2-(384+16)*2]
+    movu m5, [bq+xq*2-(384+16)*2+2]
+    paddw m3, m5
+    punpcklwd m5, m3, m4
+    punpckhwd m3, m4
+    pmaddwd m5, m9
+    pmaddwd m3, m9
+    packssdw m2, m5, m3 ; prev_odd_a
+    mov yd, hd
+.loop_y:
+    movu m3, [a_ptrq-4]
+    mova m4, [a_ptrq]
+    movu m5, [a_ptrq+4]
+    movu m6, [a_ptrq+32-4]
+    mova m7, [a_ptrq+32]
+    movu m8, [a_ptrq+32+4]
+    paddd m3, m5
+    paddd m6, m8
+    paddd m3, m4
+    paddd m6, m7
+    pslld m5, m3, 2
+    pslld m8, m6, 2
+    paddd m5, m3
+    paddd m8, m6
+    paddd m3, m5, m4 ; cur_odd_b [first half]
+    paddd m4, m8, m7 ; cur_odd_b [second half]
+    movu m5, [b_ptrq-2]
+    mova m6, [b_ptrq]
+    movu m7, [b_ptrq+2]
+    paddw m5, m7
+    punpcklwd m7, m5, m6
+    punpckhwd m5, m6
+    pmaddwd m7, m9
+    pmaddwd m5, m9
+    packssdw m5, m7, m5 ; cur_odd_a
+
+    paddd m0, m3 ; cur_even_b [first half]
+    paddd m1, m4 ; cur_even_b [second half]
+    paddw m2, m5 ; cur_even_a
+
+    pmovzxbw m6, [src_ptrq]
+    vperm2i128 m8, m0, m1, 0x31
+    vinserti128 m0, xm1, 1
+    punpcklwd m7, m6, m10
+    punpckhwd m6, m10
+    punpcklwd m1, m2, m12
+    punpckhwd m2, m12
+    pmaddwd m7, m1
+    pmaddwd m6, m2
+    paddd m7, m0
+    paddd m6, m8
+    psrad m7, 9
+    psrad m6, 9
+
+    pmovzxbw m8, [src_ptrq+strideq]
+    punpcklwd m0, m8, m10
+    punpckhwd m8, m10
+    punpcklwd m1, m5, m11
+    punpckhwd m2, m5, m11
+    pmaddwd m0, m1
+    pmaddwd m8, m2
+    vinserti128 m2, m3, xm4, 1
+    vperm2i128 m1, m3, m4, 0x31
+    paddd m0, m2
+    paddd m8, m1
+    psrad m0, 8
+    psrad m8, 8
+
+    packssdw m7, m6
+    packssdw m0, m8
+    mova [tmp_ptrq+384*2*0], m7
+    mova [tmp_ptrq+384*2*1], m0
+
+    mova m0, m3
+    mova m1, m4
+    mova m2, m5
+    add a_ptrq, (384+16)*4*2
+    add b_ptrq, (384+16)*2*2
+    add tmp_ptrq, 384*2*2
+    lea src_ptrq, [src_ptrq+strideq*2]
+    sub yd, 2
+    jg .loop_y
+    add xd, 16
+    cmp xd, wd
+    jl .loop_x
+    RET
+
+INIT_YMM avx2
+cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt
+    movifnidn wd, wm
+    movifnidn hd, hm
+    vpbroadcastd m0, wtm
+    vpbroadcastd m10, [pd_1024]
+    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
+.loop_y:
+    xor idxd, idxd
+.loop_x:
+    mova m1, [t1q+idxq*2+ 0]
+    mova m2, [t1q+idxq*2+32]
+    mova m3, [t2q+idxq*2+ 0]
+    mova m4, [t2q+idxq*2+32]
+    pmovzxbw m5, [dstq+idxq+ 0]
+    pmovzxbw m6, [dstq+idxq+16]
+    psllw m7, m5, 4
+    psllw m8, m6, 4
+    psubw m1, m7
+    psubw m2, m8
+    psubw m3, m7
+    psubw m4, m8
+    punpcklwd m9, m1, m3
+    punpckhwd m1, m3
+    punpcklwd m3, m2, m4
+    punpckhwd m2, m4
+    pmaddwd m9, m0
+    pmaddwd m1, m0
+    pmaddwd m3, m0
+    pmaddwd m2, m0
+    paddd m9, m10
+    paddd m1, m10
+    paddd m3, m10
+    paddd m2, m10
+    psrad m9, 11
+    psrad m1, 11
+    psrad m3, 11
+    psrad m2, 11
+    packssdw m1, m9, m1
+    packssdw m2, m3, m2
+    paddw m1, m5
+    paddw m2, m6
+    packuswb m1, m2
+    vpermq m1, m1, q3120
+    mova [dstq+idxq], m1
+    add idxd, 32
+    cmp idxd, wd
+    jl .loop_x
+    add dstq, strideq
+    add t1q, 384 * 2
+    add t2q, 384 * 2
+    dec hd
+    jg .loop_y
+    RET
+%endif ; ARCH_X86_64
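
Note on the range comments in wiener_filter_h: the following C model is an illustration written for this page, not code from the diff, and all names in it are invented. dav1d's C reference clips (sum + (1 << 14) + 4) >> 3 to [0, 8191] for 8 bpc; the asm instead folds the bias and rounding into the pw_16380 constant, lets paddsw saturate in place of the explicit clip, then shifts and adds back 2048 (pw_2048), producing the same value minus 2048 (the vertical pass accounts for the offset).

    #include <stdint.h>

    /* Saturate to int16_t, which is what paddsw does for free. */
    static int sat16(int v) { return v < -32768 ? -32768 : v > 32767 ? 32767 : v; }

    /* One output pixel of the horizontal pass, 8 bpc. Assumption: fh[3]
     * excludes the implicit +128 center weight, which is added separately
     * here, matching the psllw-by-7 in the asm. */
    static int16_t wiener_h_model(const uint8_t px[7], const int16_t fh[7])
    {
        int sum = 0;
        for (int k = 0; k < 7; k++)
            sum += fh[k] * px[k];
        /* 16380 = (1 << 14) - 4: one constant carries both the spec's
         * +(1 << 14) bias and the +4 rounding, so 16-bit saturation clips
         * exactly where the C code's clip(..., 0, 8191) would. */
        const int v = sat16(sum + 128 * px[3] - 16380);
        return (int16_t)((v >> 3) + 2048); /* psraw 3; paddw pw_2048 */
    }

Because 16384 is a multiple of 8, (x - 16384) >> 3 equals (x >> 3) - 2048, so outside saturation the model returns the C value minus 2048; at the saturation points it returns -2048 and 6143, i.e. the [-800;+17FF] range quoted in the comments.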
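The sgr_box3_h/sgr_box3_v pair splits the 3x3 box sums into a horizontal 1x3 pass followed by an in-place vertical pass over the (384+16)-strided sum/sumsq buffers. A scalar C sketch of the same dataflow (illustrative only; helper names are invented and edge handling is omitted):

    #include <stdint.h>

    enum { SUM_STRIDE = 384 + 16 }; /* row stride of the asm buffers */

    /* Horizontal pass: 1x3 sums of pixels and squared pixels for one row.
     * src is assumed to be edge-extended by one pixel on each side. */
    static void box3_h_row(int32_t *sumsq, int16_t *sum, const uint8_t *src, int w)
    {
        for (int x = 0; x < w; x++) {
            const int a = src[x - 1], b = src[x], c = src[x + 1];
            sum[x]   = (int16_t)(a + b + c);
            sumsq[x] = a * a + b * b + c * c;
        }
    }

    /* Vertical pass: sum three rows in place. The two rows above the write
     * position are carried in locals, mirroring how the asm keeps them in
     * ymm registers so that already-overwritten rows are never re-read. */
    static void box3_v_col(int32_t *sumsq, int16_t *sum, int h)
    {
        int32_t sq_up2 = sumsq[-SUM_STRIDE], sq_up1 = sumsq[0];
        int     s_up2  = sum[-SUM_STRIDE],   s_up1  = sum[0];
        for (int y = 0; y < h; y++) {
            const int32_t sq_dn = sumsq[(y + 1) * SUM_STRIDE];
            const int     s_dn  = sum[(y + 1) * SUM_STRIDE];
            sumsq[y * SUM_STRIDE] = sq_up2 + sq_up1 + sq_dn;
            sum[y * SUM_STRIDE]   = (int16_t)(s_up2 + s_up1 + s_dn);
            sq_up2 = sq_up1; sq_up1 = sq_dn;
            s_up2  = s_up1;  s_up1  = s_dn;
        }
    }

The worst-case 3x3 pixel sum (9 * 255) fits in int16_t and the squared sum (9 * 255 * 255) in int32_t, which is why the asm can keep sums in words and squared sums in dwords.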
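Finally, the z computation in sgr_calc_ab1/sgr_calc_ab2 (the pmulld/paddusw/psrld/vpgatherdd sequence): per the comments in the code, p = n*aa - bb*bb with n = 9 or 25, z is the rounded high bits of p*s, and the coefficient comes from the sgr_x_by_x table. The word-wise saturating paddusw folds the rounding and the clamp of z to 255 into the gather index (via the 0xf00 bias in the constant and the sgr_x_by_x-0xf03 table base), and psrld by 24 extracts the gathered byte. A plain C sketch for the n = 9 case, with the rounding and clamp written out explicitly (signature invented; the +(1 << 19) rounding is an assumption based on the bit the packed constant adds at that position):

    #include <stdint.h>

    /* One a/b pair for the box3 (n = 9) case of sgr_calc_ab1. a is the box
     * sum of squares, b the box sum, s the SGR strength parameter. */
    static int sgr_x_lookup(int a, int b, unsigned s, const uint8_t x_by_x[256])
    {
        int p = a * 9 - b * b;    /* "p = aa * 9 - bb * bb" per the asm */
        if (p < 0) p = 0;         /* cannot go negative for real box sums */
        const unsigned z = ((unsigned)p * s + (1u << 19)) >> 20;
        return x_by_x[z > 255 ? 255 : z];
    }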