diff options
Diffstat (limited to 'third_party/dav1d/src/x86/cdef_avx2.asm')
-rw-r--r-- | third_party/dav1d/src/x86/cdef_avx2.asm | 1772 |
1 files changed, 1772 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm new file mode 100644 index 0000000000..1f30f8a3b7 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_avx2.asm @@ -0,0 +1,1772 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +%macro JMP_TABLE 2-* + %xdefine %1_jmptable %%table + %xdefine %%base mangle(private_prefix %+ _%1_avx2) + %%table: + %rep %0 - 1 + dd %%base %+ .%2 - %%table + %rotate 1 + %endrep +%endmacro + +%macro CDEF_FILTER_JMP_TABLE 1 +JMP_TABLE cdef_filter_%1_8bpc, \ + d6k0, d6k1, d7k0, d7k1, \ + d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \ + d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \ + d0k0, d0k1, d1k0, d1k1 +%endmacro + +SECTION_RODATA 32 + +pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 +blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00 + dd 0x80, 0x00, 0x00 +blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + dd 0x00, 0x00 +blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 + dd 0x0000 +blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 + dd 0x0000, 0x0000 +blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80 +blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000 +div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 +shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pw_128: times 2 dw 128 +pw_2048: times 2 dw 2048 +tap_table: ; masks for 8 bit shifts: entry n is 0xFF shifted right by n, indexed by the pri/sec shift + db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 + ; weights: pri_taps {4,2} and {3,3} at tap_table+8 (selected by pri & 1), sec_taps {2,1} at tap_table+12 + db 4, 2, 3, 3, 2, 1 + ; tap offsets start at tap_table+14: one (k=0, k=1) byte pair per entry, each written as row*16 + col + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + +CDEF_FILTER_JMP_TABLE 
4x4 +CDEF_FILTER_JMP_TABLE 4x8 +CDEF_FILTER_JMP_TABLE 8x8 + +SECTION .text + +%macro PREP_REGS 2 ; w, h + ; point dirq at jump table entry dir*2; ACCUMULATE_TAP_BYTE then reads [dirq+kq*4+tap_offset*2*4] + ; (the table starts with the d6/d7 entries, so tap_offset 0 selects dir - 2) + mov dird, r7m ; dir argument + lea tableq, [cdef_filter_%1x%2_8bpc_jmptable] + lea dirq, [tableq+dirq*2*4] +%if %1 == 4 + %if %2 == 4 + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \ + table, dir, dirjmp, stride3, k + %else + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \ + table, dir, dirjmp, dst4, stride3, k + lea dst4q, [dstq+strideq*4] + %endif +%else + DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \ + table, dir, dirjmp, top2, stride3, k + mov hq, -8 ; row counter; the caller adds 4 per pass and loops while negative + lea top1q, [top1q+strideq*0] ; leaves top1q unchanged + lea top2q, [top1q+strideq*1] +%endif +%if %1 == 4 + lea stride3q, [strideq*3] ; the 8-wide path computes stride*3 into r11 before this macro +%endif +%endmacro + +%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max + mov kd, 1 ; tap index k, counted down by the caller's k loop + pxor m15, m15 ; sum +%if %2 == 8 + pxor m12, m12 ; second sum accumulator, used by ACCUMULATE_TAP_BYTE when h == 8 + %if %1 == 4 + movd xm4, [dstq +strideq*0] + movd xm6, [dstq +strideq*1] + movd xm5, [dstq +strideq*2] + movd xm7, [dstq +stride3q ] + vinserti128 m4, [dst4q+strideq*0], 1 + vinserti128 m6, [dst4q+strideq*1], 1 + vinserti128 m5, [dst4q+strideq*2], 1 + vinserti128 m7, [dst4q+stride3q ], 1 + punpckldq m4, m6 + punpckldq m5, m7 + %else + movq xm4, [dstq+strideq*0] + movq xm5, [dstq+strideq*1] + vinserti128 m4, [dstq+strideq*2], 1 + vinserti128 m5, [dstq+stride3q ], 1 + %endif + punpcklqdq m4, m5 +%else + movd xm4, [dstq+strideq*0] + movd xm5, [dstq+strideq*1] + vinserti128 m4, [dstq+strideq*2], 1 + vinserti128 m5, [dstq+stride3q ], 1 + punpckldq m4, m5 +%endif +%if %3 == 1 + mova m7, m4 ; max (updated with pmaxub in ACCUMULATE_TAP_BYTE) + mova m8, m4 ; min (updated with pminub in ACCUMULATE_TAP_BYTE) +%endif +%endmacro + +%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength + ; mul_tap, w, h, clip + ; load p0/p1 + movsxd dirjmpq, [dirq+kq*4+%1*2*4] ; table entries are 32-bit offsets relative to the table + add dirjmpq, tableq + call dirjmpq ; the loader routine leaves the two tap pixel sets in m5 and m6 + +%if %8 == 1 + pmaxub m7, m5 + pminub m8, m5 + pmaxub m7, m6 + pminub m8, m6 +%endif + + ; accumulate sum[m15] over p0/p1 +%if %7 == 4 + punpcklbw m5, m6 + punpcklbw m6, m4, m4 + psubusb m9, m5, m6 + psubusb 
m5, m6, m5 + por m9, m5 ; abs_diff_p01(p01 - px) + pcmpeqb m5, m9 ; 0xFF where p is not above px + por m5, %5 ; negative byte where p is not above px, the tap weight elsewhere + psignb m6, %5, m5 ; tap weight carrying the sign of (p - px) + psrlw m5, m9, %2 ; emulate 8-bit shift + pand m5, %3 ; drop the bits shifted in from the neighbouring byte + psubusb m5, %4, m5 ; strength minus the shifted abs diff, saturating at 0 + pminub m5, m9 ; never exceed the abs diff itself + pmaddubsw m5, m6 ; multiply by the signed taps and add byte pairs into words + paddw m15, m5 +%else + psubusb m9, m5, m4 + psubusb m5, m4, m5 + psubusb m11, m6, m4 + psubusb m6, m4, m6 + por m9, m5 ; abs_diff_p0(p0 - px) + por m11, m6 ; abs_diff_p1(p1 - px) + pcmpeqb m5, m9 ; 0xFF where p0 is not above px + pcmpeqb m6, m11 ; 0xFF where p1 is not above px + punpckhbw m10, m9, m11 + punpcklbw m9, m11 + por m5, %5 + por m11, m6, %5 + punpckhbw m6, m5, m11 + punpcklbw m5, m11 + psignb m11, %5, m6 + psrlw m6, m10, %2 ; emulate 8-bit shift + pand m6, %3 + psubusb m6, %4, m6 + pminub m6, m10 + pmaddubsw m6, m11 + paddw m12, m6 ; high halves accumulate in m12 + psignb m11, %5, m5 + psrlw m5, m9, %2 ; emulate 8-bit shift + pand m5, %3 + psubusb m5, %4, m5 + pminub m5, m9 + pmaddubsw m5, m11 + paddw m15, m5 ; low halves accumulate in m15 +%endif +%endmacro + +%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip +%if %2 == 4 + %if %5 == 1 + punpcklbw m4, %3 + %endif + pcmpgtw %3, m15 ; overwrites the zero register with -1 where the sum is negative + paddw m15, %3 ; subtract 1 from negative sums before rounding + pmulhrsw m15, %4 ; rounding shift right by 4 when %4 holds 2048 + %if %5 == 0 + packsswb m15, m15 + paddb m4, m15 + %else + paddw m4, m15 + packuswb m4, m4 ; clip px in [0x0,0xff] + pminub m4, m7 ; clamp to the running max + pmaxub m4, m8 ; clamp to the running min + %endif + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + movd [dstq+strideq*2], xm5 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+stride3q ], xm5, 1 +%else + pcmpgtw m6, %3, m12 + pcmpgtw m5, %3, m15 + paddw m12, m6 + paddw m15, m5 + %if %5 == 1 + punpckhbw m5, m4, %3 + punpcklbw m4, %3 + %endif + pmulhrsw m12, %4 + pmulhrsw m15, %4 + %if %5 == 0 + packsswb m15, m12 + paddb m4, m15 + %else + paddw m5, m12 + paddw m4, m15 + packuswb m4, m5 ; clip px in [0x0,0xff] + pminub m4, m7 ; clamp to the running max + pmaxub m4, m8 ; clamp to the running min + %endif + vextracti128 xm5, m4, 1 + %if %1 == 4 + movd [dstq +strideq*0], xm4 + movd [dst4q+strideq*0], xm5 + pextrd [dstq +strideq*1], xm4, 1 + pextrd [dst4q+strideq*1], xm5, 1 + pextrd [dstq +strideq*2], xm4, 2 + pextrd [dst4q+strideq*2], xm5, 2 + pextrd [dstq +stride3q ], xm4, 3 + pextrd [dst4q+stride3q 
], xm5, 3 + %else + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm5 + movhps [dstq+strideq*1], xm4 + movhps [dstq+stride3q ], xm5 + %endif +%endif +%endmacro + +%macro BORDER_PREP_REGS 2 ; w, h + ; off1/2/3[k] are read later as byte [dirq+kq+tap_offset], with dirq = tableq+14+dir*2 + ; NOTE(review): tableq presumably holds tap_table here, whose offsets start 14 bytes in - confirm at the call site + mov dird, r7m ; dir argument + lea dirq, [tableq+dirq*2+14] +%if %1*%2*2/mmsize > 1 + %if %1 == 4 + DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off + %else + DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off + %endif + mov hd, %1*%2*2/mmsize ; presumably the number of passes over the block - confirm against the caller's loop +%else + DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off +%endif + lea stkq, [px] ; 16-bit pixel buffer on the stack, 32 bytes per row + pxor m11, m11 ; zero, compared against the sum in BORDER_ADJUST_PIXEL +%endmacro + +%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max + mov kd, 1 +%if %1 == 4 + movq xm4, [stkq+32*0] + movhps xm4, [stkq+32*1] + movq xm5, [stkq+32*2] + movhps xm5, [stkq+32*3] + vinserti128 m4, xm5, 1 +%else + mova xm4, [stkq+32*0] ; px + vinserti128 m4, [stkq+32*1], 1 +%endif + pxor m15, m15 ; sum +%if %3 == 1 + mova m7, m4 ; max + mova m8, m4 ; min +%endif +%endmacro + +%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength + ; mul_tap, w, clip + ; load p0/p1 + movsx offq, byte [dirq+kq+%1] ; off1 +%if %6 == 4 + movq xm5, [stkq+offq*2+32*0] ; p0 + movq xm6, [stkq+offq*2+32*2] + movhps xm5, [stkq+offq*2+32*1] + movhps xm6, [stkq+offq*2+32*3] + vinserti128 m5, xm6, 1 +%else + movu xm5, [stkq+offq*2+32*0] ; p0 + vinserti128 m5, [stkq+offq*2+32*1], 1 +%endif + neg offq ; -off1 +%if %6 == 4 + movq xm6, [stkq+offq*2+32*0] ; p1 + movq xm9, [stkq+offq*2+32*2] + movhps xm6, [stkq+offq*2+32*1] + movhps xm9, [stkq+offq*2+32*3] + vinserti128 m6, xm9, 1 +%else + movu xm6, [stkq+offq*2+32*0] ; p1 + vinserti128 m6, [stkq+offq*2+32*1], 1 +%endif +%if %7 == 1 + ; out of bounds values are set to a value that is both a large unsigned + ; value and a negative signed value. 
+ ; use signed max and unsigned min to remove them + pmaxsw m7, m5 ; max after p0 + pminuw m8, m5 ; min after p0 + pmaxsw m7, m6 ; max after p1 + pminuw m8, m6 ; min after p1 +%endif + + ; accumulate sum[m15] over p0/p1 + ; calculate difference before converting + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + + ; convert to 8-bits with signed saturation + ; saturating to large diffs has no impact on the results + packsswb m5, m6 + + ; group into pairs so we can accumulate using maddubsw + pshufb m5, m12 + pabsb m9, m5 + psignb m10, %5, m5 + psrlw m5, m9, %2 ; emulate 8-bit shift + pand m5, %3 + psubusb m5, %4, m5 + + ; use unsigned min since abs diff can equal 0x80 + pminub m5, m9 + pmaddubsw m5, m10 + paddw m15, m5 +%endmacro + +%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip + pcmpgtw m9, m11, m15 + paddw m15, m9 + pmulhrsw m15, %2 + paddw m4, m15 +%if %3 == 1 + pminsw m4, m7 + pmaxsw m4, m8 +%endif + packuswb m4, m4 + vextracti128 xm5, m4, 1 +%if %1 == 4 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + movd [dstq+strideq*2], xm5 + pextrd [dstq+stride3q ], xm5, 1 +%else + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 +%endif +%endmacro + +%macro CDEF_FILTER 2 ; w, h +INIT_YMM avx2 +cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%assign stack_offset_entry stack_offset + mov edged, edgem + cmp edged, 0xf + jne .border_block + + PUSH r11 + PUSH r12 +%if %2 == 4 +%assign regs_used 13 + ALLOC_STACK 0x60, 16 + pmovzxbw xm0, [leftq+1] + vpermq m0, m0, q0110 + psrldq m1, m0, 4 + vpalignr m2, m0, m0, 12 + movu [rsp+0x10], m0 + movu [rsp+0x28], m1 + movu [rsp+0x40], m2 +%elif %1 == 4 +%assign regs_used 14 + PUSH r13 + ALLOC_STACK 8*2+%1*%2*1, 16 + pmovzxwd m0, [leftq] + mova [rsp+0x10], m0 +%else +%assign regs_used 15 + PUSH r13 + PUSH r14 + ALLOC_STACK 8*4+%1*%2*2+32, 16 + lea r11, [strideq*3] + movu xm4, [dstq+strideq*2] + pmovzxwq m0, [leftq+0] + pmovzxwq m1, 
[leftq+8] + vinserti128 m4, [dstq+r11], 1 + pmovzxbd m2, [leftq+1] + pmovzxbd m3, [leftq+9] + mov [rsp+16], botq + mova [rsp+0x20], m0 + mova [rsp+0x40], m1 + mova [rsp+0x60], m2 + mova [rsp+0x80], m3 + mova [rsp+0xa0], m4 + lea botq, [dstq+strideq*4] +%endif + + DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping + mov dampingd, r8m + xor zerod, zerod + movifnidn prid, prim + sub dampingd, 31 + movifnidn secdmpd, secdmpm + test prid, prid + jz .sec_only + movd xm0, prid + lzcnt pridmpd, prid + add pridmpd, dampingd + cmovs pridmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + test secdmpd, secdmpd + jz .pri_only + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + mov [rsp+8], secdmpq ; sec_shift + + DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps + + PREP_REGS %1, %2 +%if %1*%2 > mmsize +.v_loop: +%endif + LOAD_BLOCK %1, %2, 1 +.k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0 + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2 + dec kq + jge .k_loop + + vpbroadcastd m10, [pw_2048] + pxor m9, m9 + ADJUST_PIXEL %1, %2, m9, m10, 1 +%if %1*%2 > mmsize + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] + add hq, 4 + jl .v_loop +%endif + RET + +.pri_only: + DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; 
pri_shift_mask + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir + vpbroadcastb m0, xm0 ; pri_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + PREP_REGS %1, %2 + vpbroadcastd m3, [pw_2048] + pxor m1, m1 +%if %1*%2 > mmsize +.pri_v_loop: +%endif + LOAD_BLOCK %1, %2 +.pri_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0 + dec kq + jge .pri_k_loop + ADJUST_PIXEL %1, %2, m1, m3 +%if %1*%2 > mmsize + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] + add hq, 4 + jl .pri_v_loop +%endif + RET + +.sec_only: + DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table + lea tableq, [tap_table] + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir + vpbroadcastb m1, xm1 ; sec_strength + lea secq, [tableq+12] ; sec_taps + PREP_REGS %1, %2 + vpbroadcastd m2, [pw_2048] + pxor m0, m0 +%if %1*%2 > mmsize +.sec_v_loop: +%endif + LOAD_BLOCK %1, %2 +.sec_k_loop: + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2 + dec kq + jge .sec_k_loop + ADJUST_PIXEL %1, %2, m0, m2 +%if %1*%2 > mmsize + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] + add hq, 4 + jl .sec_v_loop +%endif + RET + +.d0k0: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m6, [dstq+strideq*1-1] + vpbroadcastq m10, [dstq+strideq*2-1] + movd xm5, [topq+strideq*1+1] + movd xm9, [dstq+strideq*0+1] + psrldq m11, m6, 2 + psrldq m12, m10, 2 + vinserti128 m6, [dstq+stride3q -1], 1 + vinserti128 m10, [botq -1], 1 + vpblendd m5, m11, 0x10 + 
vpblendd m9, m12, 0x10 + movu m11, [blend_4x4+16] + punpckldq m6, m10 + punpckldq m5, m9 + vpblendvb m6, [rsp+gprsize+0x28], m11 + %else + movd xm5, [topq +strideq*1+1] + movq xm6, [dstq +strideq*1-1] + movq xm10, [dstq +stride3q -1] + movq xm11, [dst4q+strideq*1-1] + pinsrd xm5, [dstq +strideq*0+1], 1 + movhps xm6, [dstq +strideq*2-1] + movhps xm10, [dst4q+strideq*0-1] + movhps xm11, [dst4q+strideq*2-1] + psrldq xm9, xm6, 2 + shufps xm5, xm9, q2010 ; -1 +0 +1 +2 + shufps xm6, xm10, q2020 ; +1 +2 +3 +4 + psrldq xm9, xm11, 2 + psrldq xm10, 2 + shufps xm10, xm9, q2020 ; +3 +4 +5 +6 + movd xm9, [dst4q+stride3q -1] + pinsrd xm9, [botq -1], 1 + shufps xm11, xm9, q1020 ; +5 +6 +7 +8 + pmovzxbw m9, [leftq+3] + vinserti128 m6, xm11, 1 + movu m11, [blend_4x8_0+4] + vinserti128 m5, xm10, 1 + vpblendvb m6, m9, m11 + %endif +%else + lea r13, [blend_8x8_0+16] + movq xm5, [top2q +1] + vbroadcasti128 m10, [dstq+strideq*1-1] + vbroadcasti128 m11, [dstq+strideq*2-1] + movhps xm5, [dstq+strideq*0+1] + vinserti128 m6, m10, [dstq+stride3q-1], 1 + vinserti128 m9, m11, [botq -1], 1 + psrldq m10, 2 + psrldq m11, 2 + punpcklqdq m6, m9 + movu m9, [r13+hq*2*1+16*1] + punpcklqdq m10, m11 + vpblendd m5, m10, 0xF0 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9 +%endif + ret +.d1k0: +.d2k0: +.d3k0: +%if %1 == 4 + %if %2 == 4 + movq xm6, [dstq+strideq*0-1] + movq xm9, [dstq+strideq*1-1] + vinserti128 m6, [dstq+strideq*2-1], 1 + vinserti128 m9, [dstq+stride3q -1], 1 + movu m11, [rsp+gprsize+0x10] + pcmpeqd m12, m12 + psrldq m5, m6, 2 + psrldq m10, m9, 2 + psrld m12, 24 + punpckldq m6, m9 + punpckldq m5, m10 + vpblendvb m6, m11, m12 + %else + movq xm6, [dstq +strideq*0-1] + movq xm9, [dstq +strideq*2-1] + movhps xm6, [dstq +strideq*1-1] + movhps xm9, [dstq +stride3q -1] + movq xm10, [dst4q+strideq*0-1] + movhps xm10, [dst4q+strideq*1-1] + psrldq xm5, xm6, 2 + psrldq xm11, xm9, 2 + shufps xm5, xm11, q2020 + movq xm11, [dst4q+strideq*2-1] + movhps xm11, [dst4q+stride3q -1] + shufps xm6, xm9, 
q2020 + shufps xm9, xm10, xm11, q2020 + vinserti128 m6, xm9, 1 + pmovzxbw m9, [leftq+1] + psrldq xm10, 2 + psrldq xm11, 2 + shufps xm10, xm11, q2020 + vpbroadcastd m11, [blend_4x8_0+4] + vinserti128 m5, xm10, 1 + vpblendvb m6, m9, m11 + %endif +%else + movu xm5, [dstq+strideq*0-1] + movu xm9, [dstq+strideq*1-1] + vinserti128 m5, [dstq+strideq*2-1], 1 + vinserti128 m9, [dstq+stride3q -1], 1 + movu m10, [blend_8x8_0+16] + punpcklqdq m6, m5, m9 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10 + psrldq m5, 2 + psrldq m9, 2 + punpcklqdq m5, m9 +%endif + ret +.d4k0: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m10, [dstq+strideq*1-1] + vpbroadcastq m11, [dstq+strideq*2-1] + movd xm6, [topq+strideq*1-1] + movd xm9, [dstq+strideq*0-1] + psrldq m5, m10, 2 + psrldq m12, m11, 2 + vpblendd m6, m10, 0x10 + vpblendd m9, m11, 0x10 + movu m10, [blend_4x4] + vinserti128 m5, [dstq+stride3q +1], 1 + vinserti128 m12, [botq +1], 1 + punpckldq m6, m9 + punpckldq m5, m12 + vpblendvb m6, [rsp+gprsize+0x40], m10 + %else + movd xm6, [topq +strideq*1-1] + movq xm9, [dstq +strideq*1-1] + movq xm10, [dstq +stride3q -1] + movq xm11, [dst4q+strideq*1-1] + pinsrd xm6, [dstq +strideq*0-1], 1 + movhps xm9, [dstq +strideq*2-1] + movhps xm10, [dst4q+strideq*0-1] + movhps xm11, [dst4q+strideq*2-1] + psrldq xm5, xm9, 2 + shufps xm6, xm9, q2010 + psrldq xm9, xm10, 2 + shufps xm5, xm9, q2020 + shufps xm10, xm11, q2020 + movd xm9, [dst4q+stride3q +1] + vinserti128 m6, xm10, 1 + pinsrd xm9, [botq +1], 1 + psrldq xm11, 2 + pmovzxbw m10, [leftq-1] + shufps xm11, xm9, q1020 + movu m9, [blend_4x8_0] + vinserti128 m5, xm11, 1 + vpblendvb m6, m10, m9 + %endif +%else + lea r13, [blend_8x8_0+8] + movq xm6, [top2q -1] + vbroadcasti128 m5, [dstq+strideq*1-1] + vbroadcasti128 m9, [dstq+strideq*2-1] + movhps xm6, [dstq+strideq*0-1] + movu m11, [r13+hq*2*1+16*1] + punpcklqdq m10, m5, m9 + vinserti128 m5, [dstq+stride3q -1], 1 + vinserti128 m9, [botq -1], 1 + vpblendd m6, m10, 0xF0 + vpblendvb m6, 
[rsp+gprsize+0x60+hq*8+64-8*1], m11 + psrldq m5, 2 + psrldq m9, 2 + punpcklqdq m5, m9 +%endif + ret +.d5k0: +.d6k0: +.d7k0: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq+strideq*1 ] + vpbroadcastd m5, [dstq+strideq*1 ] + vpbroadcastd m9, [dstq+strideq*2 ] + vpblendd xm6, [dstq+strideq*0-4], 0x2 + vpblendd m5, m9, 0x22 + vpblendd m6, m5, 0x30 + vinserti128 m5, [dstq+stride3q ], 1 + vpblendd m5, [botq -20], 0x20 + %else + movd xm6, [topq +strideq*1] + movd xm5, [dstq +strideq*1] + movd xm9, [dstq +stride3q ] + movd xm10, [dst4q+strideq*1] + movd xm11, [dst4q+stride3q ] + pinsrd xm6, [dstq +strideq*0], 1 + pinsrd xm5, [dstq +strideq*2], 1 + pinsrd xm9, [dst4q+strideq*0], 1 + pinsrd xm10, [dst4q+strideq*2], 1 + pinsrd xm11, [botq ], 1 + punpcklqdq xm6, xm5 + punpcklqdq xm5, xm9 + punpcklqdq xm9, xm10 + punpcklqdq xm10, xm11 + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + %endif +%else + movq xm6, [top2q ] + movq xm5, [dstq+strideq*1] + movq xm9, [dstq+stride3q ] + movhps xm6, [dstq+strideq*0] + movhps xm5, [dstq+strideq*2] + movhps xm9, [botq ] + vinserti128 m6, xm5, 1 + vinserti128 m5, xm9, 1 +%endif + ret +.d0k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [dstq+strideq*2-2] + movd xm9, [dstq+stride3q -2] + movd xm5, [topq+strideq*0+2] + movd xm10, [topq+strideq*1+2] + pinsrw xm6, [leftq+4], 0 + pinsrw xm9, [leftq+6], 0 + vinserti128 m5, [dstq+strideq*0+2], 1 + vinserti128 m10, [dstq+strideq*1+2], 1 + vinserti128 m6, [botq+strideq*0-2], 1 + vinserti128 m9, [botq+strideq*1-2], 1 + punpckldq m5, m10 + punpckldq m6, m9 + %else + movq xm6, [dstq +strideq*2-2] + movd xm10, [dst4q+strideq*2-2] + movd xm5, [topq +strideq*0+2] + movq xm9, [dst4q+strideq*0-2] + movhps xm6, [dstq +stride3q -2] + pinsrw xm10, [dst4q+stride3q ], 3 + pinsrd xm5, [topq +strideq*1+2], 1 + movhps xm9, [dst4q+strideq*1-2] + pinsrd xm10, [botq +strideq*0-2], 2 + pinsrd xm5, [dstq +strideq*0+2], 2 + pinsrd xm10, [botq +strideq*1-2], 3 + pinsrd xm5, [dstq +strideq*1+2], 3 + shufps xm11, xm6, xm9, q3131 + 
shufps xm6, xm9, q2020 + movu m9, [blend_4x8_3+8] + vinserti128 m6, xm10, 1 + vinserti128 m5, xm11, 1 + vpblendvb m6, [rsp+gprsize+0x10+8], m9 + %endif +%else + lea r13, [blend_8x8_1+16] + movq xm6, [dstq+strideq*2-2] + movq xm9, [dstq+stride3q -2] + movq xm5, [top1q +2] + movq xm10, [top2q +2] + movu m11, [r13+hq*2*2+16*2] + vinserti128 m6, [botq+strideq*0-2], 1 + vinserti128 m9, [botq+strideq*1-2], 1 + vinserti128 m5, [dstq+strideq*0+2], 1 + vinserti128 m10, [dstq+strideq*1+2], 1 + punpcklqdq m6, m9 + punpcklqdq m5, m10 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11 +%endif + ret +.d1k1: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m6, [dstq+strideq*1-2] + vpbroadcastq m9, [dstq+strideq*2-2] + movd xm5, [topq+strideq*1+2] + movd xm10, [dstq+strideq*0+2] + psrldq m11, m6, 4 + psrldq m12, m9, 4 + vpblendd m5, m11, 0x10 + movq xm11, [leftq+2] + vinserti128 m6, [dstq+stride3q-2], 1 + punpckldq xm11, xm11 + vpblendd m10, m12, 0x10 + pcmpeqd m12, m12 + pmovzxwd m11, xm11 + psrld m12, 16 + punpckldq m6, m9 + vpbroadcastd m9, [botq-2] + vpblendvb m6, m11, m12 + punpckldq m5, m10 + vpblendd m6, m9, 0x20 + %else + movd xm5, [topq +strideq*1+2] + movq xm6, [dstq +strideq*1-2] + movq xm9, [dstq +stride3q -2] + movq xm10, [dst4q+strideq*1-2] + movd xm11, [dst4q+stride3q -2] + pinsrd xm5, [dstq +strideq*0+2], 1 + movhps xm6, [dstq +strideq*2-2] + movhps xm9, [dst4q+strideq*0-2] + movhps xm10, [dst4q+strideq*2-2] + pinsrd xm11, [botq -2], 1 + shufps xm5, xm6, q3110 + shufps xm6, xm9, q2020 + shufps xm9, xm10, q3131 + shufps xm10, xm11, q1020 + movu m11, [blend_4x8_2+4] + vinserti128 m6, xm10, 1 + vinserti128 m5, xm9, 1 + vpblendvb m6, [rsp+gprsize+0x10+4], m11 + %endif +%else + lea r13, [blend_8x8_1+16] + movq xm5, [top2q +2] + vbroadcasti128 m6, [dstq+strideq*1-2] + vbroadcasti128 m9, [dstq+strideq*2-2] + movhps xm5, [dstq+strideq*0+2] + shufps m10, m6, m9, q2121 + vinserti128 m6, [dstq+stride3q -2], 1 + vinserti128 m9, [botq -2], 1 + movu m11, [r13+hq*2*1+16*1] + vpblendd 
m5, m10, 0xF0 + punpcklqdq m6, m9 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11 +%endif + ret +.d2k1: +%if %1 == 4 + %if %2 == 4 + movq xm11, [leftq] + movq xm6, [dstq+strideq*0-2] + movq xm9, [dstq+strideq*1-2] + vinserti128 m6, [dstq+strideq*2-2], 1 + vinserti128 m9, [dstq+stride3q -2], 1 + punpckldq xm11, xm11 + psrldq m5, m6, 4 + psrldq m10, m9, 4 + pmovzxwd m11, xm11 + punpckldq m6, m9 + punpckldq m5, m10 + pblendw m6, m11, 0x05 + %else + movq xm5, [dstq +strideq*0-2] + movq xm9, [dstq +strideq*2-2] + movq xm10, [dst4q+strideq*0-2] + movq xm11, [dst4q+strideq*2-2] + movhps xm5, [dstq +strideq*1-2] + movhps xm9, [dstq +stride3q -2] + movhps xm10, [dst4q+strideq*1-2] + movhps xm11, [dst4q+stride3q -2] + shufps xm6, xm5, xm9, q2020 + shufps xm5, xm9, q3131 + shufps xm9, xm10, xm11, q2020 + shufps xm10, xm11, q3131 + pmovzxwd m11, [leftq] + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + pblendw m6, m11, 0x55 + %endif +%else + mova m11, [rsp+gprsize+0x20+hq*8+64] + movu xm5, [dstq+strideq*0-2] + movu xm9, [dstq+strideq*1-2] + vinserti128 m5, [dstq+strideq*2-2], 1 + vinserti128 m9, [dstq+stride3q -2], 1 + shufps m6, m5, m9, q1010 + shufps m5, m9, q2121 + pblendw m6, m11, 0x11 +%endif + ret +.d3k1: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m11, [dstq+strideq*1-2] + vpbroadcastq m12, [dstq+strideq*2-2] + movd xm6, [topq+strideq*1-2] + movd xm9, [dstq+strideq*0-2] + pblendw m11, [leftq-16+2], 0x01 + pblendw m12, [leftq-16+4], 0x01 + pinsrw xm9, [leftq- 0+0], 0 + psrldq m5, m11, 4 + psrldq m10, m12, 4 + vinserti128 m5, [dstq+stride3q +2], 1 + vinserti128 m10, [botq +2], 1 + vpblendd m6, m11, 0x10 + vpblendd m9, m12, 0x10 + punpckldq m6, m9 + punpckldq m5, m10 + %else + movd xm6, [topq +strideq*1-2] + movq xm5, [dstq +strideq*1-2] + movq xm9, [dstq +stride3q -2] + movq xm10, [dst4q+strideq*1-2] + movd xm11, [dst4q+stride3q +2] + pinsrw xm6, [dstq +strideq*0 ], 3 + movhps xm5, [dstq +strideq*2-2] + movhps xm9, [dst4q+strideq*0-2] + movhps xm10, 
[dst4q+strideq*2-2] + pinsrd xm11, [botq +2], 1 + shufps xm6, xm5, q2010 + shufps xm5, xm9, q3131 + shufps xm9, xm10, q2020 + shufps xm10, xm11, q1031 + movu m11, [blend_4x8_2] + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + vpblendvb m6, [rsp+gprsize+0x10-4], m11 + %endif +%else + lea r13, [blend_8x8_1+8] + movq xm6, [top2q -2] + vbroadcasti128 m5, [dstq+strideq*1-2] + vbroadcasti128 m10, [dstq+strideq*2-2] + movhps xm6, [dstq+strideq*0-2] + punpcklqdq m9, m5, m10 + vinserti128 m5, [dstq+stride3q -2], 1 + vinserti128 m10, [botq -2], 1 + movu m11, [r13+hq*2*1+16*1] + vpblendd m6, m9, 0xF0 + shufps m5, m10, q2121 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11 +%endif + ret +.d4k1: +%if %1 == 4 + %if %2 == 4 + vinserti128 m6, [dstq+strideq*0-2], 1 + vinserti128 m9, [dstq+strideq*1-2], 1 + movd xm5, [dstq+strideq*2+2] + movd xm10, [dstq+stride3q +2] + pblendw m6, [leftq-16+0], 0x01 + pblendw m9, [leftq-16+2], 0x01 + vinserti128 m5, [botq+strideq*0+2], 1 + vinserti128 m10, [botq+strideq*1+2], 1 + vpblendd m6, [topq+strideq*0-2], 0x01 + vpblendd m9, [topq+strideq*1-2], 0x01 + punpckldq m5, m10 + punpckldq m6, m9 + %else + movd xm6, [topq +strideq*0-2] + movq xm5, [dstq +strideq*2-2] + movq xm9, [dst4q+strideq*0-2] + movd xm10, [dst4q+strideq*2+2] + pinsrd xm6, [topq +strideq*1-2], 1 + movhps xm5, [dstq +stride3q -2] + movhps xm9, [dst4q+strideq*1-2] + pinsrd xm10, [dst4q+stride3q +2], 1 + pinsrd xm6, [dstq +strideq*0-2], 2 + pinsrd xm10, [botq +strideq*0+2], 2 + pinsrd xm6, [dstq +strideq*1-2], 3 + pinsrd xm10, [botq +strideq*1+2], 3 + shufps xm11, xm5, xm9, q2020 + shufps xm5, xm9, q3131 + movu m9, [blend_4x8_3] + vinserti128 m6, xm11, 1 + vinserti128 m5, xm10, 1 + vpblendvb m6, [rsp+gprsize+0x10-8], m9 + %endif +%else + lea r13, [blend_8x8_1] + movu m11, [r13+hq*2*2+16*2] + movq xm6, [top1q -2] + movq xm9, [top2q -2] + movq xm5, [dstq+strideq*2+2] + movq xm10, [dstq+stride3q +2] + vinserti128 m6, [dstq+strideq*0-2], 1 + vinserti128 m9, [dstq+strideq*1-2], 1 
+ vinserti128 m5, [botq+strideq*0+2], 1 + vinserti128 m10, [botq+strideq*1+2], 1 + punpcklqdq m6, m9 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11 + punpcklqdq m5, m10 +%endif + ret +.d5k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq+strideq*0-1] + movd xm9, [topq+strideq*1-1] + movd xm5, [dstq+strideq*2+1] + movd xm10, [dstq+stride3q +1] + pcmpeqd m12, m12 + pmovzxbw m11, [leftq-8+1] + psrld m12, 24 + vinserti128 m6, [dstq+strideq*0-1], 1 + vinserti128 m9, [dstq+strideq*1-1], 1 + vinserti128 m5, [botq+strideq*0+1], 1 + vinserti128 m10, [botq+strideq*1+1], 1 + punpckldq m6, m9 + pxor m9, m9 + vpblendd m12, m9, 0x0F + punpckldq m5, m10 + vpblendvb m6, m11, m12 + %else + movd xm6, [topq +strideq*0-1] + movq xm5, [dstq +strideq*2-1] + movq xm9, [dst4q+strideq*0-1] + movd xm10, [dst4q+strideq*2+1] + pinsrd xm6, [topq +strideq*1-1], 1 + movhps xm5, [dstq +stride3q -1] + movhps xm9, [dst4q+strideq*1-1] + pinsrd xm10, [dst4q+stride3q +1], 1 + pinsrd xm6, [dstq +strideq*0-1], 2 + pinsrd xm10, [botq +strideq*0+1], 2 + pinsrd xm6, [dstq +strideq*1-1], 3 + pinsrd xm10, [botq +strideq*1+1], 3 + shufps xm11, xm5, xm9, q2020 + vinserti128 m6, xm11, 1 + pmovzxbw m11, [leftq-3] + psrldq xm5, 2 + psrldq xm9, 2 + shufps xm5, xm9, q2020 + movu m9, [blend_4x8_1] + vinserti128 m5, xm10, 1 + vpblendvb m6, m11, m9 + %endif +%else + lea r13, [blend_8x8_0] + movu m11, [r13+hq*2*2+16*2] + movq xm6, [top1q -1] + movq xm9, [top2q -1] + movq xm5, [dstq+strideq*2+1] + movq xm10, [dstq+stride3q +1] + vinserti128 m6, [dstq+strideq*0-1], 1 + vinserti128 m9, [dstq+strideq*1-1], 1 + vinserti128 m5, [botq+strideq*0+1], 1 + vinserti128 m10, [botq+strideq*1+1], 1 + punpcklqdq m6, m9 + punpcklqdq m5, m10 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11 +%endif + ret +.d6k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq+strideq*0] + movd xm9, [topq+strideq*1] + movd xm5, [dstq+strideq*2] + movd xm10, [dstq+stride3q ] + vinserti128 m6, [dstq+strideq*0], 1 + vinserti128 m9, [dstq+strideq*1], 1 + 
vinserti128 m5, [botq+strideq*0], 1 + vinserti128 m10, [botq+strideq*1], 1 + punpckldq m6, m9 + punpckldq m5, m10 + %else + movd xm5, [dstq +strideq*2] + movd xm6, [topq +strideq*0] + movd xm9, [dst4q+strideq*2] + pinsrd xm5, [dstq +stride3q ], 1 + pinsrd xm6, [topq +strideq*1], 1 + pinsrd xm9, [dst4q+stride3q ], 1 + pinsrd xm5, [dst4q+strideq*0], 2 + pinsrd xm6, [dstq +strideq*0], 2 + pinsrd xm9, [botq +strideq*0], 2 + pinsrd xm5, [dst4q+strideq*1], 3 + pinsrd xm6, [dstq +strideq*1], 3 + pinsrd xm9, [botq +strideq*1], 3 + vinserti128 m6, xm5, 1 + vinserti128 m5, xm9, 1 + %endif +%else + movq xm5, [dstq+strideq*2] + movq xm9, [botq+strideq*0] + movq xm6, [top1q ] + movq xm10, [dstq+strideq*0] + movhps xm5, [dstq+stride3q ] + movhps xm9, [botq+strideq*1] + movhps xm6, [top2q ] + movhps xm10, [dstq+strideq*1] + vinserti128 m5, xm9, 1 + vinserti128 m6, xm10, 1 +%endif + ret +.d7k1: +%if %1 == 4 + %if %2 == 4 + movd xm5, [dstq+strideq*2-1] + movd xm9, [dstq+stride3q -1] + movd xm6, [topq+strideq*0+1] + movd xm10, [topq+strideq*1+1] + pinsrb xm5, [leftq+ 5], 0 + pinsrb xm9, [leftq+ 7], 0 + vinserti128 m6, [dstq+strideq*0+1], 1 + vinserti128 m10, [dstq+strideq*1+1], 1 + vinserti128 m5, [botq+strideq*0-1], 1 + vinserti128 m9, [botq+strideq*1-1], 1 + punpckldq m6, m10 + punpckldq m5, m9 + %else + movd xm6, [topq +strideq*0+1] + movq xm9, [dstq +strideq*2-1] + movq xm10, [dst4q+strideq*0-1] + movd xm11, [dst4q+strideq*2-1] + pinsrd xm6, [topq +strideq*1+1], 1 + movhps xm9, [dstq +stride3q -1] + movhps xm10, [dst4q+strideq*1-1] + pinsrd xm11, [dst4q+stride3q -1], 1 + pinsrd xm6, [dstq +strideq*0+1], 2 + pinsrd xm11, [botq +strideq*0-1], 2 + pinsrd xm6, [dstq +strideq*1+1], 3 + pinsrd xm11, [botq +strideq*1-1], 3 + shufps xm5, xm9, xm10, q2020 + vinserti128 m5, xm11, 1 + pmovzxbw m11, [leftq+5] + psrldq xm9, 2 + psrldq xm10, 2 + shufps xm9, xm10, q2020 + movu m10, [blend_4x8_1+8] + vinserti128 m6, xm9, 1 + vpblendvb m5, m11, m10 + %endif +%else + lea r13, [blend_8x8_0+16] + 
movq xm5, [dstq+strideq*2-1] + movq xm9, [botq+strideq*0-1] + movq xm6, [top1q +1] + movq xm10, [dstq+strideq*0+1] + movhps xm5, [dstq+stride3q -1] + movhps xm9, [botq+strideq*1-1] + movhps xm6, [top2q +1] + movhps xm10, [dstq+strideq*1+1] + movu m11, [r13+hq*2*2+16*2] + vinserti128 m5, xm9, 1 + vinserti128 m6, xm10, 1 + vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11 +%endif + ret + +.border_block: + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge +%define rstk rsp +%assign stack_offset stack_offset_entry +%assign regs_used 11 + ALLOC_STACK 2*16+(%2+4)*32, 16 +%define px rsp+2*16+2*32 + + pcmpeqw m14, m14 + psllw m14, 15 ; 0x8000 + + ; prepare pixel buffers - body/right +%if %1 == 4 + INIT_XMM avx2 +%endif +%if %2 == 8 + lea dst4q, [dstq+strideq*4] +%endif + lea stride3q, [strideq*3] + test edgeb, 2 ; have_right + jz .no_right + pmovzxbw m1, [dstq+strideq*0] + pmovzxbw m2, [dstq+strideq*1] + pmovzxbw m3, [dstq+strideq*2] + pmovzxbw m4, [dstq+stride3q] + mova [px+0*32], m1 + mova [px+1*32], m2 + mova [px+2*32], m3 + mova [px+3*32], m4 +%if %2 == 8 + pmovzxbw m1, [dst4q+strideq*0] + pmovzxbw m2, [dst4q+strideq*1] + pmovzxbw m3, [dst4q+strideq*2] + pmovzxbw m4, [dst4q+stride3q] + mova [px+4*32], m1 + mova [px+5*32], m2 + mova [px+6*32], m3 + mova [px+7*32], m4 +%endif + jmp .body_done +.no_right: +%if %1 == 4 + movd xm1, [dstq+strideq*0] + movd xm2, [dstq+strideq*1] + movd xm3, [dstq+strideq*2] + movd xm4, [dstq+stride3q] + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 + pmovzxbw xm3, xm3 + pmovzxbw xm4, xm4 + movq [px+0*32], xm1 + movq [px+1*32], xm2 + movq [px+2*32], xm3 + movq [px+3*32], xm4 +%else + pmovzxbw xm1, [dstq+strideq*0] + pmovzxbw xm2, [dstq+strideq*1] + pmovzxbw xm3, [dstq+strideq*2] + pmovzxbw xm4, [dstq+stride3q] + mova [px+0*32], xm1 + mova [px+1*32], xm2 + mova [px+2*32], xm3 + mova [px+3*32], xm4 +%endif + movd [px+0*32+%1*2], xm14 + movd [px+1*32+%1*2], xm14 + movd [px+2*32+%1*2], xm14 + movd [px+3*32+%1*2], xm14 +%if %2 == 8 + 
%if %1 == 4 + movd xm1, [dst4q+strideq*0] + movd xm2, [dst4q+strideq*1] + movd xm3, [dst4q+strideq*2] + movd xm4, [dst4q+stride3q] + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 + pmovzxbw xm3, xm3 + pmovzxbw xm4, xm4 + movq [px+4*32], xm1 + movq [px+5*32], xm2 + movq [px+6*32], xm3 + movq [px+7*32], xm4 + %else + pmovzxbw xm1, [dst4q+strideq*0] + pmovzxbw xm2, [dst4q+strideq*1] + pmovzxbw xm3, [dst4q+strideq*2] + pmovzxbw xm4, [dst4q+stride3q] + mova [px+4*32], xm1 + mova [px+5*32], xm2 + mova [px+6*32], xm3 + mova [px+7*32], xm4 + %endif + movd [px+4*32+%1*2], xm14 + movd [px+5*32+%1*2], xm14 + movd [px+6*32+%1*2], xm14 + movd [px+7*32+%1*2], xm14 +%endif +.body_done: + + ; top + test edgeb, 4 ; have_top + jz .no_top + test edgeb, 1 ; have_left + jz .top_no_left + test edgeb, 2 ; have_right + jz .top_no_right + pmovzxbw m1, [topq+strideq*0-(%1/2)] + pmovzxbw m2, [topq+strideq*1-(%1/2)] + movu [px-2*32-%1], m1 + movu [px-1*32-%1], m2 + jmp .top_done +.top_no_right: + pmovzxbw m1, [topq+strideq*0-%1] + pmovzxbw m2, [topq+strideq*1-%1] + movu [px-2*32-%1*2], m1 + movu [px-1*32-%1*2], m2 + movd [px-2*32+%1*2], xm14 + movd [px-1*32+%1*2], xm14 + jmp .top_done +.top_no_left: + test edgeb, 2 ; have_right + jz .top_no_left_right + pmovzxbw m1, [topq+strideq*0] + pmovzxbw m2, [topq+strideq*1] + mova [px-2*32+0], m1 + mova [px-1*32+0], m2 + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + jmp .top_done +.top_no_left_right: +%if %1 == 4 + movd xm1, [topq+strideq*0] + pinsrd xm1, [topq+strideq*1], 1 + pmovzxbw xm1, xm1 + movq [px-2*32+0], xm1 + movhps [px-1*32+0], xm1 +%else + pmovzxbw xm1, [topq+strideq*0] + pmovzxbw xm2, [topq+strideq*1] + mova [px-2*32+0], xm1 + mova [px-1*32+0], xm2 +%endif + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + movd [px-2*32+%1*2], xm14 + movd [px-1*32+%1*2], xm14 + jmp .top_done +.no_top: + movu [px-2*32-%1], m14 + movu [px-1*32-%1], m14 +.top_done: + + ; left + test edgeb, 1 ; have_left + jz .no_left + pmovzxbw xm1, [leftq+ 0] +%if %2 == 8 + 
pmovzxbw xm2, [leftq+ 8] +%endif + movd [px+0*32-4], xm1 + pextrd [px+1*32-4], xm1, 1 + pextrd [px+2*32-4], xm1, 2 + pextrd [px+3*32-4], xm1, 3 +%if %2 == 8 + movd [px+4*32-4], xm2 + pextrd [px+5*32-4], xm2, 1 + pextrd [px+6*32-4], xm2, 2 + pextrd [px+7*32-4], xm2, 3 +%endif + jmp .left_done +.no_left: + movd [px+0*32-4], xm14 + movd [px+1*32-4], xm14 + movd [px+2*32-4], xm14 + movd [px+3*32-4], xm14 +%if %2 == 8 + movd [px+4*32-4], xm14 + movd [px+5*32-4], xm14 + movd [px+6*32-4], xm14 + movd [px+7*32-4], xm14 +%endif +.left_done: + + ; bottom + DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge + test edgeb, 8 ; have_bottom + jz .no_bottom + test edgeb, 1 ; have_left + jz .bottom_no_left + test edgeb, 2 ; have_right + jz .bottom_no_right + pmovzxbw m1, [botq+strideq*0-(%1/2)] + pmovzxbw m2, [botq+strideq*1-(%1/2)] + movu [px+(%2+0)*32-%1], m1 + movu [px+(%2+1)*32-%1], m2 + jmp .bottom_done +.bottom_no_right: + pmovzxbw m1, [botq+strideq*0-%1] + pmovzxbw m2, [botq+strideq*1-%1] + movu [px+(%2+0)*32-%1*2], m1 + movu [px+(%2+1)*32-%1*2], m2 +%if %1 == 8 + movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu +%endif + movd [px+(%2+0)*32+%1*2], xm14 + movd [px+(%2+1)*32+%1*2], xm14 + jmp .bottom_done +.bottom_no_left: + test edgeb, 2 ; have_right + jz .bottom_no_left_right + pmovzxbw m1, [botq+strideq*0] + pmovzxbw m2, [botq+strideq*1] + mova [px+(%2+0)*32+0], m1 + mova [px+(%2+1)*32+0], m2 + movd [px+(%2+0)*32-4], xm14 + movd [px+(%2+1)*32-4], xm14 + jmp .bottom_done +.bottom_no_left_right: +%if %1 == 4 + movd xm1, [botq+strideq*0] + pinsrd xm1, [botq+strideq*1], 1 + pmovzxbw xm1, xm1 + movq [px+(%2+0)*32+0], xm1 + movhps [px+(%2+1)*32+0], xm1 +%else + pmovzxbw xm1, [botq+strideq*0] + pmovzxbw xm2, [botq+strideq*1] + mova [px+(%2+0)*32+0], xm1 + mova [px+(%2+1)*32+0], xm2 +%endif + movd [px+(%2+0)*32-4], xm14 + movd [px+(%2+1)*32-4], xm14 + movd [px+(%2+0)*32+%1*2], xm14 + movd [px+(%2+1)*32+%1*2], xm14 + jmp .bottom_done +.no_bottom: + movu 
[px+(%2+0)*32-%1], m14 + movu [px+(%2+1)*32-%1], m14 +.bottom_done: + + ; actual filter + INIT_YMM avx2 + DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero +%undef edged + ; register to shuffle values into after packing + vbroadcasti128 m12, [shufb_lohi] + + mov dampingd, r8m + xor zerod, zerod + movifnidn prid, prim + sub dampingd, 31 + movifnidn secdmpd, secdmpm + test prid, prid + jz .border_sec_only + movd xm0, prid + lzcnt pridmpd, prid + add pridmpd, dampingd + cmovs pridmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + test secdmpd, secdmpd + jz .border_pri_only + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + mov [rsp+8], secdmpq ; sec_shift + + DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3 + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3 + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps + + BORDER_PREP_REGS %1, %2 +%if %1*%2*2/mmsize > 1 +.border_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2, 1 +.border_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1 + ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1 + ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1 + dec kq + jge .border_k_loop + + vpbroadcastd m10, [pw_2048] + BORDER_ADJUST_PIXEL %1, m10, 1 +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_v_loop +%endif + RET + +.border_pri_only: + DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3 + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + DEFINE_ARGS dst, stride, _, dir, table, 
pri, _, stride3
    ; (the line above completes the DEFINE_ARGS statement begun on the
    ;  previous source line; it is part of .border_pri_only, the path taken
    ;  when the secondary strength tested before this point is zero)
    vpbroadcastb    m0, xm0                     ; pri_strength
    and           prid, 1
    ; tap_table+8 holds the weight bytes {4,2},{3,3},{2,1}; (pri&1)*2 picks
    ; the {4,2} or {3,3} pair
    lea           priq, [tableq+priq*2+8]       ; pri_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m1, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_pri_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_pri_k_loop:
    vpbroadcastb    m2, [priq+kq]               ; pri_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
    dec             kq
    jge .border_pri_k_loop
    BORDER_ADJUST_PIXEL %1, m1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    ; step dst and the stack pixel buffer to the next group of rows
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_pri_v_loop
%endif
    RET

    ; Secondary-only path: reached when the primary strength tested before
    ; this point is zero, so only the two secondary tap directions are
    ; accumulated below.
.border_sec_only:
    DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd                ; dampingd was already reduced by 31
    mov        [rsp+8], secdmpq                 ; sec_shift
    DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
    DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
    vpbroadcastb    m1, xm1                     ; sec_strength
    lea           secq, [tableq+12]             ; sec_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m0, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_sec_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_sec_k_loop:
    vpbroadcastb    m3, [secq+kq]               ; sec_taps
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
    dec             kq
    jge .border_sec_k_loop
    BORDER_ADJUST_PIXEL %1, m0
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_sec_v_loop
%endif
    RET
%endmacro

; instantiate the filter for the three block sizes (macro arguments %1, %2)
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4

;-----------------------------------------------------------------------------
; cdef_dir_8bpc (AVX2)
;
; Direction search over one 8x8 block of 8-bit samples: eight rows of eight
; bytes are read from src, eight directional costs (cost0..cost7 in the
; comments below) are computed, and the index of the largest cost is selected.
;
; In:   src, stride, var (named by the cglobal line; stride3 is scratch).
;       NOTE(review): C-side argument types are not visible here - presumably
;       (const uint8_t *src, ptrdiff_t stride, unsigned *var); confirm against
;       the C prototype.
; Out:  eax    = index (0-7) of the best cost
;       [varq] = (best cost - cost[idx ^ 4]) >> 10, stored as a dword
;
; .main is exported as a second entry point. On entry there, m0-m3 must hold
; the block as 16-bit words, paired per register as (low lane | high lane):
;       m0 = row0 | row7, m1 = row1 | row6, m2 = row2 | row5, m3 = row3 | row4
; NOTE(review): the callers of .main are outside this file section.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
    lea       stride3q, [strideq*3]
    ; rows 0-3 go to the low lanes of m0-m3
    movq           xm0, [srcq+strideq*0]
    movq           xm1, [srcq+strideq*1]
    movq           xm2, [srcq+strideq*2]
    movq           xm3, [srcq+stride3q ]
    lea           srcq, [srcq+strideq*4]
    ; rows 7-4 go to the high lanes (0xf0 selects the upper four dwords)
    vpbroadcastq    m4, [srcq+stride3q ]
    vpbroadcastq    m5, [srcq+strideq*2]
    vpblendd        m0, m4, 0xf0
    vpblendd        m1, m5, 0xf0
    vpbroadcastq    m4, [srcq+strideq*1]
    vpbroadcastq    m5, [srcq+strideq*0]
    vpblendd        m2, m4, 0xf0
    vpblendd        m3, m5, 0xf0
    ; zero-extend the eight bytes in each lane to words
    pxor            m4, m4
    punpcklbw       m0, m4
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
cglobal_label .main
    vpbroadcastd    m4, [pw_128]
    ; NOTE(review): presumably re-declares register usage for the shared
    ; entry point (m0-m14 are used from here on) - confirm against x86inc
    PROLOGUE 3, 4, 15
    ; subtract 128 from every sample word
    psubw           m0, m4
    psubw           m1, m4
    psubw           m2, m4
    psubw           m3, m4

    ; shuffle registers to generate partial_sum_diag[0-1] together
    ; (lane swap: afterwards the low lanes of m0..m7 hold rows 0..7 and the
    ;  high lanes hold rows 7..0)
    vperm2i128      m7, m0, m0, 0x01
    vperm2i128      m6, m1, m1, 0x01
    vperm2i128      m5, m2, m2, 0x01
    vperm2i128      m4, m3, m3, 0x01

    ; start with partial_sum_hv[0-1]
    paddw           m8, m0, m1
    paddw           m9, m2, m3
    phaddw         m10, m0, m1
    phaddw         m11, m2, m3
    paddw           m8, m9
    phaddw         m10, m11
    vextracti128   xm9, m8, 1
    vextracti128  xm11, m10, 1
    paddw          xm8, xm9                 ; partial_sum_hv[1]
    phaddw        xm10, xm11                ; partial_sum_hv[0]
    vinserti128     m8, xm10, 1
    vpbroadcastd    m9, [div_table+44]      ; div_table[11] = 105
    pmaddwd         m8, m8                  ; square words, add adjacent pairs
    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]

    ; create aggregates [lower half]:
    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
    ; and [upper half]:
    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m9, m1, 2
    psrldq         m10, m1, 14
    pslldq         m11, m2, 4
    psrldq         m12, m2, 12
    pslldq         m13, m3, 6
    psrldq         m14, m3, 10
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m4, 8
    psrldq         m12, m4, 8
    pslldq         m13, m5, 10
    psrldq         m14, m5, 6
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m6, 12
    psrldq         m12, m6, 4
    pslldq         m13, m7, 14
    psrldq         m14, m7, 2
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
    vbroadcasti128 m14, [shufw_6543210x]
    vbroadcasti128 m13, [div_table+16]      ; 168, 140, 120, 105
    vbroadcasti128 m12, [div_table+0]       ; 840, 420, 280, 210
    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
    pshufb         m10, m14
    punpckhwd      m11, m9, m10
    punpcklwd       m9, m10
    pmaddwd        m11, m11
    pmaddwd         m9, m9
    pmulld         m11, m13
    pmulld          m9, m12
    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]

    ; merge horizontally and vertically for partial_sum_alt[0-3]
    paddw          m10, m0, m1
    paddw          m11, m2, m3
    paddw          m12, m4, m5
    paddw          m13, m6, m7
    phaddw          m0, m4
    phaddw          m1, m5
    phaddw          m2, m6
    phaddw          m3, m7

    ; create aggregates [lower half]:
    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
    ; and [upper half]:
    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m4, m11, 2
    psrldq         m11, 14
    pslldq          m5, m12, 4
    psrldq         m12, 12
    pslldq          m6, m13, 6
    psrldq         m13, 10
    paddw           m4, m10
    paddw          m11, m12
    ; m12/m13 are reloaded with divisors here and reused for cost1/cost3 below
    vpbroadcastd   m12, [div_table+44]      ; 105
    paddw           m5, m6
    paddw          m11, m13                 ; partial_sum_alt[3/2] right
    vbroadcasti128 m13, [div_table+32]      ; 420, 210, 140, 105
    paddw           m4, m5                  ; partial_sum_alt[3/2] left
    pshuflw         m5, m11, q3012
    punpckhwd       m6, m11, m4
    punpcklwd       m4, m5
    pmaddwd         m6, m6
    pmaddwd         m4, m4
    pmulld          m6, m12
    pmulld          m4, m13
    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]

    ; create aggregates [lower half]:
    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
    ; and [upper half]:
    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m5, m1, 2
    psrldq          m1, 14
    pslldq          m6, m2, 4
    psrldq          m2, 12
    pslldq          m7, m3, 6
    psrldq          m3, 10
    paddw           m5, m0
    paddw           m1, m2
    paddw           m6, m7
    paddw           m1, m3                  ; partial_sum_alt[0/1] right
    paddw           m5, m6                  ; partial_sum_alt[0/1] left
    pshuflw         m0, m1, q3012
    punpckhwd       m1, m5
    punpcklwd       m5, m0
    pmaddwd         m1, m1
    pmaddwd         m5, m5
    pmulld          m1, m12
    pmulld          m5, m13
    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]

    ; reduce the four partial dwords [a-d] of every cost to one dword each,
    ; then use the pd_47130256 index vectors to put the costs in order
    mova           xm0, [pd_47130256+ 16]   ; indices 0, 2, 5, 6
    mova            m1, [pd_47130256]
    phaddd          m9, m8
    phaddd          m5, m4
    phaddd          m9, m5
    vpermd          m0, m9                  ; cost[0-3]
    vpermd          m1, m9                  ; cost[4-7] | cost[0-3]

    ; now find the best cost
    pmaxsd         xm2, xm0, xm1
    pshufd         xm3, xm2, q1032
    pmaxsd         xm2, xm3
    pshufd         xm3, xm2, q2301
    pmaxsd         xm2, xm3 ; best cost

    ; find the idx using minpos
    ; make everything other than the best cost negative via subtraction
    ; find the min of unsigned 16-bit ints to sort out the negative values
    psubd          xm4, xm1, xm2
    psubd          xm3, xm0, xm2
    packssdw       xm3, xm4                 ; words 0-3: cost[0-3], words 4-7: cost[4-7]
    phminposuw     xm3, xm3

    ; convert idx to 32-bits
    ; (phminposuw leaves the index in word 1; the shifted dword is both the
    ;  return value and the vpermd index used below)
    psrld          xm3, 16
    movd           eax, xm3

    ; get idx^4 complement
    ; (m1 holds cost[4-7] | cost[0-3], so element idx of m1 is cost[idx^4])
    vpermd          m3, m1
    psubd          xm2, xm3
    psrld          xm2, 10
    movd        [varq], xm2
    RET

%endif ; ARCH_X86_64