diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/dav1d/src/x86/refmvs.asm | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/refmvs.asm')
-rw-r--r-- | third_party/dav1d/src/x86/refmvs.asm | 912 |
1 files changed, 912 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm new file mode 100644 index 0000000000..d95861fa17 --- /dev/null +++ b/third_party/dav1d/src/x86/refmvs.asm @@ -0,0 +1,912 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 64 + +%macro JMP_TABLE 2-* + %xdefine %%prefix mangle(private_prefix %+ _%1) + %1_table: + %xdefine %%base %1_table + %rep %0 - 1 + dd %%prefix %+ .w%2 - %%base + %rotate 1 + %endrep +%endmacro + +%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix + %rep %1 + db %2*3 + db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \ + mangle(private_prefix %+ _save_tmvs_%3).write1 + %endrep +%endmacro + +%if ARCH_X86_64 +mv_proj: dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340 + dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092 + dw 1024, 963, 910, 862, 819, 780, 744, 712 + dw 682, 655, 630, 606, 585, 564, 546, 528 +splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 + db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7 + db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 +%endif +save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0 + db 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 +save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2 + db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3 +save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1 +cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3 +save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00 +save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00 +pb_128: times 16 db 128 +pq_8192: dq 8192 + +save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3 + SAVE_TMVS_TABLE 4, 8, ssse3 + SAVE_TMVS_TABLE 4, 4, ssse3 + SAVE_TMVS_TABLE 5, 2, ssse3 + SAVE_TMVS_TABLE 7, 1, ssse3 + +%if ARCH_X86_64 +save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2 + SAVE_TMVS_TABLE 4, 8, avx2 + SAVE_TMVS_TABLE 4, 4, avx2 + SAVE_TMVS_TABLE 5, 2, avx2 + SAVE_TMVS_TABLE 7, 1, avx2 + +save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl + SAVE_TMVS_TABLE 4, 8, avx512icl + SAVE_TMVS_TABLE 4, 4, avx512icl + SAVE_TMVS_TABLE 5, 2, avx512icl + SAVE_TMVS_TABLE 7, 1, avx512icl + +JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32 +JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32 +%endif + +JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32 + +SECTION .text + +%macro movif32 2 +%if ARCH_X86_32 + mov %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +%if ARCH_X86_64 +cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base_reg r12 +%else +cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart + movq m5, [ref_signq] + lea strided, [strided*5] + mov stridem, strided + mov r3, xstartm + mov r1, ystartm + DEFINE_ARGS b, ystart, rr, cand, xend, x +%define stridemp r1m +%define m8 [base+pb_128] +%define m9 [base+save_pack0+ 0] +%define m10 [base+save_pack0+16] +%define base_reg r6 +%endif +%define base base_reg-.write1 + LEA base_reg, .write1 +%if ARCH_X86_64 + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + movq m5, [ref_signq] +%endif + movu m4, [base+save_ref_shuf] + movddup m6, [base+save_cond0] + movddup m7, [base+save_cond1] +%if ARCH_X86_64 + mova m8, [base+pb_128] + mova m9, [base+save_pack0+ 0] + mova m10, [base+save_pack0+16] +%endif + psllq m5, 8 +%if ARCH_X86_64 + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +%else + lea r0, [xendd*5] ; xend5 + lea r3, [r3*5] ; xstart5 + sub r3, r0 ; -w5 + mov r6m, r3 +%define xstartq r6m + add xendd, r0 ; xend6 + add r0m, r0 ; rp+xend5 + mov xendm, xendd + sub r5, r1 ; h + add r1, r1 + mov r7m, r1 + mov r5m, r5 +%define hd r5mp + jmp .loop_y_noload +%endif +.loop_y: + movif32 ystartd, r7m + movif32 xendd, xendm +.loop_y_noload: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*gprsize] + add ystartd, 2 + movif32 r7m, ystartd + lea bq, [bq+xendq*4] +.loop_x: +%if ARCH_X86_32 +%define rpq r3 +%define r10 r1 +%define r10d r1 +%define r11 r4 +%define r11d r4 +%endif + imul candq, xq, 0x9999 ; x / 5 * 3 + sar candq, 16 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu m0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_ssse3_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_ssse3_table+r10*2+1] + add r10, base_reg + add candq, r11 + jge .calc + movu m1, [bq+candq*8+12] + movzx r11d, byte [bq+candq*8+22] + movzx r11d, byte [base+save_tmvs_ssse3_table+r11*2+1] + add r11, base_reg +.calc: + movif32 rpq, r0m + ; ref check + punpckhqdq m2, m0, m1 + pshufb m2, m4 ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ... + pshufb m3, m5, m2 ; ref > 0 && res_sign[ref - 1] + ; mv check + punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ... + pabsw m2, m2 + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + ; res + pcmpgtd m3, m2 + pshufd m2, m3, q2301 + pand m3, m6 ; b0c0 b0c1 b1c0 b1c1 | ... + pand m2, m7 ; b0c1 b0c0 b1c1 b1c0 | ... + por m3, m2 ; b0.shuf b1.shuf | ... + pxor m3, m8 ; if cond0|cond1 == 0 => zero out + pshufb m0, m3 + pshufb m1, m3 + call r10 + jge .next_line + pshufd m0, m1, q3232 + call r11 + jl .loop_x +.next_line: + add rpq, stridemp + movif32 r0m, rpq + dec hd + jg .loop_y + RET +.write1: + movd [rpq+xq+0], m0 + psrlq m0, 8 + movd [rpq+xq+1], m0 + add xq, 5*1 + ret +.write2: + movq [rpq+xq+0], m0 + psrlq m0, 8 + movd [rpq+xq+6], m0 + add xq, 5*2 + ret +.write4: + pshufb m0, m9 + movu [rpq+xq+ 0], m0 + psrlq m0, 8 + movd [rpq+xq+16], m0 + add xq, 5*4 + ret +.write8: + pshufb m2, m0, m9 + movu [rpq+xq+ 0], m2 + pshufb m0, m10 + movu [rpq+xq+16], m0 + psrldq m2, 2 + movq [rpq+xq+32], m2 + add xq, 5*8 + ret +.write16: + pshufb m2, m0, m9 + movu [rpq+xq+ 0], m2 + pshufb m0, m10 + movu [rpq+xq+16], m0 + shufps m2, m0, q1032 + movu [rpq+xq+48], m2 + shufps m2, m0, q2121 + movu [rpq+xq+32], m2 + shufps m0, m2, q1032 + movu [rpq+xq+64], m0 + add xq, 5*16 + ret + +INIT_XMM sse2 +; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4 +cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 + add bx4d, bw4d + tzcnt bw4d, bw4d + mova m2, [aq] + LEA aq, splat_mv_sse2_table + lea bx4q, [bx4q*3-32] + movsxd bw4q, [aq+bw4q*4] + movifnidn bh4d, bh4m + pshufd m0, m2, q0210 + pshufd m1, m2, q1021 + pshufd m2, m2, q2102 + add bw4q, aq +.loop: + mov aq, [rrq] + add rrq, gprsize + lea aq, [aq+bx4q*4] + jmp bw4q +.w32: + mova [aq-16*16], m0 + mova [aq-16*15], m1 + mova [aq-16*14], m2 + mova [aq-16*13], m0 + mova [aq-16*12], m1 + mova [aq-16*11], m2 + mova [aq-16*10], m0 + mova [aq-16* 9], m1 + mova [aq-16* 8], m2 + mova [aq-16* 7], m0 + mova [aq-16* 6], m1 + mova [aq-16* 5], m2 +.w16: + mova [aq-16* 4], m0 + mova [aq-16* 3], m1 + mova [aq-16* 2], m2 + mova [aq-16* 1], m0 + mova [aq+16* 0], m1 + mova [aq+16* 1], m2 +.w8: + mova [aq+16* 2], m0 + mova [aq+16* 3], m1 + mova [aq+16* 4], m2 +.w4: + mova [aq+16* 5], m0 + mova [aq+16* 6], m1 + mova [aq+16* 7], m2 + dec bh4d + jg .loop + RET +.w2: + movu [aq+104], m0 + movq [aq+120], m1 + dec bh4d + jg .loop + RET +.w1: + movq [aq+116], m0 + movd [aq+124], m2 + dec bh4d + jg .loop + RET + +%if ARCH_X86_64 +INIT_XMM sse4 +; refmvs_frame *rf, int tile_row_idx, +; int col_start8, int col_end8, int row_start8, int row_end8 +cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \ + stride, rp_proj, roff, troff, \ + xendi, xstarti, iw8, ih8, dst + xor r14d, r14d + cmp dword [rfq+212], 1 ; n_tile_threads + mov ih8d, [rfq+20] ; rf->ih8 + mov iw8d, [rfq+16] ; rf->iw8 + mov xstartd, xstartd + mov xendd, xendd + cmove tridxd, r14d + lea xstartid, [xstartq-8] + lea xendid, [xendq+8] + mov strideq, [rfq+184] + mov rp_projq, [rfq+176] + cmp ih8d, yendd + mov [rsp+0x30], strideq + cmovs yendd, ih8d + test xstartid, xstartid + cmovs xstartid, r14d + cmp iw8d, xendid + cmovs xendid, iw8d + mov troffq, strideq + shl troffq, 4 + imul troffq, tridxq + mov dstd, ystartd + and dstd, 15 + imul dstq, strideq + add dstq, troffq ; (16 * tridx + (ystart & 15)) * stride + lea dstq, [dstq*5] + add dstq, rp_projq + lea troffq, [troffq*5] ; 16 * tridx * stride * 5 + lea r13d, [xendq*5] + lea r12, [strideq*5] + DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \ + _, troff, xendi, xstarti, stride5, _, dst + lea w5d, [xstartq*5] + add r7, troffq ; rp_proj + tile_row_offset + mov hd, yendd + mov [rsp+0x28], r7 + add dstq, r13 + sub w5q, r13 + sub hd, ystartd +.init_xloop_start: + mov x5q, w5q + test w5b, 1 + jz .init_2blk + mov dword [dstq+x5q], 0x80008000 + add x5q, 5 + jz .init_next_row +.init_2blk: + mov dword [dstq+x5q+0], 0x80008000 + mov dword [dstq+x5q+5], 0x80008000 + add x5q, 10 + jl .init_2blk +.init_next_row: + add dstq, stride5q + dec hd + jg .init_xloop_start + DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \ + _, _, xendi, xstarti, stride5, _, n + mov r13d, [rfq+152] ; rf->n_mfmvs + test r13d, r13d + jz .ret + mov [rsp+0x0c], r13d + mov strideq, [rsp+0x30] + movddup m3, [pq_8192] + mov r9d, ystartd + mov [rsp+0x38], yendd + mov [rsp+0x20], xstartid + xor nd, nd + xor n7d, n7d + imul r9, strideq ; ystart * stride + mov [rsp+0x48], rfq + mov [rsp+0x18], stride5q + lea r7, [r9*5] + mov [rsp+0x24], ystartd + mov [rsp+0x00], r7 +.nloop: + DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \ + ref, rp_ref, xendi, xstarti, _, _, n + mov rfq, [rsp+0x48] + mov refd, [rfq+56+nq*4] ; ref2cur + cmp refd, 0x80000000 + je .next_n + mov [rsp+0x40], refd + mov offq, [rsp+0x00] ; ystart * stride * 5 + movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n] + lea refsignq, [refq-4] + mov rp_refq, [rfq+168] + movq m2, refsignq + add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset + mov [rsp+0x14], nd + mov yd, ystartd +.yloop: + mov r11d, [rsp+0x24] ; ystart + mov r12d, [rsp+0x38] ; yend + mov r14d, yd + and r14d, ~7 ; y_sb_align + cmp r11d, r14d + cmovs r11d, r14d ; imax(y_sb_align, ystart) + mov [rsp+0x44], r11d ; y_proj_start + add r14d, 8 + cmp r12d, r14d + cmovs r14d, r12d ; imin(y_sb_align + 8, yend) + mov [rsp+0x3c], r14d ; y_proj_end + DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \ + ref, x, xendi, mvx, mvy, rb, ref2ref + mov xd, [rsp+0x20] ; xstarti +.xloop: + lea rbd, [xq*5] + add rbq, srcq + movsx refd, byte [rbq+4] + test refd, refd + jz .next_x_bad_ref + mov rfq, [rsp+0x48] + lea r14d, [16+n7q+refq] + mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1] + test ref2refd, ref2refd + jz .next_x_bad_ref + lea fracq, [mv_proj] + movzx fracd, word [fracq+ref2refq*2] + mov mvd, [rbq] + imul fracd, [rsp+0x40] ; ref2cur + pmovsxwq m0, [rbq] + movd m1, fracd + punpcklqdq m1, m1 + pmuldq m0, m1 ; mv * frac + pshufd m1, m0, q3311 + paddd m0, m3 + paddd m0, m1 + psrad m0, 14 ; offset = (xy + (xy >> 31) + 8192) >> 14 + pabsd m1, m0 + packssdw m0, m0 + psrld m1, 6 + packuswb m1, m1 + pxor m0, m2 ; offset ^ ref_sign + psignd m1, m0 ; apply_sign(abs(offset) >> 6, offset ^ refsign) + movq mvxq, m1 + lea mvyd, [mvxq+yq] ; ypos + sar mvxq, 32 + DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \ + ref, x, xendi, mvx, ypos, rb, ref2ref + cmp yposd, [rsp+0x44] ; y_proj_start + jl .next_x_bad_pos_y + cmp yposd, [rsp+0x3c] ; y_proj_end + jge .next_x_bad_pos_y + and yposd, 15 + add mvxq, xq ; xpos + imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride + DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \ + ref, x, xendi, xpos, pos, rb, ref2ref + mov dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset + add posq, xposq ; pos += xpos + lea posq, [posq*5] + add dstq, posq ; dst += pos5 + jmp .write_loop_entry +.write_loop: + add rbq, 5 + cmp refb, byte [rbq+4] + jne .xloop + cmp mvd, [rbq] + jne .xloop + add dstq, 5 + inc xposd +.write_loop_entry: + mov r12d, xd + and r12d, ~7 + lea r5d, [r12-8] + cmp r5d, xstartd + cmovs r5d, xstartd ; x_proj_start + cmp xposd, r5d + jl .next_xpos + add r12d, 16 + cmp xendd, r12d + cmovs r12d, xendd ; x_proj_end + cmp xposd, r12d + jge .next_xpos + mov [dstq+0], mvd + mov byte [dstq+4], ref2refb +.next_xpos: + inc xd + cmp xd, xendid + jl .write_loop +.next_y: + DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n + add srcq, [rsp+0x18] ; stride5 + inc yd + cmp yd, [rsp+0x38] ; yend + jne .yloop + mov nd, [rsp+0x14] + mov ystartd, [rsp+0x24] +.next_n: + add n7d, 7 + inc nd + cmp nd, [rsp+0x0c] ; n_mfmvs + jne .nloop +.ret: + RET +.next_x: + DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _ + add rbq, 5 + cmp refb, byte [rbq+4] + jne .xloop + cmp mvd, [rbq] + jne .xloop +.next_x_bad_pos_y: + inc xd + cmp xd, xendid + jl .next_x + jmp .next_y +.next_x_bad_ref: + inc xd + cmp xd, xendid + jl .xloop + jmp .next_y + +INIT_YMM avx2 +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base r12-.write1 + lea r12, [.write1] + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + vpbroadcastq m4, [ref_signq] + vpbroadcastq m3, [base+save_ref_shuf+8] + vpbroadcastq m5, [base+save_cond0] + vpbroadcastq m6, [base+save_cond1] + vpbroadcastd m7, [base+pb_128] + mova m8, [base+save_pack0] + mova m9, [base+save_pack1] + psllq m4, 8 + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +.loop_y: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*8] + add ystartd, 2 + lea bq, [bq+xendq*4] +.loop_x: + imul candq, xq, 0x9999 + sar candq, 16 ; x / 5 * 3 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu xm0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_avx2_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_avx2_table+r10*2+1] + add r10, r12 + add candq, r11 + jge .calc + vinserti128 m0, [bq+candq*8+12], 1 + movzx r11d, byte [bq+candq*8+22] + movzx r11d, byte [base+save_tmvs_avx2_table+r11*2+1] + add r11, r12 +.calc: + pshufb m1, m0, m3 + pabsw m2, m0 + pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + pcmpgtd m1, m2 + pshufd m2, m1, q2301 + pand m1, m5 ; b0.cond0 b1.cond0 + pand m2, m6 ; b0.cond1 b1.cond1 + por m1, m2 ; b0.shuf b1.shuf + pxor m1, m7 ; if cond0|cond1 == 0 => zero out + pshufb m0, m1 + call r10 + jge .next_line + vextracti128 xm0, m0, 1 + call r11 + jl .loop_x +.next_line: + add rpq, strideq + dec hd + jg .loop_y + RET +.write1: + movd [rpq+xq+ 0], xm0 + pextrb [rpq+xq+ 4], xm0, 4 + add xq, 5*1 + ret +.write2: + movq [rpq+xq+0], xm0 + psrlq xm1, xm0, 8 + movd [rpq+xq+6], xm1 + add xq, 5*2 + ret +.write4: + pshufb xm1, xm0, xm8 + movu [rpq+xq+ 0], xm1 + psrlq xm1, 8 + movd [rpq+xq+16], xm1 + add xq, 5*4 + ret +.write8: + vinserti128 m1, m0, xm0, 1 + pshufb m1, m8 + movu [rpq+xq+ 0], m1 + psrldq xm1, 2 + movq [rpq+xq+32], xm1 + add xq, 5*8 + ret +.write16: + vinserti128 m1, m0, xm0, 1 + pshufb m2, m1, m8 + movu [rpq+xq+ 0], m2 + pshufb m1, m9 + movu [rpq+xq+32], m1 + shufps xm2, xm1, q1021 + movu [rpq+xq+64], xm2 + add xq, 5*16 + ret + +cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 + add bx4d, bw4d + tzcnt bw4d, bw4d + vbroadcasti128 m0, [aq] + lea aq, [splat_mv_avx2_table] + lea bx4q, [bx4q*3-32] + movsxd bw4q, [aq+bw4q*4] + pshufb m0, [splat_mv_shuf] + movifnidn bh4d, bh4m + pshufd m1, m0, q2102 + pshufd m2, m0, q1021 + add bw4q, aq +.loop: + mov aq, [rrq] + add rrq, gprsize + lea aq, [aq+bx4q*4] + jmp bw4q +.w32: + mova [aq-32*8], m0 + mova [aq-32*7], m1 + mova [aq-32*6], m2 + mova [aq-32*5], m0 + mova [aq-32*4], m1 + mova [aq-32*3], m2 +.w16: + mova [aq-32*2], m0 + mova [aq-32*1], m1 + mova [aq+32*0], m2 +.w8: + mova [aq+32*1], m0 + mova [aq+32*2], m1 + mova [aq+32*3], m2 + dec bh4d + jg .loop + RET +.w4: + movu [aq+ 80], m0 + mova [aq+112], xm1 + dec bh4d + jg .loop + RET +.w2: + movu [aq+104], xm0 + movq [aq+120], xm2 + dec bh4d + jg .loop + RET +.w1: + movq [aq+116], xm0 + movd [aq+124], xm1 + dec bh4d + jg .loop + RET + +INIT_ZMM avx512icl +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base r14-.write1 + lea r14, [.write1] + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + psllq m4, [ref_signq]{bcstq}, 8 + vpbroadcastq m3, [base+save_ref_shuf+8] + vbroadcasti32x4 m5, [base+cond_shuf512] + vbroadcasti32x4 m6, [base+save_cond0] + vpbroadcastd m7, [base+pb_128] + mova m8, [base+save_pack0] + movu xm9, [base+save_pack0+4] + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + mov r10d, 0x1f + kmovb k2, r10d + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +.loop_y: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*8] + add ystartd, 2 + lea bq, [bq+xendq*4] +.loop_x: + imul candq, xq, 0x9999 + sar candq, 16 ; x / 5 * 3 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu xm0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1] + add r10, r14 + add candq, r11 + jge .calc + movzx r11d, byte [bq+candq*8+22] + vinserti32x4 ym0, [bq+candq*8+12], 1 + movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0] + movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1] + add r11, r14 + add candq, r12 + jge .calc + movzx r12d, byte [bq+candq*8+22] + vinserti32x4 m0, [bq+candq*8+12], 2 + movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0] + movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1] + add r12, r14 + add candq, r13 + jge .calc + vinserti32x4 m0, [bq+candq*8+12], 3 + movzx r13d, byte [bq+candq*8+22] + movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1] + add r13, r14 +.calc: + pshufb m1, m0, m3 + pabsw m2, m0 + pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + psubd m2, m1 + pshufb m2, m5 ; c0 c1 c1 c0 + pand m2, m6 + punpckhqdq m1, m2, m2 + vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80 + pshufb m2, m0, m1 + mova xm0, xm2 + call r10 + jge .next_line + vextracti32x4 xm0, m2, 1 + call r11 + jge .next_line + vextracti32x4 xm0, m2, 2 + call r12 + jge .next_line + vextracti32x4 xm0, m2, 3 + call r13 + jl .loop_x +.next_line: + add rpq, strideq + dec hd + jg .loop_y + RET +.write1: + vmovdqu8 [rpq+xq]{k2}, xm0 + add xq, 5*1 + ret +.write2: + pshufb xm0, xm8 + vmovdqu16 [rpq+xq]{k2}, xm0 + add xq, 5*2 + ret +.write4: + vpermb ym0, ym8, ym0 + vmovdqu32 [rpq+xq]{k2}, ym0 + add xq, 5*4 + ret +.write8: + vpermb m0, m8, m0 + vmovdqu64 [rpq+xq]{k2}, m0 + add xq, 5*8 + ret +.write16: + vpermb m1, m8, m0 + movu [rpq+xq+ 0], m1 + pshufb xm0, xm9 + movu [rpq+xq+64], xm0 + add xq, 5*16 + ret + +INIT_ZMM avx512icl +cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4 + vbroadcasti32x4 m0, [aq] + lea r1, [splat_mv_avx512icl_table] + tzcnt bw4d, bw4d + lea bx4d, [bx4q*3] + pshufb m0, [splat_mv_shuf] + movsxd bw4q, [r1+bw4q*4] + mov r6d, bh4m + add bw4q, r1 + lea rrq, [rrq+r6*8] + mov r1d, 0x3f + neg r6 + kmovb k1, r1d + jmp bw4q +.w1: + mov r1, [rrq+r6*8] + vmovdqu16 [r1+bx4q*4]{k1}, xm0 + inc r6 + jl .w1 + RET +.w2: + mov r1, [rrq+r6*8] + vmovdqu32 [r1+bx4q*4]{k1}, ym0 + inc r6 + jl .w2 + RET +.w4: + mov r1, [rrq+r6*8] + vmovdqu64 [r1+bx4q*4]{k1}, m0 + inc r6 + jl .w4 + RET +.w8: + pshufd ym1, ym0, q1021 +.w8_loop: + mov r1, [rrq+r6*8+0] + mov r3, [rrq+r6*8+8] + movu [r1+bx4q*4+ 0], m0 + mova [r1+bx4q*4+64], ym1 + movu [r3+bx4q*4+ 0], m0 + mova [r3+bx4q*4+64], ym1 + add r6, 2 + jl .w8_loop + RET +.w16: + pshufd m1, m0, q1021 + pshufd m2, m0, q2102 +.w16_loop: + mov r1, [rrq+r6*8+0] + mov r3, [rrq+r6*8+8] + mova [r1+bx4q*4+64*0], m0 + mova [r1+bx4q*4+64*1], m1 + mova [r1+bx4q*4+64*2], m2 + mova [r3+bx4q*4+64*0], m0 + mova [r3+bx4q*4+64*1], m1 + mova [r3+bx4q*4+64*2], m2 + add r6, 2 + jl .w16_loop + RET +.w32: + pshufd m1, m0, q1021 + pshufd m2, m0, q2102 +.w32_loop: + mov r1, [rrq+r6*8] + lea r1, [r1+bx4q*4] + mova [r1+64*0], m0 + mova [r1+64*1], m1 + mova [r1+64*2], m2 + mova [r1+64*3], m0 + mova [r1+64*4], m1 + mova [r1+64*5], m2 + inc r6 + jl .w32_loop + RET +%endif ; ARCH_X86_64 |