author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 09:22:09 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 09:22:09 +0000
commit     43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree       620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/dav1d/src/x86/refmvs.asm
parent     Initial commit. (diff)
download   firefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz
           firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip
Adding upstream version 110.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/refmvs.asm')
-rw-r--r--   third_party/dav1d/src/x86/refmvs.asm   248
1 file changed, 248 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm
new file mode 100644
index 0000000000..fb4ca1033a
--- /dev/null
+++ b/third_party/dav1d/src/x86/refmvs.asm
@@ -0,0 +1,248 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+%macro JMP_TABLE 2-*
+    %xdefine %%prefix mangle(private_prefix %+ _%1)
+    %1_table:
+    %xdefine %%base %1_table
+    %rep %0 - 1
+        dd %%prefix %+ .w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%if ARCH_X86_64
+splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
+               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
+               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
+               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
+
+JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
+JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
+%endif
+JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32
+
+SECTION .text
+
+INIT_XMM sse2
+; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
+    add           bx4d, bw4d
+    tzcnt         bw4d, bw4d
+    mova            m2, [aq]
+    LEA             aq, splat_mv_sse2_table
+    lea           bx4q, [bx4q*3-32]
+    movsxd        bw4q, [aq+bw4q*4]
+    movifnidn     bh4d, bh4m
+    pshufd          m0, m2, q0210
+    pshufd          m1, m2, q1021
+    pshufd          m2, m2, q2102
+    add           bw4q, aq
+.loop:
+    mov             aq, [rrq]
+    add            rrq, gprsize
+    lea             aq, [aq+bx4q*4]
+    jmp           bw4q
+.w32:
+    mova    [aq-16*16], m0
+    mova    [aq-16*15], m1
+    mova    [aq-16*14], m2
+    mova    [aq-16*13], m0
+    mova    [aq-16*12], m1
+    mova    [aq-16*11], m2
+    mova    [aq-16*10], m0
+    mova    [aq-16* 9], m1
+    mova    [aq-16* 8], m2
+    mova    [aq-16* 7], m0
+    mova    [aq-16* 6], m1
+    mova    [aq-16* 5], m2
+.w16:
+    mova    [aq-16* 4], m0
+    mova    [aq-16* 3], m1
+    mova    [aq-16* 2], m2
+    mova    [aq-16* 1], m0
+    mova    [aq+16* 0], m1
+    mova    [aq+16* 1], m2
+.w8:
+    mova    [aq+16* 2], m0
+    mova    [aq+16* 3], m1
+    mova    [aq+16* 4], m2
+.w4:
+    mova    [aq+16* 5], m0
+    mova    [aq+16* 6], m1
+    mova    [aq+16* 7], m2
+    dec           bh4d
+    jg .loop
+    RET
+.w2:
+    movu      [aq+104], m0
+    movq      [aq+120], m1
+    dec           bh4d
+    jg .loop
+    RET
+.w1:
+    movq      [aq+116], m0
+    movd      [aq+124], m2
+    dec           bh4d
+    jg .loop
+    RET
+
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
+    add            bx4d, bw4d
+    tzcnt          bw4d, bw4d
+    vbroadcasti128   m0, [aq]
+    lea              aq, [splat_mv_avx2_table]
+    lea            bx4q, [bx4q*3-32]
+    movsxd         bw4q, [aq+bw4q*4]
+    pshufb           m0, [splat_mv_shuf]
+    movifnidn      bh4d, bh4m
+    pshufd           m1, m0, q2102
+    pshufd           m2, m0, q1021
+    add            bw4q, aq
+.loop:
+    mov              aq, [rrq]
+    add             rrq, gprsize
+    lea              aq, [aq+bx4q*4]
+    jmp            bw4q
+.w32:
+    mova      [aq-32*8], m0
+    mova      [aq-32*7], m1
+    mova      [aq-32*6], m2
+    mova      [aq-32*5], m0
+    mova      [aq-32*4], m1
+    mova      [aq-32*3], m2
+.w16:
+    mova      [aq-32*2], m0
+    mova      [aq-32*1], m1
+    mova      [aq+32*0], m2
+.w8:
+    mova      [aq+32*1], m0
+    mova      [aq+32*2], m1
+    mova      [aq+32*3], m2
+    dec            bh4d
+    jg .loop
+    RET
+.w4:
+    movu      [aq+ 80], m0
+    mova      [aq+112], xm1
+    dec            bh4d
+    jg .loop
+    RET
+.w2:
+    movu      [aq+104], xm0
+    movq      [aq+120], xm2
+    dec            bh4d
+    jg .loop
+    RET
+.w1:
+    movq      [aq+116], xm0
+    movd      [aq+124], xm1
+    dec            bh4d
+    jg .loop
+    RET
+
+INIT_ZMM avx512icl
+cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
+    vbroadcasti32x4  m0, [aq]
+    lea              r1, [splat_mv_avx512icl_table]
+    tzcnt          bw4d, bw4d
+    lea            bx4d, [bx4q*3]
+    pshufb           m0, [splat_mv_shuf]
+    movsxd         bw4q, [r1+bw4q*4]
+    mov             r6d, bh4m
+    add            bw4q, r1
+    lea             rrq, [rrq+r6*8]
+    mov             r1d, 0x3f
+    neg              r6
+    kmovb            k1, r1d
+    jmp            bw4q
+.w1:
+    mov              r1, [rrq+r6*8]
+    vmovdqu16 [r1+bx4q*4]{k1}, xm0
+    inc              r6
+    jl .w1
+    RET
+.w2:
+    mov              r1, [rrq+r6*8]
+    vmovdqu32 [r1+bx4q*4]{k1}, ym0
+    inc              r6
+    jl .w2
+    RET
+.w4:
+    mov              r1, [rrq+r6*8]
+    vmovdqu64 [r1+bx4q*4]{k1}, m0
+    inc              r6
+    jl .w4
+    RET
+.w8:
+    pshufd          ym1, ym0, q1021
+.w8_loop:
+    mov              r1, [rrq+r6*8+0]
+    mov              r3, [rrq+r6*8+8]
+    movu  [r1+bx4q*4+ 0], m0
+    mova  [r1+bx4q*4+64], ym1
+    movu  [r3+bx4q*4+ 0], m0
+    mova  [r3+bx4q*4+64], ym1
+    add              r6, 2
+    jl .w8_loop
+    RET
+.w16:
+    pshufd           m1, m0, q1021
+    pshufd           m2, m0, q2102
+.w16_loop:
+    mov              r1, [rrq+r6*8+0]
+    mov              r3, [rrq+r6*8+8]
+    mova  [r1+bx4q*4+64*0], m0
+    mova  [r1+bx4q*4+64*1], m1
+    mova  [r1+bx4q*4+64*2], m2
+    mova  [r3+bx4q*4+64*0], m0
+    mova  [r3+bx4q*4+64*1], m1
+    mova  [r3+bx4q*4+64*2], m2
+    add              r6, 2
+    jl .w16_loop
+    RET
+.w32:
+    pshufd           m1, m0, q1021
+    pshufd           m2, m0, q2102
+.w32_loop:
+    mov              r1, [rrq+r6*8]
+    lea              r1, [r1+bx4q*4]
+    mova      [r1+64*0], m0
+    mova      [r1+64*1], m1
+    mova      [r1+64*2], m2
+    mova      [r1+64*3], m0
+    mova      [r1+64*4], m1
+    mova      [r1+64*5], m2
+    inc              r6
+    jl .w32_loop
+    RET
+%endif ; ARCH_X86_64
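
For orientation, below is a minimal C sketch of the behaviour the three SIMD versions above implement, inferred from the signature comment in the diff (refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4). The 12-byte record size is an assumption read off splat_mv_shuf, which repeats the byte pattern 0..11; the struct layout and the name splat_mv_c are illustrative, not taken from this commit.

    /* Assumed 12-byte record: splat_mv_shuf repeats bytes 0..11,
     * so one refmvs_block is taken to be 12 bytes wide. */
    typedef struct refmvs_block {
        unsigned char bytes[12];
    } refmvs_block;

    /* Copy the block *a into bw4 consecutive slots starting at column
     * bx4 of each of bh4 rows; rr[] holds one pointer per row. */
    static void splat_mv_c(refmvs_block **rr, const refmvs_block *a,
                           int bx4, int bw4, int bh4)
    {
        do {
            refmvs_block *row = *rr++ + bx4;
            for (int x = 0; x < bw4; x++)
                row[x] = *a;
        } while (--bh4);
    }

The assembly removes the inner loop by dispatching through the JMP_TABLE offsets: tzcnt turns the power-of-two bw4 into a table index, and the wide cases (.w32, .w16, .w8) fall straight through into the narrower ones so each store sequence is emitted only once. The *3 scaling of bx4 combined with *4 addressing likewise reflects the assumed 12-byte block size, and because bw4 is added to bx4 up front, the SSE2 and AVX2 stores are addressed from the end of the written span, which is why the wide cases use negative offsets.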