author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 09:22:09 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 09:22:09 +0000
commit    43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree      620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/dav1d/src/x86/refmvs.asm
parent    Initial commit. (diff)

Adding upstream version 110.0.1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/refmvs.asm')
-rw-r--r-- third_party/dav1d/src/x86/refmvs.asm | 248
1 file changed, 248 insertions(+), 0 deletions(-)
diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm
new file mode 100644
index 0000000000..fb4ca1033a
--- /dev/null
+++ b/third_party/dav1d/src/x86/refmvs.asm
@@ -0,0 +1,248 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
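+; Emits <name>_table: one 32-bit entry per block width, each holding
+; the offset of that function's .w<N> label relative to the table
+; base, so the dispatch code can rebuild the target address at
+; runtime in a position-independent way.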
+%macro JMP_TABLE 2-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %1_table:
+ %xdefine %%base %1_table
+ %rep %0 - 1
+ dd %%prefix %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%if ARCH_X86_64
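+; byte i of this table is i % 12, so a pshufb with it replicates the
+; leading 12-byte refmvs_block across a full ymm/zmm register.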
+splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
+ db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
+
+JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
+JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
+%endif
+JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32
+
+SECTION .text
+
+INIT_XMM sse2
+; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
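+; Fills bh4 rows of bw4 refmvs_blocks (12 bytes each), starting at
+; column bx4, with the block at *a. bx4 += bw4 points one past the
+; last column, so store offsets count back from the segment's end and
+; the wider entry points (.w32, .w16) fall through the narrower
+; labels' stores. tzcnt turns the power-of-two bw4 into a jump-table
+; index; the *3 below, combined with the *4 addressing scale in
+; .loop, converts block columns into byte offsets, and the -32 bias
+; (-128 bytes after scaling) lets most stores use byte displacements.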
+ add bx4d, bw4d
+ tzcnt bw4d, bw4d
+ mova m2, [aq]
+ LEA aq, splat_mv_sse2_table
+ lea bx4q, [bx4q*3-32]
+ movsxd bw4q, [aq+bw4q*4]
+ movifnidn bh4d, bh4m
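+; Rotating the 12-byte pattern by 0, 4 and 8 bytes yields three
+; registers whose concatenation (48 bytes) is four whole blocks, so
+; consecutive 16-byte stores can simply cycle through m0, m1, m2.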
+ pshufd m0, m2, q0210
+ pshufd m1, m2, q1021
+ pshufd m2, m2, q2102
+ add bw4q, aq
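+; rr is an array of per-row refmvs_block pointers; each pass loads
+; the next row base, offsets it to the target column, and jumps into
+; the store sequence selected for this width.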
+.loop:
+ mov aq, [rrq]
+ add rrq, gprsize
+ lea aq, [aq+bx4q*4]
+ jmp bw4q
+.w32:
+ mova [aq-16*16], m0
+ mova [aq-16*15], m1
+ mova [aq-16*14], m2
+ mova [aq-16*13], m0
+ mova [aq-16*12], m1
+ mova [aq-16*11], m2
+ mova [aq-16*10], m0
+ mova [aq-16* 9], m1
+ mova [aq-16* 8], m2
+ mova [aq-16* 7], m0
+ mova [aq-16* 6], m1
+ mova [aq-16* 5], m2
+.w16:
+ mova [aq-16* 4], m0
+ mova [aq-16* 3], m1
+ mova [aq-16* 2], m2
+ mova [aq-16* 1], m0
+ mova [aq+16* 0], m1
+ mova [aq+16* 1], m2
+.w8:
+ mova [aq+16* 2], m0
+ mova [aq+16* 3], m1
+ mova [aq+16* 4], m2
+.w4:
+ mova [aq+16* 5], m0
+ mova [aq+16* 6], m1
+ mova [aq+16* 7], m2
+ dec bh4d
+ jg .loop
+ RET
+.w2:
+ movu [aq+104], m0
+ movq [aq+120], m1
+ dec bh4d
+ jg .loop
+ RET
+.w1:
+ movq [aq+116], m0
+ movd [aq+124], m2
+ dec bh4d
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
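+; Same end-relative scheme as the SSE2 version, but with 32-byte
+; registers: pshufb against splat_mv_shuf replicates the 12-byte
+; block across the ymm, and the two dword rotations again give a
+; 96-byte (eight-block) store period across m0..m2.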
+ add bx4d, bw4d
+ tzcnt bw4d, bw4d
+ vbroadcasti128 m0, [aq]
+ lea aq, [splat_mv_avx2_table]
+ lea bx4q, [bx4q*3-32]
+ movsxd bw4q, [aq+bw4q*4]
+ pshufb m0, [splat_mv_shuf]
+ movifnidn bh4d, bh4m
+ pshufd m1, m0, q2102
+ pshufd m2, m0, q1021
+ add bw4q, aq
+.loop:
+ mov aq, [rrq]
+ add rrq, gprsize
+ lea aq, [aq+bx4q*4]
+ jmp bw4q
+.w32:
+ mova [aq-32*8], m0
+ mova [aq-32*7], m1
+ mova [aq-32*6], m2
+ mova [aq-32*5], m0
+ mova [aq-32*4], m1
+ mova [aq-32*3], m2
+.w16:
+ mova [aq-32*2], m0
+ mova [aq-32*1], m1
+ mova [aq+32*0], m2
+.w8:
+ mova [aq+32*1], m0
+ mova [aq+32*2], m1
+ mova [aq+32*3], m2
+ dec bh4d
+ jg .loop
+ RET
+.w4:
+ movu [aq+ 80], m0
+ mova [aq+112], xm1
+ dec bh4d
+ jg .loop
+ RET
+.w2:
+ movu [aq+104], xm0
+ movq [aq+120], xm2
+ dec bh4d
+ jg .loop
+ RET
+.w1:
+ movq [aq+116], xm0
+ movd [aq+124], xm1
+ dec bh4d
+ jg .loop
+ RET
+
+INIT_ZMM avx512icl
+cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
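+; Here bx4*3 (scaled by 4 in the stores) addresses the segment from
+; its start, and rows are walked with a negative index r6 counting up
+; to zero, rrq having been pre-advanced to the end of the pointer
+; array. k1 = 0x3f masks 6 elements: at word, dword and qword
+; granularity that is 12, 24 and 48 bytes, i.e. exactly 1, 2 or 4
+; refmvs_blocks, so one mask serves the .w1, .w2 and .w4 stores.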
+ vbroadcasti32x4 m0, [aq]
+ lea r1, [splat_mv_avx512icl_table]
+ tzcnt bw4d, bw4d
+ lea bx4d, [bx4q*3]
+ pshufb m0, [splat_mv_shuf]
+ movsxd bw4q, [r1+bw4q*4]
+ mov r6d, bh4m
+ add bw4q, r1
+ lea rrq, [rrq+r6*8]
+ mov r1d, 0x3f
+ neg r6
+ kmovb k1, r1d
+ jmp bw4q
+.w1:
+ mov r1, [rrq+r6*8]
+ vmovdqu16 [r1+bx4q*4]{k1}, xm0
+ inc r6
+ jl .w1
+ RET
+.w2:
+ mov r1, [rrq+r6*8]
+ vmovdqu32 [r1+bx4q*4]{k1}, ym0
+ inc r6
+ jl .w2
+ RET
+.w4:
+ mov r1, [rrq+r6*8]
+ vmovdqu64 [r1+bx4q*4]{k1}, m0
+ inc r6
+ jl .w4
+ RET
+.w8:
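+; blocks with bw4 >= 8 span at least two (and an even number of)
+; rows, so .w8 and .w16 write two rows per iteration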
+ pshufd ym1, ym0, q1021
+.w8_loop:
+ mov r1, [rrq+r6*8+0]
+ mov r3, [rrq+r6*8+8]
+ movu [r1+bx4q*4+ 0], m0
+ mova [r1+bx4q*4+64], ym1
+ movu [r3+bx4q*4+ 0], m0
+ mova [r3+bx4q*4+64], ym1
+ add r6, 2
+ jl .w8_loop
+ RET
+.w16:
+ pshufd m1, m0, q1021
+ pshufd m2, m0, q2102
+.w16_loop:
+ mov r1, [rrq+r6*8+0]
+ mov r3, [rrq+r6*8+8]
+ mova [r1+bx4q*4+64*0], m0
+ mova [r1+bx4q*4+64*1], m1
+ mova [r1+bx4q*4+64*2], m2
+ mova [r3+bx4q*4+64*0], m0
+ mova [r3+bx4q*4+64*1], m1
+ mova [r3+bx4q*4+64*2], m2
+ add r6, 2
+ jl .w16_loop
+ RET
+.w32:
+ pshufd m1, m0, q1021
+ pshufd m2, m0, q2102
+.w32_loop:
+ mov r1, [rrq+r6*8]
+ lea r1, [r1+bx4q*4]
+ mova [r1+64*0], m0
+ mova [r1+64*1], m1
+ mova [r1+64*2], m2
+ mova [r1+64*3], m0
+ mova [r1+64*4], m1
+ mova [r1+64*5], m2
+ inc r6
+ jl .w32_loop
+ RET
+%endif ; ARCH_X86_64