author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
commit     36d22d82aa202bb199967e9512281e9a53db42c9
tree       105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/dav1d/src/x86/ipred16_avx512.asm
parent     Initial commit.
Adding upstream version 115.7.0esr. (upstream/115.7.0esr)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/ipred16_avx512.asm')
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx512.asm  833
1 file changed, 833 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm
new file mode 100644
index 0000000000..1a307adc98
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx512.asm
@@ -0,0 +1,833 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+ipred_shuf:    db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3
+               db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11
+               db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7
+               db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15
+smooth_perm:   db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
+               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
+               db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+               db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+               db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+filter_permA:  times 4 db 6, 7, 8, 9, 14, 15, 4, 5
+               times 4 db 10, 11, 12, 13, 2, 3, -1, -1
+filter_permB:  times 4 db 22, 23, 24, 25, 30, 31, 6, 7
+               times 4 db 26, 27, 28, 29, 14, 15, -1, -1
+filter_permC:  dd 8 ; dq 8, 10, 1, 11, 0, 9
+pw_1:          times 2 dw 1
+               dd 10
+filter_rnd:    dd 32
+               dd 1
+               dd 8
+               dd 11
+filter_shift:  times 2 dw 6
+               dd 0
+               times 2 dw 4
+               dd 9
+
+%macro JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - 2*4)
+    %xdefine %%base mangle(private_prefix %+ _%1_%2)
+    %%table:
+    %rep %0 - 2
+        dd %%base %+ .%3 - (%%table - 2*4)
+        %rotate 1
+    %endrep
+%endmacro
+
+JMP_TABLE ipred_paeth_16bpc,    avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc,   avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred_16bpc,       avx512icl, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern filter_intra_taps
+
+SECTION .text
+
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+    paddw m0, m%2, m2
+    psubw m1, m0, m3 ; tldiff
+    psubw m0, m%1    ; tdiff
+    pabsw m1, m1
+    pabsw m0, m0
+    pcmpgtw k1, m0, m1
+    pminsw m0, m1
+    pcmpgtw k2, m%3, m0
+    vpblendmw m0{k1}, m%1, m3
+    vpblendmw m0{k2}, m2, m0
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
+%define base r6-ipred_paeth_16bpc_avx512icl_table
+    lea r6, [ipred_paeth_16bpc_avx512icl_table]
+    tzcnt wd, wm
+    movifnidn hd, hm
+    movsxd wq, [r6+wq*4]
+    vpbroadcastw m3, [tlq] ; topleft
+    add wq, r6
+    jmp wq
+.w4:
+    vpbroadcastq m4, [tlq+2] ; top
+    movsldup m7, [base+ipred_shuf]
+    lea r6, [strideq*3]
+    psubw m5, m4, m3
+    pabsw m6, m5
+.w4_loop:
+    sub tlq, 16
+    vbroadcasti32x4 m2, [tlq]
+    pshufb m2, m7 ; left
+    PAETH 4, 5, 6
+    vextracti32x4 xm1, m0, 2
+    vextracti32x4 xm8, ym0, 1
+    vextracti32x4 xm9, m0, 3
+    movq [dstq+strideq*0], xm0
+    movq [dstq+strideq*1], xm1
+    movq [dstq+strideq*2], xm8
+    movq [dstq+r6 ], xm9
+    sub hd, 8
+    jl .w4_end
+    lea dstq, [dstq+strideq*4]
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm8
+    movhps [dstq+r6 ], xm9
+    lea dstq, [dstq+strideq*4]
+    jg .w4_loop
+.w4_end:
+    RET
+.w8:
+    vbroadcasti32x4 m4, [tlq+2]
+    movsldup m7, [base+ipred_shuf]
+    lea r6, [strideq*3]
+    psubw m5, m4, m3
+    pabsw m6, m5
+.w8_loop:
+    sub tlq, 8
+    vpbroadcastq m2, [tlq]
+    pshufb m2, m7
+    PAETH 4, 5, 6
+    mova [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+r6 ], m0, 3
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4
+    jg .w8_loop
+    RET
+.w16:
+    vbroadcasti32x8 m4, [tlq+2]
+    movsldup m7, [base+ipred_shuf]
+    psubw m5, m4, m3
+    pabsw m6, m5
+.w16_loop:
+    sub tlq, 4
+    vpbroadcastd m2, [tlq]
+    pshufb m2, m7
+    PAETH 4, 5, 6
+    mova [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea dstq, [dstq+strideq*2]
+    sub hd, 2
+    jg .w16_loop
+    RET
+.w32:
+    movu m4, [tlq+2]
+    psubw m5, m4, m3
+    pabsw m6, m5
+.w32_loop:
+    sub tlq, 2
+    vpbroadcastw m2, [tlq]
+    PAETH 4, 5, 6
+    mova [dstq], m0
+    add dstq, strideq
+    dec hd
+    jg .w32_loop
+    RET
+.w64:
+    movu m4, [tlq+ 2]
+    movu m7, [tlq+66]
+    psubw m5, m4, m3
+    psubw m8, m7, m3
+    pabsw m6, m5
+    pabsw m9, m8
+.w64_loop:
+    sub tlq, 2
+    vpbroadcastw m2, [tlq]
+    PAETH 4, 5, 6
+    mova [dstq+64*0], m0
+    PAETH 7, 8, 9
+    mova [dstq+64*1], m0
+    add dstq, strideq
+    dec hd
+    jg .w64_loop
+    RET
+
+cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
+%define base r6-$$
+    lea r6, [$$]
+    tzcnt wd, wm
+    mov hd, hm
+    movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
+    lea weightsq, [base+smooth_weights_1d_16bpc+hq*4]
+    neg hq
+    vpbroadcastw m6, [tlq+hq*2] ; bottom
+    lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
+    lea stride3q, [strideq*3]
+    jmp wq
+.w4:
+    vpbroadcastq m5, [tlq+2] ; top
+    movsldup m4, [ipred_shuf]
+    psubw m5, m6 ; top - bottom
+.w4_loop:
+    vbroadcasti32x4 m3, [weightsq+hq*2]
+    pshufb m3, m4
+    pmulhrsw m3, m5
+    paddw m3, m6
+    vextracti32x4 xm0, m3, 3
+    vextracti32x4 xm1, ym3, 1
+    vextracti32x4 xm2, m3, 2
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm2
+    movhps [dstq+stride3q ], xm3
+    add hq, 8
+    jg .end
+    lea dstq, [dstq+strideq*4]
+    movq [dstq+strideq*0], xm0
+    movq [dstq+strideq*1], xm1
+    movq [dstq+strideq*2], xm2
+    movq [dstq+stride3q ], xm3
+    lea dstq, [dstq+strideq*4]
+    jl .w4_loop
+.end:
+    RET
+.w8:
+    vbroadcasti32x4 m5, [tlq+2] ; top
+    movsldup m4, [ipred_shuf]
+    psubw m5, m6 ; top - bottom
+.w8_loop:
+    vpbroadcastq m0, [weightsq+hq*2]
+    pshufb m0, m4
+    pmulhrsw m0, m5
+    paddw m0, m6
+    vextracti32x4 [dstq+strideq*0], m0, 3
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    vextracti32x4 [dstq+strideq*2], m0, 2
+    mova [dstq+stride3q ], xm0
+    lea dstq, [dstq+strideq*4]
+    add hq, 4
+    jl .w8_loop
+    RET
+.w16:
+    vbroadcasti32x8 m5, [tlq+2] ; top
+    movsldup m4, [ipred_shuf]
+    psubw m5, m6 ; top - bottom
+.w16_loop:
+    vpbroadcastd m0, [weightsq+hq*2+0]
+    vpbroadcastd m1, [weightsq+hq*2+4]
+    pshufb m0, m4
+    pshufb m1, m4
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    paddw m0, m6
+    paddw m1, m6
+    vextracti32x8 [dstq+strideq*0], m0, 1
+    mova [dstq+strideq*1], ym0
+    vextracti32x8 [dstq+strideq*2], m1, 1
+    mova [dstq+stride3q ], ym1
+    lea dstq, [dstq+strideq*4]
+    add hq, 4
+    jl .w16_loop
+    RET
+.w32:
+    movu m5, [tlq+2]
+    psubw m5, m6
+.w32_loop:
+    vpbroadcastw m0, [weightsq+hq*2+0]
+    vpbroadcastw m1, [weightsq+hq*2+2]
+    vpbroadcastw m2, [weightsq+hq*2+4]
+    vpbroadcastw m3, [weightsq+hq*2+6]
+    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+    REPX {paddw x, m6}, m0, m1, m2, m3
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    mova [dstq+strideq*2], m2
+    mova [dstq+stride3q ], m3
+    lea dstq, [dstq+strideq*4]
+    add hq, 4
+    jl .w32_loop
+    RET
+.w64:
+    movu m4, [tlq+ 2]
+    movu m5, [tlq+66]
+    psubw m4, m6
+    psubw m5, m6
+.w64_loop:
+    vpbroadcastw m1, [weightsq+hq*2+0]
+    vpbroadcastw m3, [weightsq+hq*2+2]
+    pmulhrsw m0, m4, m1
+    pmulhrsw m1, m5
+    pmulhrsw m2, m4, m3
+    pmulhrsw m3, m5
+    REPX {paddw x, m6}, m0, m1, m2, m3
+    mova [dstq+strideq*0+64*0], m0
+    mova [dstq+strideq*0+64*1], m1
+    mova [dstq+strideq*1+64*0], m2
+    mova [dstq+strideq*1+64*1], m3
+    lea dstq, [dstq+strideq*2]
+    add hq, 2
+    jl .w64_loop
+    RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
+    lea r6, [$$]
+    mov wd, wm
+    movifnidn hd, hm
+    vpbroadcastw m6, [tlq+wq*2] ; right
+    tzcnt wd, wd
+    add hd, hd
+    movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
+    sub tlq, hq
+    lea stride3q, [strideq*3]
+    lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
+    jmp wq
+.w4:
+    movsldup m4, [base+ipred_shuf]
+    vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2]
+.w4_loop:
+    vbroadcasti32x4 m0, [tlq+hq-16] ; left
+    pshufb m0, m4
+    psubw m0, m6 ; left - right
+    pmulhrsw m0, m5
+    paddw m0, m6
+    vextracti32x4 xm1, m0, 2
+    vextracti32x4 xm2, ym0, 1
+    vextracti32x4 xm3, m0, 3
+    movq [dstq+strideq*0], xm0
+    movq [dstq+strideq*1], xm1
+    movq [dstq+strideq*2], xm2
+    movq [dstq+stride3q ], xm3
+    sub hd, 8*2
+    jl .end
+    lea dstq, [dstq+strideq*4]
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm2
+    movhps [dstq+stride3q ], xm3
+    lea dstq, [dstq+strideq*4]
+    jg .w4_loop
+.end:
+    RET
+.w8:
+    movsldup m4, [base+ipred_shuf]
+    vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2]
+.w8_loop:
+    vpbroadcastq m0, [tlq+hq-8] ; left
+    pshufb m0, m4
+    psubw m0, m6 ; left - right
+    pmulhrsw m0, m5
+    paddw m0, m6
+    mova [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4*2
+    jg .w8_loop
+    RET
+.w16:
+    movsldup m4, [base+ipred_shuf]
+    vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+    vpbroadcastd m0, [tlq+hq-4]
+    vpbroadcastd m1, [tlq+hq-8]
+    pshufb m0, m4
+    pshufb m1, m4
+    psubw m0, m6
+    psubw m1, m6
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    paddw m0, m6
+    paddw m1, m6
+    mova [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    mova [dstq+strideq*2], ym1
+    vextracti32x8 [dstq+stride3q ], m1, 1
+    lea dstq, [dstq+strideq*4]
+    sub hq, 4*2
+    jg .w16_loop
+    RET
+.w32:
+    movu m5, [base+smooth_weights_1d_16bpc+32*2]
+.w32_loop:
+    vpbroadcastq m3, [tlq+hq-8]
+    punpcklwd m3, m3
+    psubw m3, m6
+    pshufd m0, m3, q3333
+    pshufd m1, m3, q2222
+    pshufd m2, m3, q1111
+    pshufd m3, m3, q0000
+    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+    REPX {paddw x, m6}, m0, m1, m2, m3
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    mova [dstq+strideq*2], m2
+    mova [dstq+stride3q ], m3
+    lea dstq, [dstq+strideq*4]
+    sub hq, 4*2
+    jg .w32_loop
+    RET
+.w64:
+    movu m4, [base+smooth_weights_1d_16bpc+64*2]
+    movu m5, [base+smooth_weights_1d_16bpc+64*3]
+.w64_loop:
+    vpbroadcastw m1, [tlq+hq-2]
+    vpbroadcastw m3, [tlq+hq-4]
+    psubw m1, m6
+    psubw m3, m6
+    pmulhrsw m0, m4, m1
+    pmulhrsw m1, m5
+    pmulhrsw m2, m4, m3
+    pmulhrsw m3, m5
+    REPX {paddw x, m6}, m0, m1, m2, m3
+    mova [dstq+strideq*0+64*0], m0
+    mova [dstq+strideq*0+64*1], m1
+    mova [dstq+strideq*1+64*0], m2
+    mova [dstq+strideq*1+64*1], m3
+    lea dstq, [dstq+strideq*2]
+    sub hq, 2*2
+    jg .w64_loop
+    RET
+
+cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
+    lea r6, [$$]
+    mov wd, wm
+    movifnidn hd, hm
+    vpbroadcastw m13, [tlq+wq*2] ; right
+    tzcnt wd, wd
+    add hd, hd
+    movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
+    mov r5d, 0x55555555
+    sub tlq, hq
+    mova m14, [base+smooth_perm]
+    kmovd k1, r5d
+    vpbroadcastw m0, [tlq] ; bottom
+    mov r5, 0x3333333333333333
+    pxor m15, m15
+    lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
+    kmovq k2, r5
+    lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
+    jmp wq
+.w4:
+    vpbroadcastq m5, [tlq+hq+2]
+    movshdup m3, [base+ipred_shuf]
+    movsldup m4, [base+ipred_shuf]
+    vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4]
+    lea stride3q, [strideq*3]
+    punpcklwd m5, m0 ; top, bottom
+.w4_loop:
+    vbroadcasti32x4 m0, [v_weightsq]
+    vpbroadcastq m2, [tlq+hq-8]
+    mova m1, m13
+    pshufb m0, m3
+    pmaddwd m0, m5
+    pshufb m1{k2}, m2, m4 ; left, right
+    vpdpwssd m0, m1, m6
+    vpermb m0, m14, m0
+    pavgw ym0, ym15
+    vextracti32x4 xm1, ym0, 1
+    movq [dstq+strideq*0], xm0
+    movq [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    lea dstq, [dstq+strideq*4]
+    add v_weightsq, 4*4
+    sub hd, 4*2
+    jg .w4_loop
+    RET
+.w8:
+    vbroadcasti32x4 ym5, [tlq+hq+2]
+    movshdup m6, [base+ipred_shuf]
+    movsldup m7, [base+ipred_shuf]
+    pmovzxwd m5, ym5
+    vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4]
+    lea stride3q, [strideq*3]
+    vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w8_loop:
+    vpbroadcastq m0, [v_weightsq+0]
+    vpbroadcastq m1, [v_weightsq+8]
+    vpbroadcastd m3, [tlq+hq-4]
+    vpbroadcastd m4, [tlq+hq-8]
+    pshufb m0, m6
+    pmaddwd m0, m5
+    pshufb m1, m6
+    pmaddwd m1, m5
+    mova m2, m13
+    pshufb m2{k2}, m3, m7 ; left, right
+    mova m3, m13
+    pshufb m3{k2}, m4, m7
+    vpdpwssd m0, m2, m8
+    vpdpwssd m1, m3, m8
+    add v_weightsq, 4*4
+    vpermt2b m0, m14, m1
+    pavgw m0, m15
+    mova [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    vextracti32x4 [dstq+strideq*2], m0, 2
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4*2
+    jg .w8_loop
+    RET
+.w16:
+    pmovzxwd m5, [tlq+hq+2]
+    mova m6, [base+smooth_weights_2d_16bpc+16*4]
+    vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w16_loop:
+    vpbroadcastd m0, [v_weightsq+0]
+    vpbroadcastd m1, [v_weightsq+4]
+    pmaddwd m0, m5
+    pmaddwd m1, m5
+    mova m2, m13
+    vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right
+    mova m3, m13
+    vpbroadcastw m3{k1}, [tlq+hq-4]
+    vpdpwssd m0, m2, m6
+    vpdpwssd m1, m3, m6
+    add v_weightsq, 2*4
+    vpermt2b m0, m14, m1
+    pavgw m0, m15
+    mova [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea dstq, [dstq+strideq*2]
+    sub hq, 2*2
+    jg .w16_loop
+    RET
+.w32:
+    pmovzxwd m5, [tlq+hq+ 2]
+    pmovzxwd m6, [tlq+hq+34]
+    mova m7, [base+smooth_weights_2d_16bpc+32*4]
+    mova m8, [base+smooth_weights_2d_16bpc+32*6]
+    vpblendmw m5{k1}, m0, m5 ; top, bottom
+    vpblendmw m6{k1}, m0, m6
+.w32_loop:
+    vpbroadcastd m2, [v_weightsq+0]
+    vpbroadcastd m3, [v_weightsq+4]
+    pmaddwd m0, m5, m2
+    pmaddwd m2, m6
+    pmaddwd m1, m5, m3
+    pmaddwd m3, m6
+    mova m4, m13
+    vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+    vpdpwssd m0, m4, m7
+    vpdpwssd m2, m4, m8
+    mova m4, m13
+    vpbroadcastw m4{k1}, [tlq+hq-4]
+    vpdpwssd m1, m4, m7
+    vpdpwssd m3, m4, m8
+    add v_weightsq, 2*4
+    vpermt2b m0, m14, m2
+    vpermt2b m1, m14, m3
+    pavgw m0, m15
+    pavgw m1, m15
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    lea dstq, [dstq+strideq*2]
+    sub hq, 2*2
+    jg .w32_loop
+    RET
+.w64:
+    pmovzxwd m5, [tlq+hq+ 2]
+    pmovzxwd m6, [tlq+hq+34]
+    pmovzxwd m7, [tlq+hq+66]
+    pmovzxwd m8, [tlq+hq+98]
+    mova m9, [base+smooth_weights_2d_16bpc+64*4]
+    vpblendmw m5{k1}, m0, m5 ; top, bottom
+    mova m10, [base+smooth_weights_2d_16bpc+64*5]
+    vpblendmw m6{k1}, m0, m6
+    mova m11, [base+smooth_weights_2d_16bpc+64*6]
+    vpblendmw m7{k1}, m0, m7
+    mova m12, [base+smooth_weights_2d_16bpc+64*7]
+    vpblendmw m8{k1}, m0, m8
+.w64_loop:
+    vpbroadcastd m3, [v_weightsq]
+    mova m4, m13
+    vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+    pmaddwd m0, m5, m3
+    pmaddwd m2, m6, m3
+    pmaddwd m1, m7, m3
+    pmaddwd m3, m8
+    vpdpwssd m0, m4, m9
+    vpdpwssd m2, m4, m10
+    vpdpwssd m1, m4, m11
+    vpdpwssd m3, m4, m12
+    add v_weightsq, 1*4
+    vpermt2b m0, m14, m2
+    vpermt2b m1, m14, m3
+    pavgw m0, m15
+    pavgw m1, m15
+    mova [dstq+64*0], m0
+    mova [dstq+64*1], m1
+    add dstq, strideq
+    sub hd, 1*2
+    jg .w64_loop
+    RET
+
+cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
+    lea r6, [pal_pred_16bpc_avx512icl_table]
+    tzcnt wd, wm
+    mova m2, [pal_pred_perm]
+    movsxd wq, [r6+wq*4]
+    mova xm3, [palq]
+    movifnidn hd, hm
+    add wq, r6
+    lea stride3q, [strideq*3]
+    jmp wq
+.w4:
+    pmovzxbw ym0, [idxq]
+    add idxq, 16
+    vpermw ym0, ym0, ym3
+    vextracti32x4 xm1, ym0, 1
+    movq [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    movq [dstq+strideq*2], xm1
+    movhps [dstq+stride3q ], xm1
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4
+    jg .w4
+    RET
+.w8:
+    pmovzxbw m0, [idxq]
+    add idxq, 32
+    vpermw m0, m0, m3
+    mova [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    vextracti32x4 [dstq+strideq*2], m0, 2
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4
+    jg .w8
+    RET
+.w16:
+    vpermb m1, m2, [idxq]
+    add idxq, 64
+    vpermw m0, m1, m3
+    psrlw m1, 8
+    vpermw m1, m1, m3
+    mova [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    mova [dstq+strideq*2], ym1
+    vextracti32x8 [dstq+stride3q ], m1, 1
+    lea dstq, [dstq+strideq*4]
+    sub hd, 4
+    jg .w16
+    RET
+.w32:
+    vpermb m1, m2, [idxq]
+    add idxq, 64
+    vpermw m0, m1, m3
+    psrlw m1, 8
+    vpermw m1, m1, m3
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    lea dstq, [dstq+strideq*2]
+    sub hd, 2
+    jg .w32
+    RET
+.w64:
+    vpermb m1, m2, [idxq]
+    add idxq, 64
+    vpermw m0, m1, m3
+    psrlw m1, 8
+    vpermw m1, m1, m3
+    mova [dstq+64*0], m0
+    mova [dstq+64*1], m1
+    add dstq, strideq
+    dec hd
+    jg .w64
+    RET
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row.
+; w4    w8      w16            w32
+; 1     1 2     1 2 5 6        1 2 5 6 9 a d e
+; 2     2 3     2 3 6 7        2 3 6 7 a b e f
+; 3     3 4     3 4 7 8        3 4 7 8 b c f g
+; 4     4 5     4 5 8 9        4 5 8 9 c d g h
+
+cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top
+%define base r6-$$
+    lea r6, [$$]
+%ifidn filterd, filterm
+    movzx filterd, filterb
+%else
+    movzx filterd, byte filterm
+%endif
+    shl filterd, 6
+    movifnidn hd, hm
+    movu xm0, [tlq-6]
+    pmovsxbw m7, [base+filter_intra_taps+filterq+32*0]
+    pmovsxbw m8, [base+filter_intra_taps+filterq+32*1]
+    mov r5d, r8m ; bitdepth_max
+    movsldup m9, [base+filter_permA]
+    movshdup m10, [base+filter_permA]
+    shr r5d, 11 ; is_12bpc
+    jnz .12bpc
+    psllw m7, 2 ; upshift multipliers so that packusdw
+    psllw m8, 2 ; will perform clipping for free
+.12bpc:
+    vpbroadcastd m5, [base+filter_rnd+r5*8]
+    vpbroadcastd m6, [base+filter_shift+r5*8]
+    sub wd, 8
+    jl .w4
+.w8:
+    call .main4
+    movsldup m11, [filter_permB]
+    lea r5d, [hq*2+2]
+    movshdup m12, [filter_permB]
+    lea topq, [tlq+2]
+    mova m13, [filter_permC]
+    sub hd, 4
+    vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1
+    sub tlq, r5
+%if WIN64
+    push r7
+    push r8
+%endif
+    mov r7, dstq
+    mov r8d, hd
+.w8_loop:
+    movlps xm4, xm0, [tlq+hq*2]
+    call .main8
+    lea dstq, [dstq+strideq*2]
+    sub hd, 2
+    jge .w8_loop
+    test wd, wd
+    jz .end
+    mov r2d, 0x0d
+    kmovb k1, r2d
+    lea r2, [strideq*3]
+.w16:
+    movd xmm0, [r7+strideq*1+12]
+    vpblendd xmm0, [topq+8], 0x0e ; t1 t2
+    pinsrw xm4, xmm0, [r7+strideq*0+14], 2
+    call .main8
+    add r7, 16
+    vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3
+    mov hd, r8d
+    mov dstq, r7
+    add topq, 16
+.w16_loop:
+    movd xmm1, [dstq+strideq*2-4]
+    punpcklwd xm4, xmm1, xmm0
+    movd xmm0, [dstq+r2-4]
+    shufps xm4{k1}, xmm0, xm0, q3210
+    call .main8
+    lea dstq, [dstq+strideq*2]
+    sub hd, 2
+    jge .w16_loop
+    sub wd, 8
+    jg .w16
+.end:
+    vpermb m2, m11, m0
+    mova ym1, ym5
+    vpdpwssd m1, m2, m7
+    vpermb m2, m12, m0
+    vpdpwssd m1, m2, m8
+%if WIN64
+    pop r8
+    pop r7
+%endif
+    vextracti32x8 ym2, m1, 1
+    paddd ym1, ym2
+    packusdw ym1, ym1
+    vpsrlvw ym1, ym6
+    vpermt2q m0, m13, m1
+    vextracti32x4 [dstq+strideq*0], m0, 2
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    RET
+.w4_loop:
+    movlps xm0, [tlq-10]
+    lea dstq, [dstq+strideq*2]
+    sub tlq, 4
+.w4:
+    call .main4
+    movq [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    sub hd, 2
+    jg .w4_loop
+    RET
+ALIGN function_align
+.main4:
+    vpermb m2, m9, m0
+    mova ym1, ym5
+    vpdpwssd m1, m2, m7
+    vpermb m0, m10, m0
+    vpdpwssd m1, m0, m8
+    vextracti32x8 ym0, m1, 1
+    paddd ym0, ym1
+    vextracti32x4 xm1, ym0, 1
+    packusdw xm0, xm1 ; clip
+    vpsrlvw xm0, xm6
+    ret
+ALIGN function_align
+.main8:
+    vpermb m3, m11, m0
+    mova ym2, ym5
+    vpdpwssd m2, m3, m7
+    vpermb m3, m9, m4
+    mova ym1, ym5
+    vpdpwssd m1, m3, m7
+    vpermb m3, m12, m0
+    vpdpwssd m2, m3, m8
+    vpermb m3, m10, m4
+    vpdpwssd m1, m3, m8
+    vextracti32x8 ym4, m2, 1
+    vextracti32x8 ym3, m1, 1
+    paddd ym2, ym4
+    paddd ym1, ym3
+    packusdw ym1, ym2 ; clip
+    vpsrlvw ym1, ym6
+    vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1
+    vextracti32x4 [dstq+strideq*0], m0, 2
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    ret
+
+%endif