author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree       f435a8308119effd964b339f76abb83a57c29483 /third_party/dav1d/src/x86/ipred_avx512.asm
parent     Initial commit. (diff)
download   firefox-upstream/124.0.1.tar.xz, firefox-upstream/124.0.1.zip
Adding upstream version 124.0.1. (tag: upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/ipred_avx512.asm')
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx512.asm  3143
1 file changed, 3143 insertions, 0 deletions
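The file being added is dav1d's 8-bit AVX-512 (Ice Lake) intra-prediction assembly: DC, vertical, horizontal, Paeth, the smooth modes, palette and the directional z1/z2/z3 predictors, plus the filter-intra tap tables. One recurring trick in the DC functions below is worth spelling out before the raw diff: the predictor is the rounded average of the top and/or left neighbours, and for rectangular blocks the divisor w+h is not a power of two, so the code multiplies by a 16-bit reciprocal with pmulhuw (the 0x5556 and 0x3334 constants) instead of dividing. A rough scalar sketch of that computation, illustrative only (not dav1d's C reference; the function name is mine):

```c
#include <stdint.h>

/* DC predictor: rounded average of the w top and h left neighbours.
 * For rectangular blocks w+h is 12, 20, 24, 40, 48, 80 or 96, so after
 * shifting out the power-of-two factor a division by 3 or 5 remains,
 * which the asm performs as a reciprocal multiply (pmulhuw). */
static uint8_t dc_pred(const uint8_t *top, int w, const uint8_t *left, int h)
{
    unsigned sum = (unsigned)(w + h) >> 1;        /* rounding bias */
    for (int x = 0; x < w; x++) sum += top[x];
    for (int y = 0; y < h; y++) sum += left[y];

    unsigned n = (unsigned)(w + h);
    while (!(n & 1)) { n >>= 1; sum >>= 1; }      /* divide by the 2^k part */
    if (n == 3)      sum = (sum * 0x5556) >> 16;  /* ~sum/3, exact for this value range */
    else if (n == 5) sum = (sum * 0x3334) >> 16;  /* ~sum/5, exact for this value range */
    return (uint8_t)sum;
}
```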
diff --git a/third_party/dav1d/src/x86/ipred_avx512.asm b/third_party/dav1d/src/x86/ipred_avx512.asm new file mode 100644 index 0000000000..de953deba3 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred_avx512.asm @@ -0,0 +1,3143 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __ +filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10 + db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6 + db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0 + db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0 + db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 + db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 + db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0 + db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0 + db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8 + db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4 + db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0 + db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0 + db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8 + db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4 + db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0 + db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0 + db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14 + db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 
12, 0, 0, 0, 1, 12 + db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0 + db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0 +filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31 + db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131 + db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147 + db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163 +filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31 +smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9 + db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13 + db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11 + db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15 +smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 + db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 +smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 + db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95 + db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111 + db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 +ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 + db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 +pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 +pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48 + db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32 + db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14 + db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30 + db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46 + db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62 +z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6 + db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22 + db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38 + db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54 +z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16 + db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32 + db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48 + db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64 +z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8 +z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9 +z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 + db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 +z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72 +z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80 +z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 + db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 + db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56 +z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 24, 25, 25, 26, 
26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 + db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 + db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64 +z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8 + dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16 +z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67 + db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71 + db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75 + db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79 +z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0 + db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1 + db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2 + db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3 +z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1 + db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3 + db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5 + db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7 +z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24 + dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56 +z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32 + dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64 +z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512 + dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512 + dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512 + dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512 +z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512 + dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512 + dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512 + dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 79, 79, 79 +z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 + db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0 + db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16 + +pb_8_56_0_0: db 8, 56, 0, 0 +pb_m4_36: times 2 db -4, 36 +pb_127_m127: times 2 db 127, -127 +pb_8: times 4 db 8 +pb_15: times 4 db 15 +pb_16: times 4 db 16 +pb_31: times 4 db 31 +pb_63: times 4 db 63 +pb_90: times 4 db 90 +pb_128: times 4 db 128 +pw_128: times 2 dw 128 +pw_255: times 2 dw 255 +pw_512: times 2 dw 512 + +%define pb_1 (ipred_h_shuf+24) +%define pb_2 (ipred_h_shuf+20) +%define pb_3 (ipred_h_shuf+16) +%define pb_4 (smooth_shuf +48) +%define pb_7 (ipred_h_shuf+ 0) +%define pb_9 (z_xpos_bc + 8) +%define pb_17 (z_xpos_bc + 0) +%define pb_33 (z_xpos_bc + 4) +%define pd_8 (filter_taps+128) + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4) + +JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_8bpc, 
avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 + +cextern dr_intra_derivative +cextern pb_0to63 + +SECTION .text + +INIT_ZMM avx512icl +cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h + lea r5, [ipred_dc_left_8bpc_avx512icl_table] + movd xm0, wm + tzcnt wd, wm + inc tlq + movifnidn hd, hm + movu ym1, [tlq] + movd xmm3, wd + movsxd r6, [r5+wq*4] + vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + vpdpbusd ym0, ym1, ym2 + add r6, r5 + add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_left_8bpc_avx512icl_table] + mov hd, hm + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movd xm0, hm + movu ym1, [tlq] + movd xmm3, r6d + movsxd r6, [r5+r6*4] + vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + vpdpbusd ym0, ym1, ym2 + add r6, r5 + add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu ym1, [tlq+32] ; unaligned when jumping here from dc_top + vpdpbusd ym0, ym1, ym2 +.h32: + vextracti32x4 xm1, ym0, 1 + paddd xm0, xm1 +.h16: + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 +.h8: + psrlq xm1, xm0, 32 + paddd xm0, xm1 +.h4: + vpsrlvd xm0, xmm3 + lea stride3q, [strideq*3] + vpbroadcastb m0, xm0 + jmp wq + +cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm0, r5d + tzcnt r5d, r5d + movd xmm4, r5d + lea r5, [ipred_dc_8bpc_avx512icl_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd xmm1, [tlq-4] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w4: + movd xmm1, [tlq+1] + vpdpbusd xm0, xmm1, xm3 + cmp hd, 4 + jg .w4_mul + psrlw xmm0, xm0, 3 + jmp .w4_end +.w4_mul: + punpckhqdq xmm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddd xmm1, xm0 + shrx r6d, r6d, r2d + psrlq xmm0, xmm1, 32 + paddd xmm0, xmm1 + movd xmm1, r6d + psrld xmm0, 2 + pmulhuw xmm0, xmm1 +.w4_end: + vpbroadcastb xm0, xmm0 +.s4: + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm0 + movd [dstq+strideq*2], xm0 + movd [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +.h8: + movq xmm1, [tlq-8] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w8: + movq xmm1, [tlq+1] + vextracti32x4 xm2, ym0, 1 + vpdpbusd xm0, xmm1, xm3 + paddd xmm2, xm2, xm0 + punpckhqdq xmm0, xmm2, xmm2 + paddd xmm0, xmm2 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmove r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w8_end: + vpbroadcastb xm0, xmm0 +.s8: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + 
jg .s8 + RET +.h16: + mova xmm1, [tlq-16] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w16: + movu xmm1, [tlq+1] + vextracti32x4 xm2, ym0, 1 + vpdpbusd xm0, xmm1, xm3 + paddd xmm2, xm2, xm0 + punpckhqdq xmm0, xmm2, xmm2 + paddd xmm0, xmm2 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hb, 8|32 + cmovz r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w16_end: + vpbroadcastb xm0, xmm0 +.s16: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +.h32: + mova ym1, [tlq-32] + vpdpbusd ym0, ym1, ym3 + jmp wq +.w32: + movu ym1, [tlq+1] + vpdpbusd ym0, ym1, ym3 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 + punpckhqdq xmm0, xmm1, xmm1 + paddd xmm0, xmm1 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w32_end: + vpbroadcastb ym0, xmm0 +.s32: + mova [dstq+strideq*0], ym0 + mova [dstq+strideq*1], ym0 + mova [dstq+strideq*2], ym0 + mova [dstq+stride3q ], ym0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +.h64: + mova ym1, [tlq-64] + mova ym2, [tlq-32] + vpdpbusd ym0, ym1, ym3 + vpdpbusd ym0, ym2, ym3 + jmp wq +.w64: + movu ym1, [tlq+ 1] + movu ym2, [tlq+33] + vpdpbusd ym0, ym1, ym3 + vpdpbusd ym0, ym2, ym3 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 + punpckhqdq xmm0, xmm1, xmm1 + paddd xmm0, xmm1 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 64 + je .w64_end + mov r6d, 0x33345556 + shrx r6d, r6d, hd + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w64_end: + vpbroadcastb m0, xmm0 +.s64: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s64 + RET + +cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_8bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_8bpc_avx512icl_table] + tzcnt wd, wm + movu m0, [tlq+1] + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3 +%define base r6-ipred_h_8bpc_avx512icl_table + lea r6, [ipred_h_8bpc_avx512icl_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea stride3q, [strideq*3] + sub tlq, hq + add wq, r6 + jmp wq +.w4: + mova xmm1, [base+ipred_h_shuf+16] +.w4_loop: + movd xmm0, [tlq+hq-4] + pshufb xmm0, xmm1 + movd [dstq+strideq*0], xmm0 + pextrd [dstq+strideq*1], xmm0, 1 + pextrd [dstq+strideq*2], xmm0, 2 + pextrd [dstq+stride3q ], xmm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +.w8: + movsldup xmm2, [base+ipred_h_shuf+16] + movshdup xmm3, [base+ipred_h_shuf+16] +.w8_loop: + movd xmm1, [tlq+hq-4] + pshufb xmm0, xmm1, xmm2 + pshufb xmm1, xmm3 + movq [dstq+strideq*0], xmm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + movsldup m1, [base+smooth_shuf] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + pshufb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 
[dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpbroadcastd ym3, [base+pb_1] + vpord m2, m3, [base+pb_2] {1to16} +.w32_loop: + vpbroadcastd m1, [tlq+hq-4] + pshufb m0, m1, m2 + pshufb m1, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32_loop + RET +.w64: + vpbroadcastd m4, [base+pb_3] + vpbroadcastd m5, [base+pb_2] + vpbroadcastd m6, [base+pb_1] + pxor m7, m7 +.w64_loop: + vpbroadcastd m3, [tlq+hq-4] + pshufb m0, m3, m4 + pshufb m1, m3, m5 + pshufb m2, m3, m6 + pshufb m3, m7 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w64_loop + RET + +%macro PAETH 0 + psubusb m1, m5, m4 + psubusb m0, m4, m5 + por m1, m0 ; tdiff + pavgb m2, m6, m4 + vpcmpub k1, m1, m7, 1 ; tdiff < ldiff + vpblendmb m0{k1}, m4, m6 + vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 + psubusb m3, m5, m2 + psubb m2, m4 + psubusb m2, m5 + por m2, m3 + pminub m1, m7 + paddusb m2, m2 + por m2, m4 ; min(tldiff, 255) + vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff + vmovdqu8 m0{k1}, m5 +%endmacro + +cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 + lea r6, [ipred_paeth_8bpc_avx512icl_table] + tzcnt wd, wm + vpbroadcastb m5, [tlq] ; topleft + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1] + lea topq, [tlq+1] + sub tlq, hq + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +INIT_YMM avx512icl +.w4: + vpbroadcastd m6, [topq] + mova m9, [ipred_h_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + vpbroadcastq m4, [tlq+hq-8] + pshufb m4, m9 ; left + PAETH + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm0, 3 + sub hd, 8 + jl .w4_ret + vextracti32x4 xm0, m0, 1 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm0, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_ret: + RET +INIT_ZMM avx512icl +.w8: + vpbroadcastq m6, [topq] + movsldup m9, [smooth_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w8_loop: + vpbroadcastq m4, [tlq+hq-8] + pshufb m4, m9 + PAETH + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 + sub hd, 8 + jl .w8_ret + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + jg .w8_loop +.w8_ret: + RET +.w16: + vbroadcasti32x4 m6, [topq] + movsldup m9, [smooth_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w16_loop: + vpbroadcastd m4, [tlq+hq-4] + pshufb m4, m9 + PAETH + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + vbroadcasti32x8 m6, [topq] + mova ym9, ym8 + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w32_loop: + vpbroadcastd m4, [tlq+hq-2] + pshufb m4, m9 + PAETH + mova 
[dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + movu m6, [topq] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w64_loop: + vpbroadcastb m4, [tlq+hq-1] + PAETH + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 +%define base r6-ipred_smooth_v_8bpc_avx512icl_table + lea r6, [ipred_smooth_v_8bpc_avx512icl_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m0, [base+pb_127_m127] + vpbroadcastd m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] + neg hq + vpbroadcastb m4, [tlq+hq] ; bottom + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastd m2, [tlq+1] + movshdup m5, [smooth_shuf] + mova ym6, [smooth_endA] + punpcklbw m2, m4 ; top, bottom + pmaddubsw m3, m2, m0 + paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; 128 * top + 129 * bottom + 128 +.w4_loop: + vbroadcasti32x4 m0, [weightsq+hq*2] + pshufb m0, m5 + pmaddubsw m0, m2, m0 + paddw m0, m3 + vpermb m0, m6, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm1, 2 + add hq, 8 + jg .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xm1, 3 + lea dstq, [dstq+strideq*4] + jl .w4_loop +.ret: + RET +.w8: + vpbroadcastq m2, [tlq+1] + movshdup m5, [smooth_shuf] + mova ym6, [smooth_endA] + punpcklbw m2, m4 + pmaddubsw m3, m2, m0 + paddw m1, m2 + paddw m3, m1 +.w8_loop: + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m5 + pmaddubsw m0, m2, m0 + paddw m0, m3 + vpermb m0, m6, m0 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +.w16: + vbroadcasti32x4 m3, [tlq+1] + movshdup m6, [smooth_shuf] + mova m7, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w16_loop: + vpbroadcastq m1, [weightsq+hq*2] + pshufb m1, m6 + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m7, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w16_loop + RET +.w32: + vbroadcasti32x8 m3, [tlq+1] + movshdup m6, [smooth_shuf] + mova m7, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w32_loop: + vpbroadcastd m1, [weightsq+hq*2] + pshufb m1, m6 + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m7, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w32_loop + RET +.w64: + movu m3, [tlq+1] + mova m6, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w64_loop: + vpbroadcastw m1, [weightsq+hq*2] + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m6, m1 + mova [dstq], m0 + add 
dstq, strideq + inc hq + jl .w64_loop + RET + +cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 +%define base r5-ipred_smooth_h_8bpc_avx512icl_table + lea r5, [ipred_smooth_h_8bpc_avx512icl_table] + mov r6d, wd + tzcnt wd, wd + vpbroadcastb m4, [tlq+r6] ; right + mov hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m5, [base+pb_127_m127] + vpbroadcastd m6, [base+pw_128] + sub tlq, hq + add wq, r5 + vpmovb2m k1, m6 + lea stride3q, [strideq*3] + jmp wq +.w4: + movsldup m3, [smooth_shuf] + vpbroadcastq m7, [smooth_weights+4*2] + mova ym8, [smooth_endA] +.w4_loop: + vpbroadcastq m0, [tlq+hq-8] + mova m2, m4 + vpshufb m2{k1}, m0, m3 ; left, right + pmaddubsw m0, m2, m5 + pmaddubsw m1, m2, m7 + paddw m2, m6 + paddw m0, m2 + paddw m0, m1 + vpermb m0, m8, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm1, 2 + sub hd, 8 + jl .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xm1, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.ret: + RET +.w8: + movsldup m3, [smooth_shuf] + vbroadcasti32x4 m7, [smooth_weights+8*2] + mova ym8, [smooth_endA] +.w8_loop: + vpbroadcastd m0, [tlq+hq-4] + mova m2, m4 + vpshufb m2{k1}, m0, m3 + pmaddubsw m0, m2, m5 + pmaddubsw m1, m2, m7 + paddw m2, m6 + paddw m0, m2 + paddw m0, m1 + vpermb m0, m8, m0 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + movsldup m7, [smooth_shuf] + vbroadcasti32x4 m8, [smooth_weights+16*2] + vbroadcasti32x4 m9, [smooth_weights+16*3] + mova m10, [smooth_endB] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + mova m3, m4 + vpshufb m3{k1}, m0, m7 + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m8 + pmaddubsw m1, m3, m9 + paddw m3, m6 + paddw m2, m3 + paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m10, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + mova m10, [smooth_endA] + vpbroadcastd ym7, [pb_1] + vbroadcasti32x8 m8, [smooth_weights+32*2] + vbroadcasti32x8 m9, [smooth_weights+32*3] + vshufi32x4 m10, m10, q3120 +.w32_loop: + vpbroadcastd m0, [tlq+hq-2] + mova m3, m4 + vpshufb m3{k1}, m0, m7 + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m8 + pmaddubsw m1, m3, m9 + paddw m3, m6 + paddw m2, m3 + paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m10, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + mova m7, [smooth_weights+64*2] + mova m8, [smooth_weights+64*3] + mova m9, [smooth_endA] +.w64_loop: + mova m3, m4 + vpbroadcastb m3{k1}, [tlq+hq-1] + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m7 + pmaddubsw m1, m3, m8 + paddw m3, m6 + paddw m2, m3 + paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m9, m1 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 +%define base r5-ipred_smooth_8bpc_avx512icl_table + lea r5, [ipred_smooth_8bpc_avx512icl_table] + mov r6d, wd + tzcnt wd, wd + mov hd, hm + vpbroadcastb m6, [tlq+r6] ; right + sub tlq, hq + movsxd wq, [r5+wq*4] + vpbroadcastd m7, [base+pb_127_m127] + vpbroadcastb m0, [tlq] ; 
bottom + vpbroadcastd m1, [base+pw_255] + add wq, r5 + lea v_weightsq, [base+smooth_weights+hq*2] + vpmovb2m k1, m1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastd m8, [tlq+hq+1] + movsldup m4, [smooth_shuf] + movshdup m5, [smooth_shuf] + vpbroadcastq m9, [smooth_weights+4*2] + mova ym11, [smooth_endA] + + punpcklbw m8, m0 ; top, bottom + pmaddubsw m10, m8, m7 + paddw m1, m8 ; 1 * top + 256 * bottom + 255 + paddw m10, m1 ; 128 * top + 129 * bottom + 255 +.w4_loop: + vpbroadcastq m1, [tlq+hq-8] + vbroadcasti32x4 m0, [v_weightsq] + add v_weightsq, 16 + mova m2, m6 + vpshufb m2{k1}, m1, m4 ; left, right + pmaddubsw m1, m2, m7 ; 127 * left - 127 * right + pshufb m0, m5 + pmaddubsw m0, m8, m0 + paddw m1, m2 ; 128 * left + 129 * right + pmaddubsw m2, m9 + paddw m0, m10 + paddw m1, m2 + pavgw m0, m1 + vpermb m0, m11, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm1, 2 + sub hd, 8 + jl .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xm1, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.ret: + RET +.w8: + vpbroadcastq m8, [tlq+hq+1] + movsldup m4, [smooth_shuf] + movshdup m5, [smooth_shuf] + vbroadcasti32x4 m9, [smooth_weights+8*2] + mova ym11, [smooth_endA] + punpcklbw m8, m0 + pmaddubsw m10, m8, m7 + paddw m1, m8 + paddw m10, m1 +.w8_loop: + vpbroadcastd m1, [tlq+hq-4] + vpbroadcastq m0, [v_weightsq] + add v_weightsq, 8 + mova m2, m6 + vpshufb m2{k1}, m1, m4 + pmaddubsw m1, m2, m7 + pshufb m0, m5 + pmaddubsw m0, m8, m0 + paddw m1, m2 + pmaddubsw m2, m9 + paddw m0, m10 + paddw m1, m2 + pavgw m0, m1 + vpermb m0, m11, m0 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + vbroadcasti32x4 m9, [tlq+hq+1] + movsldup m5, [smooth_shuf] + movshdup m10, [smooth_shuf] + vbroadcasti32x4 m11, [smooth_weights+16*2] + vbroadcasti32x4 m12, [smooth_weights+16*3] + mova m15, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m13, m8, m7 + pmaddubsw m14, m9, m7 + paddw m0, m1, m8 + paddw m1, m9 + paddw m13, m0 + paddw m14, m1 +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastq m1, [v_weightsq] + add v_weightsq, 8 + mova m4, m6 + vpshufb m4{k1}, m0, m5 + pmaddubsw m2, m4, m7 + pshufb m1, m10 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw m2, m4 + pmaddubsw m3, m4, m11 + pmaddubsw m4, m12 + paddw m0, m13 + paddw m1, m14 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m15, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + vbroadcasti32x8 m9, [tlq+hq+1] + movshdup m10, [smooth_shuf] + mova m12, [smooth_weights+32*2] + vpbroadcastd ym5, [pb_1] + mova m15, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m13, m8, m7 + pmaddubsw m14, m9, m7 + vshufi32x4 m11, m12, m12, q2020 + vshufi32x4 m12, m12, q3131 + paddw m0, m1, m8 + paddw m1, m9 + paddw m13, m0 + paddw m14, m1 +.w32_loop: + vpbroadcastd m0, [tlq+hq-2] + vpbroadcastd m1, [v_weightsq] + add v_weightsq, 4 + mova m4, m6 + vpshufb m4{k1}, m0, m5 + pmaddubsw m2, m4, m7 + pshufb m1, m10 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw 
m2, m4 + pmaddubsw m3, m4, m11 + pmaddubsw m4, m12 + paddw m0, m13 + paddw m1, m14 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m15, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + movu m9, [tlq+hq+1] + mova m11, [smooth_weights+64*2] + mova m2, [smooth_weights+64*3] + mova m14, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m12, m8, m7 + pmaddubsw m13, m9, m7 + vshufi32x4 m10, m11, m2, q2020 + vshufi32x4 m11, m2, q3131 + paddw m0, m1, m8 + paddw m1, m9 + paddw m12, m0 + paddw m13, m1 +.w64_loop: + mova m4, m6 + vpbroadcastb m4{k1}, [tlq+hq-1] + vpbroadcastw m1, [v_weightsq] + add v_weightsq, 2 + pmaddubsw m2, m4, m7 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw m2, m4 + pmaddubsw m3, m4, m10 + pmaddubsw m4, m11 + paddw m0, m12 + paddw m1, m13 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m14, m1 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3 + movifnidn wd, wm + movifnidn hd, hm + lea stride3q, [strideq*3] + cmp wd, 8 + jg .w32 + movq xmm3, [palq] + je .w8 +.w4: + movq xmm0, [idxq] + add idxq, 8 + psrlw xmm1, xmm0, 4 + punpcklbw xmm0, xmm1 + pshufb xmm0, xmm3, xmm0 + movd [dstq+strideq*0], xmm0 + pextrd [dstq+strideq*1], xmm0, 1 + pextrd [dstq+strideq*2], xmm0, 2 + pextrd [dstq+stride3q ], xmm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + movu xmm2, [idxq] + add idxq, 16 + pshufb xmm1, xmm3, xmm2 + psrlw xmm2, 4 + pshufb xmm2, xmm3, xmm2 + punpcklbw xmm0, xmm1, xmm2 + punpckhbw xmm1, xmm2 + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + movq [dstq+strideq*2], xmm1 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pmovzxdq m0, [idxq] + add idxq, 32 + vpmultishiftqb m0, m3, m0 + pshufb m0, m5, m0 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpbroadcastq m3, [pal_unpack+0] + vpbroadcastq m5, [palq] + cmp wd, 32 + jl .w16 + pmovzxbd m2, [pal_perm] + vpbroadcastq m4, [pal_unpack+8] + jg .w64 +.w32_loop: + vpermd m1, m2, [idxq] + add idxq, 64 + vpmultishiftqb m0, m3, m1 + vpmultishiftqb m1, m4, m1 + pshufb m0, m5, m0 + pshufb m1, m5, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32_loop + RET +.w64: + vpermd m1, m2, [idxq] + add idxq, 64 + vpmultishiftqb m0, m3, m1 + vpmultishiftqb m1, m4, m1 + pshufb m0, m5, m0 + pshufb m1, m5, m1 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64 + RET + +%if WIN64 + DECLARE_REG_TMP 4 +%else + DECLARE_REG_TMP 8 +%endif + +cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx +%define base r7-z_filter_t0 + lea r7, [z_filter_t0] + tzcnt wd, wm + movifnidn angled, anglem + lea t0, [dr_intra_derivative] + movsxd wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4] + inc tlq + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [t0+dxq] + lea wq, [base+ipred_z1_8bpc_avx512icl_table+wq] + movifnidn hd, hm + xor angled, 0x4ff ; d = 90 - angle + mova m14, [base+z_frac_table] + vpbroadcastd m15, [base+pw_512] + jmp wq +.w4: + 
mova m9, [pb_0to63] + pminud m8, m9, [base+pb_7] {1to16} + vpbroadcastq m7, [tlq] + pshufb m7, m8 + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + pshufb xmm0, xm7, [base+z_filter_s4] + mova xmm1, [tlq-1] + pshufb xmm1, [base+z_xpos_off2a] + vpbroadcastd xmm2, [base+pb_m4_36] + vpbroadcastq m4, [pb_0to63] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm1, xmm2 + add dxd, dxd + kxnorw k1, k1, k1 + paddw xmm0, xmm1 + pmulhrsw xm0, xmm0, xm15 + packuswb xm0, xm0 + punpcklbw ym7{k1}, ym0 + jmp .w4_main2 +.w4_no_upsample: + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+3] + vpbroadcastb xm0, r3d + vpbroadcastb xm1, angled + shr angled, 8 ; is_sm << 1 + vpcmpeqb k1, xm0, [base+z_filter_wh] + vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8] + kmovw r5d, k1 + test r5d, r5d + jz .w4_main + vbroadcasti32x4 ym0, [tlq-1] + pshufb ym0, [base+z_filter4_s1] + popcnt r5d, r5d ; filter_strength + pshufb ym1, ym7, [z_filter_s4] + pshufb ym7, [base+z_filter_s3] + vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] + vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym7, ym12 + paddw ym0, ym1 + paddw ym7, ym0 + pmulhrsw ym7, ym15 + cmp hd, 4 + je .w4_filter_end + vpbroadcastd m8, [base+pb_9] + pminub m8, m9 +.w4_filter_end: + paddb m8, m8 + vpermb m7, m8, m7 +.w4_main: + vpbroadcastq m4, [base+z_xpos_off1a] +.w4_main2: + movsldup m2, [base+z_xpos_mul] + vpbroadcastw m5, dxd + vbroadcasti32x4 m3, [base+z_xpos_bc] + lea r2, [strideq*3] + pmullw m2, m5 ; xpos + psllw m5, 5 ; dx*8 +.w4_loop: + psrlw m1, m2, 3 + pshufb m0, m2, m3 + vpermw m1, m1, m14 ; 64-frac, frac + paddsb m0, m4 ; base, base+1 + vpermb m0, m0, m7 ; top[base], top[base+1] + paddsw m2, m5 ; xpos += dx + pmaddubsw m0, m1 ; v + pmulhrsw m0, m15 + packuswb m0, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+r2 ], xm1, 1 + sub hd, 8 + jl .w4_end + vextracti32x4 xm1, m0, 2 ; top[max_base_x] + lea dstq, [dstq+strideq*4] + vextracti32x4 xm0, m0, 3 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r2 ], xm0, 1 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.w8_filter: + mova ym0, [base+z_filter_s1] + popcnt r5d, r5d + vbroadcasti32x4 ym1, [base+z_filter_s2] + vbroadcasti32x4 ym3, [base+z_filter_s3] + vbroadcasti32x4 ym4, [base+z_filter_s4] + vpermi2b ym0, ym7, ym2 ; al bl + mova ym5, [base+z_filter_s5] + pshufb ym1, ym7, ym1 ; ah bh + vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] + pshufb ym3, ym7, ym3 ; cl ch + vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] + pshufb ym4, ym7, ym4 ; el dl + vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2] + vpermb ym5, ym5, ym7 ; eh dh + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym2, ym3, ym12 + pmaddubsw ym3, ym13 + pmaddubsw ym4, ym11 + pmaddubsw ym5, ym11 + paddw ym0, ym2 + paddw ym1, ym3 + paddw ym0, ym4 + paddw ym1, ym5 + pmulhrsw ym0, ym15 + pmulhrsw ym1, ym15 + packuswb ym0, ym1 + ret +.w8: + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + lea r3d, [hq-1] + mova xm1, [base+z_filter_s4] + vpbroadcastb xm2, r3d + mova xm7, [tlq-1] + vinserti32x4 ym7, [tlq+7], 1 + vbroadcasti32x4 ym0, [base+z_xpos_off1a] + vpbroadcastd ym3, [base+pb_m4_36] + pminub xm2, xm1 + 
pshufb ym0, ym7, ym0 + vinserti32x4 ym1, xm2, 1 + psrldq ym7, 1 + pshufb ym1, ym7, ym1 + pmaddubsw ym0, ym3 + pmaddubsw ym1, ym3 + vbroadcasti32x4 m8, [pb_0to63] + add dxd, dxd + paddw ym0, ym1 + pmulhrsw ym0, ym15 + packuswb ym0, ym0 + punpcklbw ym7, ym0 + jmp .w8_main2 +.w8_no_upsample: + lea r3d, [hq+7] + mova m9, [pb_0to63] + vpbroadcastb ym0, r3d + and r3d, 7 + vbroadcasti32x4 m7, [tlq] + or r3d, 8 ; imin(h+7, 15) + vpbroadcastb m8, r3d + pminub m8, m9 + pshufb m7, m8 + test angled, 0x400 + jnz .w8_main + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w8_main + vpbroadcastd ym2, [tlq-4] + call .w8_filter + cmp hd, 8 + jle .w8_filter_end + vpbroadcastd m8, [base+pb_17] + add r3d, 2 + pminub m8, m9 +.w8_filter_end: + vpermb m7, m8, m0 +.w8_main: + vbroadcasti32x4 m8, [base+z_xpos_off1a] +.w8_main2: + movsldup m4, [base+z_xpos_mul] + vpbroadcastw m9, dxd + shl r3d, 6 + vpbroadcastd m5, [base+z_xpos_bc+8*0] + pmullw m4, m9 ; xpos + vpbroadcastd m6, [base+z_xpos_bc+8*1] + sub r3d, dxd + shl dxd, 3 + psllw m9, 5 ; dx*8 + lea r2, [strideq*3] +.w8_loop: + psrlw m3, m4, 3 + pshufb m0, m4, m5 + pshufb m1, m4, m6 + vpermw m3, m3, m14 + paddsb m0, m8 + paddsb m1, m8 + vpermb m0, m0, m7 + vpermb m1, m1, m7 + paddsw m4, m9 + punpcklqdq m2, m3, m3 + pmaddubsw m0, m2 + punpckhqdq m3, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + sub hd, 8 + jl .w8_end + vextracti32x8 ym0, m0, 1 + lea dstq, [dstq+strideq*4] + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + jz .w8_end + lea dstq, [dstq+strideq*4] + sub r3d, dxd + jg .w8_loop + vextracti32x4 xm7, m7, 3 +.w8_end_loop: + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + movq [dstq+strideq*2], xm7 + movq [dstq+r2 ], xm7 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_end_loop +.w8_end: + RET +.w16_filter: + mova m0, [base+z_filter_s1] + popcnt r5d, r5d + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + mova m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2] + vpermb m5, m5, m7 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m2, m3, m12 + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m0, m2 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + ret +.w16: + lea r3d, [hq+15] + mova m9, [pb_0to63] + vpbroadcastb ym0, r3d + and r3d, 15 + movu ym7, [tlq] + or r3d, 16 ; imin(h+15, 31) + vpbroadcastb m8, r3d + pminub m8, m9 + vpermb m7, m8, m7 + test angled, 0x400 + jnz .w16_main + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w16_main + vpbroadcastd m2, [tlq-4] + call .w16_filter + cmp hd, 16 + jle .w16_filter_end + vpbroadcastd m8, [base+pb_33] + add r3d, 2 + pminub m8, m9 +.w16_filter_end: + vpermb m7, m8, m0 +.w16_main: + 
movshdup m3, [base+z_xpos_mul] + vpbroadcastw m8, dxd + shl r3d, 6 + vpbroadcastd m4, [base+z_xpos_bc] + pmullw m3, m8 ; xpos + vbroadcasti32x4 m5, [base+z_xpos_off1a] + sub r3d, dxd + shl dxd, 2 + vbroadcasti32x4 m6, [base+z_xpos_off1b] + psllw m8, 4 ; dx*4 + lea r2, [strideq*3] +.w16_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermb m0, m0, m7 + vpermb m1, m1, m7 + paddsw m3, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + sub hd, 4 + jz .w16_end + lea dstq, [dstq+strideq*4] + sub r3d, dxd + jg .w16_loop + vextracti32x4 xm7, m7, 3 +.w16_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + mova [dstq+strideq*2], xm7 + mova [dstq+r2 ], xm7 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_end_loop +.w16_end: + RET +.w32_filter: + mova m0, [base+z_filter_s1] + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + mova m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermi2b m5, m7, m8 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m2, m3, m12 + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m0, m2 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m7, m0, m1 + ret +.w32: + lea r3d, [hq+31] + vpbroadcastb m9, r3d + and r3d, 31 + pminub m10, m9, [pb_0to63] + or r3d, 32 ; imin(h+31, 63) + vpermb m7, m10, [tlq] + vpbroadcastb m8, [tlq+r3] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + vpbroadcastd m2, [tlq-4] + call .w32_filter + cmp hd, 64 + je .w32_h64_filter_end + vpermb m8, m9, m7 + vpermb m7, m10, m7 + jmp .w32_main +.w32_h64_filter_end: ; edge case for 32x64 + movd xmm0, [tlq+r3-1] + movd xmm1, [base+pb_8_56_0_0] + add r3d, 2 + pmaddubsw xmm0, xmm1 + vptestmw k1, xmm1, xmm1 ; 0x01 + pmulhrsw xm0, xmm0, xm15 + vmovdqu8 m8{k1}, m0 +.w32_main: + rorx r2d, dxd, 30 + vpbroadcastd m4, [base+z_xpos_bc] + vpbroadcastw m3, r2d + vbroadcasti32x8 m5, [base+z_xpos_off2a] + shl r3d, 6 + vbroadcasti32x8 m6, [base+z_xpos_off2b] + sub r3d, dxd + paddw m9, m3, m3 + add dxd, dxd + vinserti32x8 m3, ym9, 1 +.w32_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermi2b m0, m7, m8 + vpermi2b m1, m7, m8 + paddsw m3, m9 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w32_end + lea dstq, [dstq+strideq*2] + sub r3d, dxd + jg .w32_loop + punpckhqdq ym8, ym8 +.w32_end_loop: + mova [dstq+strideq*0], ym8 + mova [dstq+strideq*1], ym8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_end_loop +.w32_end: + RET +.w64_filter: + vbroadcasti32x4 m3, [base+z_filter_s2] + mova m1, [base+z_filter_s1] + pshufb m0, m3 ; al bl + vpermi2b m1, m7, m2 + vbroadcasti32x4 m4, [base+z_filter_s4] + pshufb m6, m8, m4 ; el dl + pshufb m9, m7, m4 + pminub m10, m13, [base+z_filter_s5] + pshufb m2, m8, m3 ; ah bh + pshufb m3, m7, m3 + vbroadcasti32x4 m5, [base+z_filter_s3] + vpermb m10, m10, m8 ; eh dh 
+ pshufb m11, m4 + vpbroadcastd m4, [base+z_filter_k+4*2+12*0] + pshufb m8, m5 ; cl ch + pshufb m7, m5 + vpbroadcastd m5, [base+z_filter_k+4*2+12*1] + REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11 + pmaddubsw m4, m8, m5 + pmaddubsw m5, m7, m5 + paddw m0, m6 + vpbroadcastd m6, [base+z_filter_k+4*2+12*2] + paddw m1, m9 + pmaddubsw m7, m6 + pmaddubsw m8, m6 + paddw m2, m10 + paddw m3, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m8 + paddw m3, m7 + REPX {pmulhrsw x, m15}, m0, m2, m1, m3 + packuswb m0, m2 + packuswb m7, m1, m3 + vpermb m8, m12, m0 + ret +.w64: + lea r3d, [hq-1] + movu m7, [tlq+64*0] + vpbroadcastb m13, r3d + pminub m12, m13, [pb_0to63] + or r3d, 64 + vpermb m8, m12, [tlq+64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + movu m0, [tlq+56] + vpbroadcastd m2, [tlq-4] + movu m11, [tlq+8] + call .w64_filter +.w64_main: + rorx r2d, dxd, 30 + vpbroadcastd m4, [base+z_xpos_bc] + vpbroadcastw m3, r2d + mova m5, [base+z_xpos_off2a] + shl r3d, 6 + mova m6, [base+z_xpos_off2b] + sub r3d, dxd + mova m9, m3 +.w64_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermi2b m0, m7, m8 + vpermi2b m1, m7, m8 + paddsw m3, m9 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq], m0 + dec hd + jz .w64_end + add dstq, strideq + sub r3d, dxd + jg .w64_loop + vpermb m8, m13, m8 +.w64_end_loop: + mova [dstq], m8 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + +cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy + tzcnt wd, wm + movifnidn angled, anglem + lea dxq, [dr_intra_derivative-90] + movzx dyd, angleb + xor angled, 0x400 + mov r7, dxq + sub dxq, dyq + movifnidn hd, hm + and dyd, ~1 + and dxq, ~1 + movzx dyd, word [r7+dyq] ; angle - 90 + lea r7, [z_filter_t0] + movzx dxd, word [dxq+270] ; 180 - angle + movsxd wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4] + mova m8, [base+pb_63to0] + neg dyd + vpermb m8, m8, [tlq-64] ; left + lea wq, [base+ipred_z2_8bpc_avx512icl_table+wq] + mova m14, [base+z_frac_table] + inc tlq + vpbroadcastd m15, [base+pw_512] + neg dxd + jmp wq +.w4: + movd xm7, [tlq] + vpbroadcastq m10, [base+z_xpos_off2a] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm2, [base+pb_4] + sub angled, 1075 ; angle - 53 + call .upsample_above + lea r3d, [hq+3] + vpbroadcastq m10, [pb_0to63+1] + punpcklbw xm7, xm0, xm7 + call .filter_strength + jmp .w4_filter_left +.w4_upsample_left: + call .upsample_left + movsldup m16, [base+z_ypos_off3] + vpbroadcastd m9, [base+pb_16] + punpcklbw xm8, xm0, xm8 + jmp .w4_main2 +.w4_no_upsample_above: + lea r3d, [hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + vpbroadcastd xm5, [base+pb_3] + call .filter_top_w16 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + call .filter_left_h16 +.w4_main: + movsldup m16, [base+z_ypos_off1] + vpbroadcastd m9, [base+pb_8] +.w4_main2: + vpbroadcastq m3, [base+z_ypos_mul1a] + vpbroadcastw m0, dyd + movsldup m1, [base+z_xpos_mul] + 
vpbroadcastw m5, dxd + vinserti32x4 m7, [tlq-16], 3 + vinserti32x4 m8, [tlq-16], 3 + pmullw m3, m0 + vbroadcasti32x4 m2, [base+z_xpos_bc] + pmullw m1, m5 ; xpos0..3 + psllw m5, 5 ; dx*8 + psraw m4, m3, 6 + psrlw m3, 1 + packsswb m4, m4 + vpermw m3, m3, m14 ; 64-frac, frac + punpcklbw m4, m4 + lea r2, [strideq*3] + paddb m4, m16 ; base, base+1 +.w4_loop: + pshufb m16, m1, m2 + psrlw m0, m1, 3 + paddb m16, m10 + vpermw m0, m0, m14 + vpmovw2m k1, m16 ; base_x < 0 + vpermb m16, m16, m7 + pmaddubsw m16, m0 + vpermb m0, m4, m8 + pmaddubsw m16{k1}, m0, m3 + pmulhrsw m16, m15 + vpmovwb ym16, m16 + movd [dstq+strideq*0], xm16 + pextrd [dstq+strideq*1], xm16, 1 + pextrd [dstq+strideq*2], xm16, 2 + pextrd [dstq+r2 ], xm16, 3 + sub hd, 8 + jl .w4_end + paddsw m1, m5 + vextracti128 xm16, ym16, 1 + lea dstq, [dstq+strideq*4] + paddb m4, m9 + movd [dstq+strideq*0], xm16 + pextrd [dstq+strideq*1], xm16, 1 + pextrd [dstq+strideq*2], xm16, 2 + pextrd [dstq+r2 ], xm16, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample_above: ; w4/w8 + mova xm0, [tlq-1] + xor angled, 0x7f ; 180 - angle + add dxd, dxd + jmp .upsample +.upsample_left: ; h4/h8 + palignr xm0, xm8, [tlq-16], 15 + vpbroadcastb xm2, hd + add dyd, dyd +.upsample: + pshufb xm1, xm0, [base+z_filter4_s1] + pminub xm2, [base+z_filter_s4] + vpbroadcastd xm3, [base+pb_m4_36] + pshufb xm0, xm2 + pmaddubsw xm1, xm3 + pmaddubsw xm0, xm3 + paddw xm0, xm1 + pmulhrsw xm0, xm15 + packuswb xm0, xm0 + ret +.filter_strength: + vpbroadcastb ym16, r3d + mov r3d, angled + vpbroadcastd m2, [tlq-4] + vpbroadcastb ym17, angled + shr r3d, 8 + vpcmpeqb k2, ym16, [base+z_filter_wh] + mova xm16, [base+z_filter_t0+r3*8] + vpcmpgtb k1{k2}, ym17, ym16 + mova m9, [pb_0to63] + kmovd r3d, k1 + ret +.w8: + movq xm7, [tlq] + vbroadcasti32x4 m10, [base+z_xpos_off2a] + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + vpbroadcastd xm2, [base+pb_8] + sub angled, 53 ; angle - 53 + call .upsample_above + lea r3d, [hq+7] + vbroadcasti32x4 m10, [pb_0to63+1] + punpcklbw xm7, xm0, xm7 + call .filter_strength + jmp .w8_filter_left +.w8_upsample_left: + call .upsample_left + movshdup m16, [base+z_ypos_off3] + vpbroadcastd m9, [base+pb_8] + punpcklbw xm8, xm0, xm8 + jmp .w8_main2 +.w8_no_upsample_above: + lea r3d, [hq+7] + sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + vpbroadcastd xm5, [base+pb_7] + call .filter_top_w16 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 +.w8_filter_left: + test r3d, r3d + jz .w8_main + cmp hd, 32 + je .w8_filter_left_h32 + popcnt r3d, r3d + call .filter_left_h16 + jmp .w8_main +.w8_filter_left_h32: + call .filter_left_h64 +.w8_main: + movshdup m16, [base+z_ypos_off2] + vpbroadcastd m9, [base+pb_4] +.w8_main2: + vbroadcasti32x4 m3, [base+z_ypos_mul1a] + vpbroadcastw m0, dyd + movshdup m1, [base+z_xpos_mul] + vpbroadcastw m5, dxd + vinserti32x4 m7, [tlq-16], 3 + vinserti32x4 m8, [tlq-16], 3 + pmullw m3, m0 + vpbroadcastd m2, [base+pb_1] + pmullw m1, m5 ; xpos0..3 + psllw m5, 4 ; dx*4 + psraw m4, m3, 6 + psrlw m3, 1 + packsswb m4, m4 + vpermw m3, m3, m14 ; 64-frac, frac + lea r3d, [dxq+(8<<6)] + paddsb m4, m16 + shl dxd, 2 + paddsb m0, m4, m2 + lea r2, [strideq*3] + punpcklbw m4, m0 ; base, base+1 +.w8_loop: + pshufb m16, m1, m2 
+ psrlw m0, m1, 3 + paddb m16, m10 + vpermw m0, m0, m14 + vpmovw2m k1, m16 ; base_x < 0 + vpermb m16, m16, m7 + pmaddubsw m16, m0 + vpermb m0, m4, m8 + pmaddubsw m16{k1}, m0, m3 + pmulhrsw m16, m15 + vpmovwb ym16, m16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + sub hd, 4 + jz .w8_end + paddw m1, m5 + lea dstq, [dstq+strideq*4] + paddb m4, m9 + add r3d, dxd + jge .w8_loop +.w8_leftonly_loop: + vpermb m16, m4, m8 + pmaddubsw m16, m3 + paddb m4, m9 + pmulhrsw m16, m15 + vpmovwb ym16, m16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_leftonly_loop +.w8_end: + RET +.filter_top_w16: + mova xm0, [base+z_filter_s1] + popcnt r3d, r3d + pminub xm4, xm5, [base+z_filter_s4] + vpermi2b xm0, xm7, xm2 + pminub xm5, [base+z_filter_s5] + pshufb xm1, xm7, [base+z_filter_s2] + vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] + pshufb xm3, xm7, [base+z_filter_s3] + vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] + pshufb xm4, xm7, xm4 + vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] + pshufb xm5, xm7, xm5 + pmaddubsw xm0, xm11 + pmaddubsw xm1, xm11 + pmaddubsw xm6, xm3, xm12 + vpbroadcastd xm12, r7m ; max_width + pmaddubsw xm3, xm13 + pmaddubsw xm4, xm11 + pmaddubsw xm5, xm11 + packssdw xm12, xm12 + paddw xm0, xm6 + paddw xm1, xm3 + paddw xm0, xm4 + paddw xm1, xm5 + packsswb xm12, xm12 + pmulhrsw xm0, xm15 + pmulhrsw xm1, xm15 + vpcmpgtb k1, xm12, xm9 ; x < max_width + packuswb xm7{k1}, xm0, xm1 + ret +.filter_left_h16: + lea r5d, [hq-1] + mova xm0, [base+z_filter_s1] + vpbroadcastb xm5, r5d + vpermi2b xm0, xm8, xm2 + pminub xm4, xm5, [base+z_filter_s4] + pshufb xm1, xm8, [base+z_filter_s2] + pminub xm5, [base+z_filter_s5] + pshufb xm3, xm8, [base+z_filter_s3] + vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] + pshufb xm4, xm8, xm4 + vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] + pshufb xm5, xm8, xm5 + vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] + pmaddubsw xm0, xm11 + pmaddubsw xm1, xm11 + pmaddubsw xm6, xm3, xm12 + vpbroadcastd xm12, r8m ; max_height + pmaddubsw xm3, xm13 + pmaddubsw xm4, xm11 + pmaddubsw xm5, xm11 + packssdw xm12, xm12 + paddw xm0, xm6 + paddw xm1, xm3 + paddw xm0, xm4 + paddw xm1, xm5 + packsswb xm12, xm12 + pmulhrsw xm0, xm15 + pmulhrsw xm1, xm15 + vpcmpgtb k1, xm12, xm9 ; y < max_height + packuswb xm8{k1}, xm0, xm1 + ret +.w16: + movu xm7, [tlq] ; top + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + vpbroadcastd xm5, [base+pb_15] + call .filter_top_w16 +.w16_no_filter_above: + cmp hd, 16 + jg .w16_filter_left_h64 + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 + test r3d, r3d + jz .w16_main + popcnt r3d, r3d + call .filter_left_h16 + jmp .w16_main +.w16_filter_left_h64: + call .filter_left_h64 +.w16_main: + vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 
8 + vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15 + vpbroadcastw m0, dyd + vinserti32x4 m7, [tlq-16], 3 + vpbroadcastd m2, [base+pb_1] + vpbroadcastw m12, dxd + movshdup m1, [base+z_xpos_mul] + pmullw m6, m0 + vbroadcasti32x4 m3, [base+z_xpos_off2a] + pmullw m5, m0 + vbroadcasti32x4 m4, [base+z_xpos_off2b] + pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3 + vpbroadcastd m9, [base+pb_4] + psllw m12, 4 ; dx*4 + movshdup m16, [base+z_ypos_off2] + psrlw m10, m6, 1 + psrlw m11, m5, 1 + vpermw m10, m10, m14 ; 64-frac, frac + psraw m6, 6 + vpermw m11, m11, m14 + psraw m5, 6 + mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft + packsswb m6, m5 + mov r3d, 1<<6 + paddsb m6, m16 + sub r5d, dxd ; left-only threshold + paddsb m0, m6, m2 + shl dxd, 2 + punpcklbw m5, m6, m0 ; base, base+1 + lea r2, [strideq*3] + punpckhbw m6, m0 +.w16_loop: + pshufb m17, m1, m2 + psrlw m0, m1, 3 + paddb m16, m3, m17 + vpermw m0, m0, m14 + paddb m17, m4 + vpmovw2m k1, m16 + vpermb m16, m16, m7 + vpmovw2m k2, m17 + vpermb m17, m17, m7 + pmaddubsw m16, m0 + pmaddubsw m17, m0 + add r3d, dxd + jge .w16_toponly + mova m0, m8 + vpermt2b m0, m5, m7 + pmaddubsw m16{k1}, m0, m10 + mova m0, m8 + vpermt2b m0, m6, m7 + pmaddubsw m17{k2}, m0, m11 +.w16_toponly: + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + sub hd, 4 + jz .w16_end + paddw m1, m12 + lea dstq, [dstq+strideq*4] + paddb m5, m9 + paddb m6, m9 + cmp r3d, r5d + jge .w16_loop +.w16_leftonly_loop: + vpermb m16, m5, m8 + vpermb m17, m6, m8 + pmaddubsw m16, m10 + pmaddubsw m17, m11 + paddb m5, m9 + paddb m6, m9 + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_leftonly_loop +.w16_end: + RET +.w32: + movu ym7, [tlq] + test angled, 0x400 + jnz .w32_main + vpbroadcastd m2, [tlq-4] + mova ym0, [base+z_filter_s1] + vbroadcasti32x4 ym1, [base+z_filter_s2] + vbroadcasti32x4 ym3, [base+z_filter_s3] + vbroadcasti32x4 ym4, [base+z_filter_s4] + vpermi2b ym0, ym7, ym2 ; al bl + vpbroadcastd ym5, [base+pb_31] + pminub ym5, [base+z_filter_s5] + pshufb ym1, ym7, ym1 ; ah bh + vpbroadcastd ym11, [base+z_filter_k+4*2+12*0] + pshufb ym3, ym7, ym3 ; cl ch + vpbroadcastd ym12, [base+z_filter_k+4*2+12*1] + pshufb ym4, ym7, ym4 ; el dl + vpbroadcastd ym13, [base+z_filter_k+4*2+12*2] + vpermb ym5, ym5, ym7 ; eh dh + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym6, ym3, ym12 + vpbroadcastd ym12, r6m + pmaddubsw ym3, ym13 + pmaddubsw ym4, ym11 + pmaddubsw ym5, ym11 + mova m9, [pb_0to63] + packssdw ym12, ym12 + paddw ym0, ym6 + paddw ym1, ym3 + paddw ym0, ym4 + paddw ym1, ym5 + packsswb ym12, ym12 + pmulhrsw ym0, ym15 + pmulhrsw ym1, ym15 + vpcmpgtb k1, ym12, ym9 ; x < max_width + packuswb ym7{k1}, ym0, ym1 + cmp hd, 16 + jg .w32_filter_h64 + mov r3d, 3 + call .filter_left_h16 + jmp .w32_main +.w32_filter_h64: + call .filter_left_h64 +.w32_main: + vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 
8 + vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15 + vpbroadcastw m0, dyd + vinserti32x4 m7, [tlq-16], 3 + rorx r2q, dxq, 62 ; dx << 2 + vpbroadcastd m2, [base+pb_1] + vpbroadcastw m1, r2d + pmullw m6, m0 + vbroadcasti32x8 m3, [base+z_xpos_off2a] + pmullw m5, m0 + vbroadcasti32x8 m4, [base+z_xpos_off2b] + mova ym0, ym1 + paddw m12, m1, m1 + vpbroadcastd m9, [base+pb_2] + paddw m1, m0 ; xpos1 xpos0 + mova ym0, ym2 + psrlw m10, m6, 1 + psrlw m11, m5, 1 + vpermw m10, m10, m14 ; 64-frac, frac + psraw m6, 6 + vpermw m11, m11, m14 + psraw m5, 6 + mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft + packsswb m6, m5 + mov r3d, 1<<6 + paddsb m6, m0 + sub r5d, dxd ; left-only threshold + paddsb m0, m6, m2 + add dxd, dxd + punpcklbw m5, m6, m0 ; base, base+1 + punpckhbw m6, m0 +.w32_loop: + pshufb m17, m1, m2 + psrlw m0, m1, 3 + paddb m16, m3, m17 + vpermw m0, m0, m14 + paddb m17, m4 + vpmovw2m k1, m16 + vpermb m16, m16, m7 + vpmovw2m k2, m17 + vpermb m17, m17, m7 + pmaddubsw m16, m0 + pmaddubsw m17, m0 + add r3d, dxd + jge .w32_toponly + mova m0, m8 + vpermt2b m0, m5, m7 + pmaddubsw m16{k1}, m0, m10 + mova m0, m8 + vpermt2b m0, m6, m7 + pmaddubsw m17{k2}, m0, m11 +.w32_toponly: + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + vextracti32x8 [dstq+strideq*0], m16, 1 + mova [dstq+strideq*1], ym16 + sub hd, 2 + jz .w32_end + paddw m1, m12 + lea dstq, [dstq+strideq*2] + paddb m5, m9 + paddb m6, m9 + cmp r3d, r5d + jge .w32_loop +.w32_leftonly_loop: + vpermb m16, m5, m8 + vpermb m17, m6, m8 + pmaddubsw m16, m10 + pmaddubsw m17, m11 + paddb m5, m9 + paddb m6, m9 + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + vextracti32x8 [dstq+strideq*0], m16, 1 + mova [dstq+strideq*1], ym16 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_leftonly_loop +.w32_end: + RET +.filter_left_h64: + mova m0, [base+z_filter_s1] + lea r3d, [hq-1] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpbroadcastb m5, r3d + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vpermi2b m0, m8, m2 ; al bl + pminub m5, [base+z_filter_s5] + pshufb m1, m8, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m8, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m8, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermb m5, m5, m8 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m6, m3, m12 + vpbroadcastd m12, r8m ; max_height + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + packssdw m12, m12 + paddw m0, m6 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + packsswb m12, m12 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + vpcmpgtb k1, m12, m9 ; y < max_height + packuswb m8{k1}, m0, m1 + ret +.w64: + movu m7, [tlq] + test angled, 0x400 + jnz .w64_main + vpbroadcastd m2, [tlq-4] + mova m0, [base+z_filter_s1] + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + vpbroadcastd m5, [base+pb_63] + pminub m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermb m5, m5, m7 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m6, m3, m12 + vpbroadcastd m12, r6m + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + mova m9, [pb_0to63] + packssdw m12, m12 + paddw m0, m6 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + packsswb 
m12, m12 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + vpcmpgtb k1, m12, m9 ; x < max_width + packuswb m7{k1}, m0, m1 + call .filter_left_h64 ; always filter the full 64 pixels for simplicity +.w64_main: + vpbroadcastw m5, dyd + vpbroadcastd m9, [tlq-4] + rorx r2q, dxq, 62 ; dx << 2 + pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such + pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge + vpbroadcastw m1, r2d ; xpos + mova m3, [base+z_xpos_off2a] + mova m4, [base+z_xpos_off2b] + mova m12, m1 + vpbroadcastd m2, [base+pb_1] + psrlw m10, m6, 1 + psrlw m11, m5, 1 + vpermw m10, m10, m14 ; 64-frac, frac + psraw m6, 6 + vpermw m11, m11, m14 + psraw m5, 6 + mov r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft + packsswb m6, m5 + mov r3d, 1<<6 + paddsb m0, m6, m2 + sub r5d, dxd ; left-only threshold + punpcklbw m5, m6, m0 ; base, base+1 + punpckhbw m6, m0 +.w64_loop: + pshufb m17, m1, m2 + psrlw m0, m1, 3 + paddb m16, m3, m17 + vpermw m0, m0, m14 + paddb m17, m4 + vpmovw2m k1, m16 ; base_x < 0 + vpermi2b m16, m7, m9 + vpmovw2m k2, m17 + vpermi2b m17, m7, m9 + pmaddubsw m16, m0 + pmaddubsw m17, m0 + add r3d, dxd + jge .w64_toponly + mova m0, m8 + vpermt2b m0, m5, m9 + pmaddubsw m16{k1}, m0, m10 + mova m0, m8 + vpermt2b m0, m6, m9 + pmaddubsw m17{k2}, m0, m11 +.w64_toponly: + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq], m16 + dec hd + jz .w64_end + paddw m1, m12 + add dstq, strideq + paddb m5, m2 + paddb m6, m2 + cmp r3d, r5d + jge .w64_loop +.w64_leftonly_loop: + vpermb m16, m5, m8 + vpermb m17, m6, m8 + pmaddubsw m16, m10 + pmaddubsw m17, m11 + paddb m5, m2 + paddb m6, m2 + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq], m16 + add dstq, strideq + dec hd + jg .w64_leftonly_loop +.w64_end: + RET + +cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy + lea r7, [z_filter_t0] + tzcnt wd, wm + movifnidn angled, anglem + lea t0, [dr_intra_derivative+45*2-1] + movsxd wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4] + sub angled, 180 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + mova m0, [base+pb_63to0] + movzx dyd, word [t0+dyq] + lea wq, [base+ipred_z3_8bpc_avx512icl_table+wq] + movifnidn hd, hm + mova m14, [base+z_frac_table] + shl dyd, 6 + vpbroadcastd m15, [base+pw_512] + jmp wq +.w4: + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + lea r3d, [hq+4] + call .upsample + movshdup m1, [base+z_ypos_off1] + vpbroadcastd m6, [base+pb_16] + jmp .w4_main2 +.w4_no_upsample: + lea r3d, [hq+3] + vpbroadcastb m9, r3d + vpxord m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4) + pmaxub m1, m0 + vpermb m7, m1, [tlq-64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + vpbroadcastb xm1, angled + shr angled, 8 + vpcmpeqb k1, xm9, [base+z_filter_wh] + vpbroadcastd m2, [tlq-3] + vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8] + kmovw r5d, k1 + test r5d, r5d + jz .w4_main + pminub m9, [pb_0to63] + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter + vpermb m7, m9, m0 +.w4_main: + movsldup m1, [base+z_ypos_off1] + vpbroadcastd m6, [base+pb_8] +.w4_main2: + vpbroadcastw m0, dyd + vpbroadcastq m2, [base+z_ypos_mul2a] ; 1..4 + pmulhuw m2, m0 ; ypos >> 1 + lea r2, [strideq*3] + vpermw m3, m2, m14 ; 64-frac, frac + psrlw m2, 5 + packsswb m2, m2 + punpcklbw m2, m2 + paddsb m2, m1 ; base, base+1 +.w4_loop: + vpermb m0, m2, m7 + pmaddubsw m0, m3 + 
paddsb m2, m6 + pmulhrsw m0, m15 + vpmovwb ym0, m0 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + sub hd, 8 + jl .w4_end + vextracti32x4 xm0, ym0, 1 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample: + xor r3d, 31 ; 31 - (h + imin(w, h)) + vbroadcasti32x4 ym0, [base+z_xpos_off2a] + vpbroadcastb ym7, r3d + pmaxub ym7, [base+z3_upsample] + vbroadcasti32x4 ym1, [base+z_filter_s4] + vpermb ym7, ym7, [tlq-31] + vpbroadcastd ym2, [base+pb_m4_36] + pshufb ym0, ym7, ym0 + psrldq ym7, 1 + pshufb ym1, ym7, ym1 + pmaddubsw ym0, ym2 + pmaddubsw ym1, ym2 + add dyd, dyd + paddw ym0, ym1 + pmulhrsw ym0, ym15 + packuswb ym0, ym0 + punpcklbw ym7, ym0 + ret +.w8: + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + lea r3d, [hq*2] + call .upsample + pshufd m1, [base+z_ypos_off1], q0000 + vpbroadcastd m6, [base+pb_8] + jmp .w8_main2 +.w8_no_upsample: + mov r3d, 8 + cmp hd, 4 + cmove r3d, hd + lea r3d, [r3+hq-1] + xor r3d, 63 ; 63 - (h + imin(w, h)) + vpbroadcastb m1, wd + pmaxub m1, m0 + vpermb m7, m1, [tlq-64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w8_main + lea r3d, [hq+7] + call .filter_strength + test r5d, r5d + jz .w8_main + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter + vpermb m7, m10, m0 +.w8_main: + movsldup m1, [base+z_ypos_off2] + vpbroadcastd m6, [base+pb_4] +.w8_main2: + vpbroadcastw m0, dyd + vbroadcasti32x4 m2, [base+z_ypos_mul2a] ; 1..8 + pmulhuw m2, m0 ; ypos >> 1 + lea r2, [strideq*3] + vpermw m3, m2, m14 ; 64-frac, frac + psrlw m2, 5 + packsswb m2, m2 + punpcklbw m2, m2 + paddsb m2, m1 ; base, base+1 +.w8_loop: + vpermb m0, m2, m7 + pmaddubsw m0, m3 + paddsb m2, m6 + pmulhrsw m0, m15 + vpmovwb ym0, m0 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.filter_strength: + vpbroadcastd m2, [tlq-3] +.filter_strength2: + vpbroadcastb m9, r3d + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym9, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + pminub m10, m9, [pb_0to63] + kmovd r5d, k1 + ret +.w16_load: + cmp r3d, hd + cmovae r3d, hd + add r3d, hd + mova m7, [tlq-64*1] + neg r3d ; -(h + imin(w, h)) + and r3d, 63 + vpbroadcastb m1, r3d + pmaxub m2, m0, m1 + cmp hd, 64 + je .w16_load_h64 + vpermb m8, m1, m7 + vpermb m7, m2, m7 + ret +.w16_load_h64: + vpermb m7, m0, m7 + vpermb m8, m2, [tlq-64*2] + ret +.w16: + mov r3d, 16 + call .w16_load + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w16_main + vpbroadcastd m2, [tlq-3] + cmp hd, 64 + je .w16_filter64 + lea r3d, [hq+15] + call .filter_strength2 + test r5d, r5d + jz .w16_main + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter + pminub m10, m9, [pb_0to63] + vpermb m8, m9, m0 + vpermb m7, m10, m0 + jmp .w16_main +.w16_filter64: + vpbroadcastd m13, [base+pb_15] + valignq m0, m8, m7, 7 + pminub m12, m13, [pb_0to63] + valignq m11, m8, m7, 1 + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter +.w16_main: + vbroadcasti32x4 m3, [base+z_ypos_mul2a] ; 1.. 
8 + vbroadcasti32x4 m2, [base+z_ypos_mul2b] ; 9..15 + vpbroadcastw m0, dyd + vpbroadcastd m6, [base+pb_4] + pmulhuw m3, m0 ; ypos >> 1 + pmulhuw m2, m0 + movshdup m0, [base+z_ypos_off2] + lea r2, [strideq*3] + vpbroadcastd m1, [base+pb_1] + vpermw m4, m3, m14 ; 64-frac, frac + psrlw m3, 5 + vpermw m5, m2, m14 + psrlw m2, 5 + packsswb m3, m2 + paddsb m3, m0 + paddsb m1, m3 + punpcklbw m2, m3, m1 ; base, base+1 + punpckhbw m3, m1 +.w16_loop: +%macro Z3_PERM2 0 + mova m0, m7 + vpermt2b m0, m2, m8 + mova m1, m7 + vpermt2b m1, m3, m8 + pmaddubsw m0, m4 + pmaddubsw m1, m5 + paddsb m2, m6 + paddsb m3, m6 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 +%endmacro + Z3_PERM2 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + mov r3d, 32 + call .w16_load + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + vpbroadcastd m2, [tlq-3] + cmp hd, 64 + je .w32_filter64 + lea r3d, [hq+31] + vpbroadcastb m9, r3d + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter + vpermb m8, m9, m7 + jmp .w32_main +.w32_filter64: + vpbroadcastd m13, [base+pb_31] + valignq m0, m8, m7, 7 + pminub m12, m13, [pb_0to63] + valignq m11, m8, m7, 1 + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter +.w32_main: + vbroadcasti32x8 m3, [base+z_ypos_mul2a] ; 1.. 8 + vbroadcasti32x8 m2, [base+z_ypos_mul2b] ; 9..15 + vpbroadcastw m0, dyd + vpbroadcastd m1, [base+pb_1] + pmulhuw m3, m0 ; ypos >> 1 + pmulhuw m2, m0 + vpbroadcastd m6, [base+pb_2] + mova ym0, ym1 + vpermw m4, m3, m14 ; 64-frac, frac + psrlw m3, 5 + vpermw m5, m2, m14 + psrlw m2, 5 + packsswb m3, m2 + paddsb m3, m0 + paddsb m1, m3 + punpcklbw m2, m3, m1 ; base, base+1 + punpckhbw m3, m1 +.w32_loop: + Z3_PERM2 + vextracti32x8 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], ym0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + mova m7, [tlq-64*1] + cmp hd, 64 + je .w64_h64 + lea r3d, [hq*2-1] + xor r3d, 63 ; -(h + imin(w, h)) & 63 + vpbroadcastb m1, r3d + pmaxub m0, m1 + vpermb m8, m1, m7 + jmp .w64_filter +.w64_h64: + vpermb m8, m0, [tlq-64*2] +.w64_filter: + vpermb m7, m0, m7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + lea r3d, [hq-1] + vpbroadcastd m2, [tlq-3] + vpbroadcastb m13, r3d + valignq m0, m8, m7, 7 + pminub m12, m13, [pb_0to63] + valignq m11, m8, m7, 1 + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter +.w64_main: + vpbroadcastw m2, dyd + pmulhuw m3, m2, [base+z_ypos_mul2a] + pmulhuw m2, [base+z_ypos_mul2b] + vpbroadcastd m6, [base+pb_1] + vpermw m4, m3, m14 ; 64-frac, frac + psrlw m3, 5 + vpermw m5, m2, m14 + psrlw m2, 5 + packsswb m3, m2 + paddsb m1, m3, m6 + punpcklbw m2, m3, m1 ; base, base+1 + punpckhbw m3, m1 +.w64_loop: + Z3_PERM2 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +; The ipred_filter code processes 4x2 blocks in the following order +; which increases parallelism compared to doing things row by row. +; Some redundant blocks are calculated for w > 4. 
+; w4 w8 w16 w32 +; 1 1 2 1 2 3 4 1 2 3 4 9 a b c +; 2 2 3 2 3 4 5 2 3 4 5 a b c d +; 3 3 4 3 4 5 6 3 4 5 6 b c d e +; 4 4 5 4 5 6 7 4 5 6 7 c d e f +; 5 5 6 5 6 7 8 5 6 7 8 d e f g +; 6 6 7 6 7 8 9 6 7 8 9 e f g h +; 7 7 8 7 8 9 a 7 8 9 a f g h i +; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___ +; 9 9 a b h i j +; a b i j +; b j + +cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt +%define base r6-filter_taps + lea r6, [filter_taps] +%ifidn fltd, fltm + movzx fltd, fltb +%else + movzx fltd, byte fltm +%endif + vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0 + movifnidn hd, hm + shl fltd, 6 + vpbroadcastd m6, [base+pd_8] + vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __ + vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4 + vbroadcasti32x4 m8, [r6+fltq+16*1] + vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __ + vbroadcasti32x4 m10, [r6+fltq+16*3] + mova xmm0, xm6 + vpdpbusd xmm0, xmm2, xm7 + mova xmm1, xm6 + vpdpbusd xmm1, xmm2, xm8 + vpdpbusd xmm0, xmm3, xm9 + vpdpbusd xmm1, xmm3, xm10 + packssdw xmm0, xmm1 + cmp wd, 8 + jb .w4 + vpbroadcastd ym2, [tlq+5] + mova m11, [base+filter_perm] + mov r5, 0xffffffffffff000f + psrldq xmm2, 1 ; __ t0 + kmovq k1, r5 ; 0x000f + psraw xm5, xmm0, 4 + packuswb xmm2, xm5 ; __ t0 a0 b0 + pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1 + je .w8 + kxnorb k3, k3, k3 ; 0x00ff + vpbroadcastd xm3, [tlq-4] + kandnq k2, k3, k1 ; 0xffffffffffff0000 + vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __ + mova ym0, ym6 + vpdpbusd ym0, ym2, ym7 + mova ym1, ym6 + vpdpbusd ym1, ym2, ym8 + pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0 + vpbroadcastd m2, [tlq+9] + vpdpbusd ym0, ym3, ym9 + vpdpbusd ym1, ym3, ym10 + vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __ + kunpckbw k4, k1, k3 ; 0x0fff + packssdw ym0, ym1 + psraw ym0, 4 ; a0 d0 a1 b1 + packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1 + pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2 + vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __ + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + psrldq m0, m2, 1 ; __ d0 __ b0 __ t0 + vpbroadcastd m2, [tlq+13] + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + mova m12, [base+filter_end] + lea r5d, [hq-6] + mov r6, dstq + cmovp hd, r5d ; w == 16 ? 
h : h - 6 + packssdw m4, m1 + psraw m4, 4 ; e0 f0 c1 d1 a2 b2 + packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2 + pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3 +.w16_loop: + vpbroadcastd xm3, [tlq-8] + vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __ + mova m1, m6 + vpdpbusd m1, m2, m7 + mova m0, m6 + vpdpbusd m0, m2, m8 + sub tlq, 2 + vpdpbusd m1, m3, m9 + vpdpbusd m0, m3, m10 + packssdw m1, m0 + mova m0, m4 + psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3 + packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3 + pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 + vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3 + vextracti32x4 [dstq+strideq*0], m5, 2 + vextracti32x4 [dstq+strideq*1], m5, 3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + cmp wd, 16 + je .ret + mova xm13, [filter_perm+16] + mova xmm3, [r6+strideq*0] + punpckhdq xmm3, [r6+strideq*1] + vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 + pinsrb xm3, xmm3, [tlq+r5+16], 7 + pshufb xm3, xm13 + vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __ + mova m0, m6 + vpdpbusd m0, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kunpckbw k5, k3, k1 ; 0xff0f + lea r3, [strideq*3] + vpdpbusd m0, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m0, m1 + psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3 + packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 + vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 + vpbroadcastd ym2, [tlq+r5+21] + pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3 + vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3 + vextracti32x4 [dstq+strideq*0], m5, 2 + vextracti32x4 [dstq+strideq*1], m5, 3 + punpckhqdq xmm3, [r6+r3] + pinsrb xmm3, [r6+strideq*2+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __ + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kxnord k3, k3, k4 ; 0xfffff0ff + lea r4, [strideq*5] + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m4, m1 + psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3 + packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3 + vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3 + vpbroadcastd m2, [tlq+r5+25] + pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3 + vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3 + vextracti32x4 [dstq+strideq*2], m5, 2 + vextracti32x4 [dstq+r3 ], m5, 3 + punpckhqdq xmm3, [r6+r4] + pinsrb xmm3, [r6+strideq*4+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb hb fb __ + mova m0, m6 + vpdpbusd m0, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kunpckwd k1, k1, k2 ; 0x000f0000 + vpdpbusd m0, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m0, m1 + psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3 + packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3 + vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3 + vpbroadcastd m2, [tlq+r5+29] + pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7 + vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3 + vextracti32x4 [dstq+strideq*4], m5, 2 + vextracti32x4 [dstq+r4 ], m5, 3 + lea r0, [strideq+r3*2] +.w32_loop: + punpckhqdq xmm3, [r6+r0] + pinsrb xmm3, [r6+r3*2+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; hf gf ff 
__ fj ej dj __ dn cn bn __ br ar tr __ +.w32_loop_tail: + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m4, m1 + mova m1, m0 + psraw m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7 + packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7 + pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7 + vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7 + vextracti32x4 [r6+strideq*0+16], m5, 2 + vextracti32x4 [r6+strideq*1+16], m5, 3 + lea r6, [r6+strideq*2] + sub r5d, 2 + jg .w32_loop + vpermb m3, m11, m1 + cmp r5d, -6 + jg .w32_loop_tail +.ret: + RET +.w8: + vpermb ym3, ym11, ymm2 +.w8_loop: + vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __ + mova ym0, ym6 + vpdpbusd ym0, ym2, ym7 + mova ym1, ym6 + vpdpbusd ym1, ym2, ym8 + sub tlq, 2 + vpdpbusd ym0, ym3, ym9 + vpdpbusd ym1, ym3, ym10 + mova ym3, ym5 + packssdw ym0, ym1 + psraw ym5, ym0, 4 ; c0 d0 a1 b1 + packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1 + pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 + vpermb ym3, ym11, ym3 ; a0 a1 b0 b1 + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +.w4_loop: + vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __ + mova xmm0, xm6 + vpdpbusd xmm0, xmm2, xm7 + mova xmm1, xm6 + vpdpbusd xmm1, xmm2, xm8 + sub tlq, 2 + vpdpbusd xmm0, xmm3, xm9 + vpdpbusd xmm1, xmm3, xm10 + packssdw xmm0, xmm1 +.w4: + psraw xmm0, 4 ; a0 b0 + packuswb xmm0, xmm0 + movd [dstq+strideq*0], xmm0 + pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0 + movd [dstq+strideq*1], xmm2 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_loop + RET + +%endif ; ARCH_X86_64 |
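
For readers following the z1/z3 loops above, here is a rough scalar sketch, in C rather than assembly, of the 6-bit fixed-point blend that the "; 64-frac, frac", "; base, base+1" and "; xpos" comments refer to: each predicted pixel mixes two adjacent edge pixels weighted by the fractional part of its projected position. The function name z1_row_sketch and its arguments are illustrative only; this is not the dav1d reference implementation, and it deliberately ignores edge upsampling, edge filtering, the even-frac masking and the max_base/left-only handling that the AVX-512 code performs.

    #include <stdint.h>

    /* dst: one output row; top: the above-edge pixel array; dx: per-row
     * step in 1/64-pel units (a hedged simplification of the real code). */
    static void z1_row_sketch(uint8_t *dst, const uint8_t *top, int top_len,
                              int w, int y, int dx)
    {
        const int pos  = (y + 1) * dx;       /* projected position for this row */
        const int frac = pos & 63;           /* the "frac" / "64-frac" weights  */
        for (int x = 0, base = pos >> 6; x < w; x++, base++) {
            if (base + 1 < top_len)          /* the "base, base+1" pixel pair   */
                dst[x] = (top[base] * (64 - frac) +
                          top[base + 1] * frac + 32) >> 6;
            else
                dst[x] = top[top_len - 1];   /* ran past the edge: clamp        */
        }
    }

The vpermw/vpermb lookups in the assembly implement the same idea in bulk: the weight pair (64-frac, frac) is fetched from z_frac_table, the pixel pair (base, base+1) is gathered with a byte permute, and pmaddubsw plus pmulhrsw perform the multiply, add and (v + 32) >> 6 rounding in one pass.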
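The comment table before ipred_filter_8bpc lists which 4x2 blocks share a processing step. The sketch below, again a hedged C illustration rather than anything taken from dav1d, enumerates blocks in that anti-diagonal ("wavefront") order: each block needs only the pixels directly above it and to its left, so every block on the same diagonal bx + by is independent and can be batched, which is what the wider SIMD iterations exploit.

    #include <stdio.h>

    /* w, h: block size in pixels; filter blocks are 4 wide by 2 tall. */
    static void filter_block_order(int w, int h)
    {
        const int bw = w / 4, bh = h / 2;
        for (int d = 0; d < bw + bh - 1; d++) {      /* anti-diagonal index  */
            printf("step %d:", d + 1);
            for (int by = 0; by <= d && by < bh; by++) {
                const int bx = d - by;
                if (bx < bw)
                    printf(" (%d,%d)", bx, by);      /* independent blocks   */
            }
            printf("\n");
        }
    }

Calling filter_block_order(16, 16), for example, reproduces the eleven steps labelled 1 through b in the w16 column of the comment table.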