; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro

smooth_weights: SMOOTH_WEIGHT_TABLE         \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4

; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __
filter_taps:  db 10,  0,  0,  0,  2, 10,  0,  0,  1,  1, 10,  0,  1,  1,  2, 10
              db  6,  0,  0,  0,  2,  6,  0,  0,  2,  2,  6,  0,  1,  2,  2,  6
              db  0, 12, -6,  0,  0,  9, -5,  0,  0,  7, -3,  0,  0,  5, -3,  0
              db 12,  2, -4,  0,  9,  2, -3,  0,  7,  2, -3,  0,  5,  3, -3,  0
              db 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16
              db 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16
              db  0, 10,-10,  0,  0,  6, -6,  0,  0,  4, -4,  0,  0,  2, -2,  0
              db 10,  0,-10,  0,  6,  0, -6,  0,  4,  0, -4,  0,  2,  0, -2,  0
              db  8,  0,  0,  0,  0,  8,  0,  0,  0,  0,  8,  0,  0,  0,  0,  8
              db  4,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4
              db  0, 16, -8,  0,  0, 16, -8,  0,  0, 16, -8,  0,  0, 16, -8,  0
              db 16,  0, -4,  0, 16,  0, -4,  0, 16,  0, -4,  0, 16,  0, -4,  0
              db  8,  0,  0,  0,  3,  8,  0,  0,  2,  3,  8,  0,  1,  2,  3,  8
              db  4,  0,  0,  0,  3,  4,  0,  0,  2,  3,  4,  0,  2,  2,  3,  4
              db  0, 10, -2,  0,  0,  6, -1,  0,  0,  4, -1,  0,  0,  2,  0,  0
              db 10,  3, -1,  0,  6,  4, -1,  0,  4,  4, -1,  0,  3,  3, -1,  0
              db 14,  0,  0,  0,  0, 14,  0,  0,  0,  0, 14,  0,  0,  0,  0, 14
              db 12,  0,  0,  0,  1, 12,  0,  0,  0,  0, 12,  0,  0,  0,  1, 12
              db  0, 14,-12,  0,  0, 12,-10,  0,  0, 11, -9,  0,  0, 10, -8,  0
              db 14,  0,-10,  0, 12,  0, -9,  0, 11,  1, -8,  0,  9,  1, -7,  0
filter_perm:  db  0,  1,  2,  3, 24, 25, 26, 27,  4,  5,  6,  7, 28, 29, 30, 31
              db 15, 11,  7,  3, 15, 11,  7,  3, 15, 11,  7,  3, 15, 11,  7,131
              db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147
              db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163
filter_end:   dd  2,  3, 16, 17, -1, -1, 20, 21,  0,  6, 24, 30,  1,  7, 25, 31
smooth_shuf:  db  7,  7,  7,  7,  0,  1,  0,  1,  3,  3,  3,  3,  8,  9,  8,  9
              db  5,  5,  5,  5,  4,  5,  4,  5,  1,  1,  1,  1, 12, 13, 12, 13
              db  6,  6,  6,  6,  2,  3,  2,  3,  2,  2,  2,  2, 10, 11, 10, 11
              db  4,  4,  4,  4,  6,  7,  6,  7,  0,  0,  0,  0, 14, 15, 14, 15
smooth_endA:  db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
              db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
              db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
              db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
smooth_endB:  db  1,  3,  5,  7,  9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
              db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95
              db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111
              db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
ipred_h_shuf: db  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4
              db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
pal_unpack:   db  0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
pal_perm:     db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pb_63to0:     db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48
              db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
              db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
              db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
z_frac_table: db 64,  0, 62,  2, 60,  4, 58,  6, 56,  8, 54, 10, 52, 12, 50, 14
              db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30
              db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46
              db 16, 48, 14, 50, 12, 52, 10, 54,  8, 56,  6, 58,  4, 60,  2, 62
z_filter_s1:  db -1, -1, -1,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6
              db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22
              db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38
              db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54
z_filter_s5:  db 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16
              db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32
              db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48
              db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64
z_filter_s3:  db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
z_filter_s2:  db  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
z_filter_s4:  db  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8
z_xpos_bc:    db 17, 17, 17, 17, 33, 33, 33, 33,  9,  9,  9,  9,  9,  9,  9,  9
z_filter4_s1: db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
              db  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8
z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72
z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80
z_xpos_off2a: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
              db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
              db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
              db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56
z_xpos_off2b: db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
              db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
              db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
              db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64
z_xpos_mul:   dw  4,  4,  4,  4,  8,  8,  4,  4, 12, 12,  8,  8, 16, 16,  8,  8
              dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16
z_ypos_off1:  db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67
              db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71
              db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75
              db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79
z_ypos_off2:  db 64, 65, 64, 65,  0,  0,  0,  0, 64, 65, 64, 65,  0,  0,  0,  0
              db 65, 66, 65, 66,  1,  1,  1,  1, 65, 66, 65, 66,  1,  1,  1,  1
              db 66, 67, 66, 67,  2,  2,  2,  2, 66, 67, 66, 67,  2,  2,  2,  2
              db 67, 68, 67, 68,  3,  3,  3,  3, 67, 68, 67, 68,  3,  3,  3,  3
z_ypos_off3:  db  1,  2,  1,  2,  1,  1,  1,  1,  3,  4,  3,  4,  1,  1,  1,  1
              db  5,  6,  5,  6,  3,  3,  3,  3,  7,  8,  7,  8,  3,  3,  3,  3
              db  9, 10,  9, 10,  5,  5,  5,  5, 11, 12, 11, 12,  5,  5,  5,  5
              db 13, 14, 13, 14,  7,  7,  7,  7, 15, 16, 15, 16,  7,  7,  7,  7
z_ypos_mul1a: dw  1,  2,  3,  4,  5,  6,  7,  8, 17, 18, 19, 20, 21, 22, 23, 24
              dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56
z_ypos_mul1b: dw  9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32
              dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64
z_ypos_mul2a: dw  1*512,  2*512,  3*512,  4*512,  5*512,  6*512,  7*512,  8*512
              dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
              dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
              dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
z_ypos_mul2b: dw  9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
              dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
              dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
              dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z3_upsample:  db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
              db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,  9,  8
z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
              db 39, 39, 47, 47, 47, 79, 79, 79
z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
              db 32,  0, 32,  0, 24,  0, 24,  0, 16,  0, 16,  0
              db  0, 32,  0, 32,  0, 24,  0, 24,  0, 16,  0, 16
pb_8_56_0_0:  db  8, 56,  0,  0
pb_m4_36:     times 2 db -4, 36
pb_127_m127:  times 2 db 127, -127
pb_8:         times 4 db 8
pb_15:        times 4 db 15
pb_16:        times 4 db 16
pb_31:        times 4 db 31
pb_63:        times 4 db 63
pb_90:        times 4 db 90
pb_128:       times 4 db 128
pw_128:       times 2 dw 128
pw_255:       times 2 dw 255
pw_512:       times 2 dw 512

%define pb_1  (ipred_h_shuf+24)
%define pb_2  (ipred_h_shuf+20)
%define pb_3  (ipred_h_shuf+16)
%define pb_4  (smooth_shuf +48)
%define pb_7  (ipred_h_shuf+ 0)
%define pb_9  (z_xpos_bc   + 8)
%define pb_17 (z_xpos_bc   + 0)
%define pb_33 (z_xpos_bc   + 4)
%define pd_8  (filter_taps+128)

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4)

JMP_TABLE ipred_h_8bpc,        avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth_8bpc,    avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_8bpc,   avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_8bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_8bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_8bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc,       avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                          s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_8bpc,  avx512icl, h4, h8, h16, h32, h64

cextern dr_intra_derivative
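; pb_0to63 (the byte sequence 0, 1, 2, ..., 63) is defined in another file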
cextern pb_0to63

SECTION .text

INIT_ZMM avx512icl
cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h
    lea             r5, [ipred_dc_left_8bpc_avx512icl_table]
    movd            xm0, wm
    tzcnt           wd, wm
    inc             tlq
    movifnidn       hd, hm
    movu            ym1, [tlq]
    movd            xmm3, wd
    movsxd          r6, [r5+wq*4]
    vpbroadcastd    ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
    psrld           xm0, 1
    vpdpbusd        ym0, ym1, ym2
    add             r6, r5
    add             r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    jmp             r6

cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
    lea             r5, [ipred_dc_left_8bpc_avx512icl_table]
    mov             hd, hm
    tzcnt           r6d, hd
    sub             tlq, hq
    tzcnt           wd, wm
    movd            xm0, hm
    movu            ym1, [tlq]
    movd            xmm3, r6d
    movsxd          r6, [r5+r6*4]
    vpbroadcastd    ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
    psrld           xm0, 1
    vpdpbusd        ym0, ym1, ym2
    add             r6, r5
    add             r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    jmp             r6
.h64:
    movu            ym1, [tlq+32] ; unaligned when jumping here from dc_top
    vpdpbusd        ym0, ym1, ym2
.h32:
    vextracti32x4   xm1, ym0, 1
    paddd           xm0, xm1
.h16:
    punpckhqdq      xm1, xm0, xm0
    paddd           xm0, xm1
.h8:
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
.h4:
    vpsrlvd         xm0, xmm3
    lea             stride3q, [strideq*3]
    vpbroadcastb    m0, xm0
    jmp             wq

cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
    movifnidn       hd, hm
    movifnidn       wd, wm
    tzcnt           r6d, hd
    lea             r5d, [wq+hq]
    movd            xm0, r5d
    tzcnt           r5d, r5d
    movd            xmm4, r5d
    lea             r5, [ipred_dc_8bpc_avx512icl_table]
    tzcnt           wd, wd
    movsxd          r6, [r5+r6*4]
    movsxd          wq, [r5+wq*4+5*4]
    vpbroadcastd    ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1]
    psrld           xm0, 1
    add             r6, r5
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             r6
.h4:
    movd            xmm1, [tlq-4]
    vpdpbusd        xm0, xmm1, xm3
    jmp             wq
.w4:
    movd            xmm1, [tlq+1]
    vpdpbusd        xm0, xmm1, xm3
    cmp             hd, 4
    jg .w4_mul
    psrlw           xmm0, xm0, 3
    jmp .w4_end
.w4_mul:
    punpckhqdq      xmm1, xm0, xm0
    lea             r2d, [hq*2]
    mov             r6d, 0x55563334
    paddd           xmm1, xm0
    shrx            r6d, r6d, r2d
    psrlq           xmm0, xmm1, 32
    paddd           xmm0, xmm1
    movd            xmm1, r6d
    psrld           xmm0, 2
    pmulhuw         xmm0, xmm1
.w4_end:
    vpbroadcastb    xm0, xmm0
.s4:
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm0
    movd   [dstq+strideq*2], xm0
    movd   [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s4
    RET
.h8:
    movq            xmm1, [tlq-8]
    vpdpbusd        xm0, xmm1, xm3
    jmp             wq
.w8:
    movq            xmm1, [tlq+1]
    vextracti32x4   xm2, ym0, 1
    vpdpbusd        xm0, xmm1, xm3
    paddd           xmm2, xm2, xm0
    punpckhqdq      xmm0, xmm2, xmm2
    paddd           xmm0, xmm2
    psrlq           xmm1, xmm0, 32
    paddd           xmm0, xmm1
    vpsrlvd         xmm0, xmm4
    cmp             hd, 8
    je .w8_end
    mov             r6d, 0x5556
    mov             r2d, 0x3334
    cmp             hd, 32
    cmove           r6d, r2d
    movd            xmm1, r6d
    pmulhuw         xmm0, xmm1
.w8_end:
    vpbroadcastb    xm0, xmm0
.s8:
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm0
    movq   [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s8
    RET
.h16:
    mova            xmm1, [tlq-16]
    vpdpbusd        xm0, xmm1, xm3
    jmp             wq
.w16:
    movu            xmm1, [tlq+1]
    vextracti32x4   xm2, ym0, 1
    vpdpbusd        xm0, xmm1, xm3
    paddd           xmm2, xm2, xm0
    punpckhqdq      xmm0, xmm2, xmm2
    paddd           xmm0, xmm2
    psrlq           xmm1, xmm0, 32
    paddd           xmm0, xmm1
    vpsrlvd         xmm0, xmm4
    cmp             hd, 16
    je .w16_end
    mov             r6d, 0x5556
    mov             r2d, 0x3334
    test            hb, 8|32
    cmovz           r6d, r2d
    movd            xmm1, r6d
    pmulhuw         xmm0, xmm1
.w16_end:
    vpbroadcastb    xm0, xmm0
.s16:
    mova   [dstq+strideq*0], xm0
    mova   [dstq+strideq*1], xm0
    mova   [dstq+strideq*2], xm0
    mova   [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s16
    RET
.h32:
    mova            ym1, [tlq-32]
    vpdpbusd        ym0, ym1, ym3
    jmp             wq
.w32:
    movu            ym1, [tlq+1]
    vpdpbusd        ym0, ym1, ym3
    vextracti32x4   xm1, ym0, 1
    paddd           xmm1, xm1, xm0
    punpckhqdq      xmm0, xmm1, xmm1
    paddd           xmm0, xmm1
    psrlq           xmm1, xmm0, 32
    paddd           xmm0, xmm1
    vpsrlvd         xmm0, xmm4
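    ; Division by a non-power-of-two sum (w+h = 12, 20, 24, 40, ...) is done
    ; as a power-of-two shift followed by a fixed-point multiply, where
    ; 0x5556 ~ (1<<16)/3 and 0x3334 ~ (1<<16)/5; pmulhuw keeps the high
    ; word of the product, i.e. x*k >> 16.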
    cmp             hd, 32
    je .w32_end
    lea             r2d, [hq*2]
    mov             r6d, 0x33345556
    shrx            r6d, r6d, r2d
    movd            xmm1, r6d
    pmulhuw         xmm0, xmm1
.w32_end:
    vpbroadcastb    ym0, xmm0
.s32:
    mova   [dstq+strideq*0], ym0
    mova   [dstq+strideq*1], ym0
    mova   [dstq+strideq*2], ym0
    mova   [dstq+stride3q ], ym0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s32
    RET
.h64:
    mova            ym1, [tlq-64]
    mova            ym2, [tlq-32]
    vpdpbusd        ym0, ym1, ym3
    vpdpbusd        ym0, ym2, ym3
    jmp             wq
.w64:
    movu            ym1, [tlq+ 1]
    movu            ym2, [tlq+33]
    vpdpbusd        ym0, ym1, ym3
    vpdpbusd        ym0, ym2, ym3
    vextracti32x4   xm1, ym0, 1
    paddd           xmm1, xm1, xm0
    punpckhqdq      xmm0, xmm1, xmm1
    paddd           xmm0, xmm1
    psrlq           xmm1, xmm0, 32
    paddd           xmm0, xmm1
    vpsrlvd         xmm0, xmm4
    cmp             hd, 64
    je .w64_end
    mov             r6d, 0x33345556
    shrx            r6d, r6d, hd
    movd            xmm1, r6d
    pmulhuw         xmm0, xmm1
.w64_end:
    vpbroadcastb    m0, xmm0
.s64:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s64
    RET

cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3
    lea             r5, [ipred_dc_splat_8bpc_avx512icl_table]
    tzcnt           wd, wm
    movifnidn       hd, hm
    movsxd          wq, [r5+wq*4]
    vpbroadcastd    m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             wq

cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
    lea             r5, [ipred_dc_splat_8bpc_avx512icl_table]
    tzcnt           wd, wm
    movu            m0, [tlq+1]
    movifnidn       hd, hm
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             wq

cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
%define base r6-ipred_h_8bpc_avx512icl_table
    lea             r6, [ipred_h_8bpc_avx512icl_table]
    tzcnt           wd, wm
    mov             hd, hm
    movsxd          wq, [r6+wq*4]
    lea             stride3q, [strideq*3]
    sub             tlq, hq
    add             wq, r6
    jmp             wq
.w4:
    mova            xmm1, [base+ipred_h_shuf+16]
.w4_loop:
    movd            xmm0, [tlq+hq-4]
    pshufb          xmm0, xmm1
    movd   [dstq+strideq*0], xmm0
    pextrd [dstq+strideq*1], xmm0, 1
    pextrd [dstq+strideq*2], xmm0, 2
    pextrd [dstq+stride3q ], xmm0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w4_loop
    RET
.w8:
    movsldup        xmm2, [base+ipred_h_shuf+16]
    movshdup        xmm3, [base+ipred_h_shuf+16]
.w8_loop:
    movd            xmm1, [tlq+hq-4]
    pshufb          xmm0, xmm1, xmm2
    pshufb          xmm1, xmm3
    movq   [dstq+strideq*0], xmm0
    movq   [dstq+strideq*1], xmm1
    movhps [dstq+strideq*2], xmm0
    movhps [dstq+stride3q ], xmm1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8_loop
    RET
.w16:
    movsldup        m1, [base+smooth_shuf]
.w16_loop:
    vpbroadcastd    m0, [tlq+hq-4]
    pshufb          m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w16_loop
    RET
.w32:
    vpbroadcastd    ym3, [base+pb_1]
    vpord           m2, m3, [base+pb_2] {1to16}
.w32_loop:
    vpbroadcastd    m1, [tlq+hq-4]
    pshufb          m0, m1, m2
    pshufb          m1, m3
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w32_loop
    RET
.w64:
    vpbroadcastd    m4, [base+pb_3]
    vpbroadcastd    m5, [base+pb_2]
    vpbroadcastd    m6, [base+pb_1]
    pxor            m7, m7
.w64_loop:
    vpbroadcastd    m3, [tlq+hq-4]
    pshufb          m0, m3, m4
    pshufb          m1, m3, m5
    pshufb          m2, m3, m6
    pshufb          m3, m7
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w64_loop
    RET

%macro PAETH 0
    psubusb         m1, m5, m4
    psubusb         m0, m4, m5
    por             m1, m0         ; tdiff
    pavgb           m2, m6, m4
    vpcmpub         k1, m1, m7, 1  ; tdiff < ldiff
    vpblendmb   m0{k1}, m4, m6
    vpternlogd      m4, m6, m8, 0x28 ; (m4 ^ m6) & m8
    psubusb         m3, m5, m2
    psubb           m2, m4
    psubusb         m2, m5
    por             m2, m3
    pminub          m1, m7
    paddusb         m2, m2
    por             m2, m4         ; min(tldiff, 255)
    vpcmpub         k1, m2, m1, 1  ; tldiff < ldiff && tldiff < tdiff
    vmovdqu8    m0{k1}, m5
%endmacro

cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3
    lea             r6, [ipred_paeth_8bpc_avx512icl_table]
    tzcnt           wd, wm
    vpbroadcastb    m5, [tlq] ; topleft
    mov             hd, hm
    movsxd          wq, [r6+wq*4]
    vpbroadcastd    m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1]
    lea             topq, [tlq+1]
    sub             tlq, hq
    add             wq, r6
    lea             stride3q, [strideq*3]
    jmp             wq
INIT_YMM avx512icl
.w4:
    vpbroadcastd    m6, [topq]
    mova            m9, [ipred_h_shuf]
    psubusb         m7, m5, m6
    psubusb         m0, m6, m5
    por             m7, m0 ; ldiff
.w4_loop:
    vpbroadcastq    m4, [tlq+hq-8]
    pshufb          m4, m9 ; left
    PAETH
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm0, 3
    sub             hd, 8
    jl .w4_ret
    vextracti32x4   xm0, m0, 1
    lea             dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm0, 3
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_ret:
    RET
INIT_ZMM avx512icl
.w8:
    vpbroadcastq    m6, [topq]
    movsldup        m9, [smooth_shuf]
    psubusb         m7, m5, m6
    psubusb         m0, m6, m5
    por             m7, m0
.w8_loop:
    vpbroadcastq    m4, [tlq+hq-8]
    pshufb          m4, m9
    PAETH
    vextracti32x4   xm1, m0, 2
    vextracti32x4   xm2, ym0, 1
    vextracti32x4   xm3, m0, 3
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm2
    movq   [dstq+stride3q ], xm3
    sub             hd, 8
    jl .w8_ret
    lea             dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    lea             dstq, [dstq+strideq*4]
    jg .w8_loop
.w8_ret:
    RET
.w16:
    vbroadcasti32x4 m6, [topq]
    movsldup        m9, [smooth_shuf]
    psubusb         m7, m5, m6
    psubusb         m0, m6, m5
    por             m7, m0
.w16_loop:
    vpbroadcastd    m4, [tlq+hq-4]
    pshufb          m4, m9
    PAETH
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w16_loop
    RET
.w32:
    vbroadcasti32x8 m6, [topq]
    mova            ym9, ym8
    psubusb         m7, m5, m6
    psubusb         m0, m6, m5
    por             m7, m0
.w32_loop:
    vpbroadcastd    m4, [tlq+hq-2]
    pshufb          m4, m9
    PAETH
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32_loop
    RET
.w64:
    movu            m6, [topq]
    psubusb         m7, m5, m6
    psubusb         m0, m6, m5
    por             m7, m0
.w64_loop:
    vpbroadcastb    m4, [tlq+hq-1]
    PAETH
    mova            [dstq], m0
    add             dstq, strideq
    dec             hd
    jg .w64_loop
    RET

cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
%define base r6-ipred_smooth_v_8bpc_avx512icl_table
    lea             r6, [ipred_smooth_v_8bpc_avx512icl_table]
    tzcnt           wd, wm
    mov             hd, hm
    movsxd          wq, [r6+wq*4]
    vpbroadcastd    m0, [base+pb_127_m127]
    vpbroadcastd    m1, [base+pw_128]
    lea             weightsq, [base+smooth_weights+hq*4]
    neg             hq
    vpbroadcastb    m4, [tlq+hq] ; bottom
    add             wq, r6
    lea             stride3q, [strideq*3]
    jmp             wq
.w4:
    vpbroadcastd    m2, [tlq+1]
    movshdup        m5, [smooth_shuf]
    mova            ym6, [smooth_endA]
    punpcklbw       m2, m4 ; top, bottom
    pmaddubsw       m3, m2, m0
    paddw           m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
    paddw           m3, m1 ; 128 * top + 129 * bottom + 128
.w4_loop:
    vbroadcasti32x4 m0, [weightsq+hq*2]
    pshufb          m0, m5
    pmaddubsw       m0, m2, m0
    paddw           m0, m3
    vpermb          m0, m6, m0
    vextracti32x4   xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm1, 2
    add             hq, 8
    jg .ret
    lea             dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+stride3q ], xm1, 3
    lea             dstq, [dstq+strideq*4]
    jl .w4_loop
.ret:
    RET
.w8:
    vpbroadcastq    m2, [tlq+1]
    movshdup        m5, [smooth_shuf]
    mova            ym6, [smooth_endA]
    punpcklbw       m2, m4
    pmaddubsw       m3, m2, m0
    paddw           m1, m2
    paddw           m3, m1
.w8_loop:
    vpbroadcastq    m0, [weightsq+hq*2]
    pshufb          m0, m5
    pmaddubsw       m0, m2, m0
    paddw           m0, m3
    vpermb          m0, m6, m0
    vextracti32x4   xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea             dstq, [dstq+strideq*4]
    add             hq, 4
    jl .w8_loop
    RET
.w16:
    vbroadcasti32x4 m3, [tlq+1]
    movshdup        m6, [smooth_shuf]
    mova            m7, [smooth_endB]
    punpcklbw       m2, m3, m4
    punpckhbw       m3, m4
    pmaddubsw       m4, m2, m0
    pmaddubsw       m5, m3, m0
    paddw           m0, m1, m2
    paddw           m1, m3
    paddw           m4, m0
    paddw           m5, m1
.w16_loop:
    vpbroadcastq    m1, [weightsq+hq*2]
    pshufb          m1, m6
    pmaddubsw       m0, m2, m1
    pmaddubsw       m1, m3, m1
    paddw           m0, m4
    paddw           m1, m5
    vpermt2b        m0, m7, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea             dstq, [dstq+strideq*4]
    add             hq, 4
    jl .w16_loop
    RET
.w32:
    vbroadcasti32x8 m3, [tlq+1]
    movshdup        m6, [smooth_shuf]
    mova            m7, [smooth_endB]
    punpcklbw       m2, m3, m4
    punpckhbw       m3, m4
    pmaddubsw       m4, m2, m0
    pmaddubsw       m5, m3, m0
    paddw           m0, m1, m2
    paddw           m1, m3
    paddw           m4, m0
    paddw           m5, m1
.w32_loop:
    vpbroadcastd    m1, [weightsq+hq*2]
    pshufb          m1, m6
    pmaddubsw       m0, m2, m1
    pmaddubsw       m1, m3, m1
    paddw           m0, m4
    paddw           m1, m5
    vpermt2b        m0, m7, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea             dstq, [dstq+strideq*2]
    add             hq, 2
    jl .w32_loop
    RET
.w64:
    movu            m3, [tlq+1]
    mova            m6, [smooth_endB]
    punpcklbw       m2, m3, m4
    punpckhbw       m3, m4
    pmaddubsw       m4, m2, m0
    pmaddubsw       m5, m3, m0
    paddw           m0, m1, m2
    paddw           m1, m3
    paddw           m4, m0
    paddw           m5, m1
.w64_loop:
    vpbroadcastw    m1, [weightsq+hq*2]
    pmaddubsw       m0, m2, m1
    pmaddubsw       m1, m3, m1
    paddw           m0, m4
    paddw           m1, m5
    vpermt2b        m0, m6, m1
    mova            [dstq], m0
    add             dstq, strideq
    inc             hq
    jl .w64_loop
    RET

cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
%define base r5-ipred_smooth_h_8bpc_avx512icl_table
    lea             r5, [ipred_smooth_h_8bpc_avx512icl_table]
    mov             r6d, wd
    tzcnt           wd, wd
    vpbroadcastb    m4, [tlq+r6] ; right
    mov             hd, hm
    movsxd          wq, [r5+wq*4]
    vpbroadcastd    m5, [base+pb_127_m127]
    vpbroadcastd    m6, [base+pw_128]
    sub             tlq, hq
    add             wq, r5
    vpmovb2m        k1, m6
    lea             stride3q, [strideq*3]
    jmp             wq
.w4:
    movsldup        m3, [smooth_shuf]
    vpbroadcastq    m7, [smooth_weights+4*2]
    mova            ym8, [smooth_endA]
.w4_loop:
    vpbroadcastq    m0, [tlq+hq-8]
    mova            m2, m4
    vpshufb     m2{k1}, m0, m3 ; left, right
    pmaddubsw       m0, m2, m5
    pmaddubsw       m1, m2, m7
    paddw           m2, m6
    paddw           m0, m2
    paddw           m0, m1
    vpermb          m0, m8, m0
    vextracti32x4   xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm1, 2
    sub             hd, 8
    jl .ret
    lea             dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+stride3q ], xm1, 3
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.ret:
    RET
.w8:
    movsldup        m3, [smooth_shuf]
    vbroadcasti32x4 m7, [smooth_weights+8*2]
    mova            ym8, [smooth_endA]
.w8_loop:
    vpbroadcastd    m0, [tlq+hq-4]
    mova            m2, m4
    vpshufb     m2{k1}, m0, m3
    pmaddubsw       m0, m2, m5
    pmaddubsw       m1, m2, m7
    paddw           m2, m6
    paddw           m0, m2
    paddw           m0, m1
    vpermb          m0, m8, m0
    vextracti32x4   xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8_loop
    RET
.w16:
    movsldup        m7, [smooth_shuf]
    vbroadcasti32x4 m8, [smooth_weights+16*2]
    vbroadcasti32x4 m9, [smooth_weights+16*3]
    mova            m10, [smooth_endB]
.w16_loop:
    vpbroadcastd    m0, [tlq+hq-4]
    mova            m3, m4
    vpshufb     m3{k1}, m0, m7
    pmaddubsw       m2, m3, m5
    pmaddubsw       m0, m3, m8
    pmaddubsw       m1, m3, m9
    paddw           m3, m6
    paddw           m2, m3
    paddw           m0, m2
    paddw           m1, m2
    vpermt2b        m0, m10, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w16_loop
    RET
.w32:
    mova            m10, [smooth_endA]
    vpbroadcastd    ym7, [pb_1]
    vbroadcasti32x8 m8, [smooth_weights+32*2]
    vbroadcasti32x8 m9, [smooth_weights+32*3]
    vshufi32x4      m10, m10, q3120
.w32_loop:
    vpbroadcastd    m0, [tlq+hq-2]
    mova            m3, m4
    vpshufb     m3{k1}, m0, m7
    pmaddubsw       m2, m3, m5
    pmaddubsw       m0, m3, m8
    pmaddubsw       m1, m3, m9
    paddw           m3, m6
    paddw           m2, m3
    paddw           m0, m2
    paddw           m1, m2
    vpermt2b        m0, m10, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32_loop
    RET
.w64:
    mova            m7, [smooth_weights+64*2]
    mova            m8, [smooth_weights+64*3]
    mova            m9, [smooth_endA]
.w64_loop:
    mova            m3, m4
    vpbroadcastb m3{k1}, [tlq+hq-1]
    pmaddubsw       m2, m3, m5
    pmaddubsw       m0, m3, m7
    pmaddubsw       m1, m3, m8
    paddw           m3, m6
    paddw           m2, m3
    paddw           m0, m2
    paddw           m1, m2
    vpermt2b        m0, m9, m1
    mova            [dstq], m0
    add             dstq, strideq
    dec             hd
    jg .w64_loop
    RET

cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
%define base r5-ipred_smooth_8bpc_avx512icl_table
    lea             r5, [ipred_smooth_8bpc_avx512icl_table]
    mov             r6d, wd
    tzcnt           wd, wd
    mov             hd, hm
    vpbroadcastb    m6, [tlq+r6] ; right
    sub             tlq, hq
    movsxd          wq, [r5+wq*4]
    vpbroadcastd    m7, [base+pb_127_m127]
    vpbroadcastb    m0, [tlq] ; bottom
    vpbroadcastd    m1, [base+pw_255]
    add             wq, r5
    lea             v_weightsq, [base+smooth_weights+hq*2]
    vpmovb2m        k1, m1
    lea             stride3q, [strideq*3]
    jmp             wq
.w4:
    vpbroadcastd    m8, [tlq+hq+1]
    movsldup        m4, [smooth_shuf]
    movshdup        m5, [smooth_shuf]
    vpbroadcastq    m9, [smooth_weights+4*2]
    mova            ym11, [smooth_endA]
    punpcklbw       m8, m0 ; top, bottom
    pmaddubsw       m10, m8, m7
    paddw           m1, m8  ; 1 * top + 256 * bottom + 255
    paddw           m10, m1 ; 128 * top + 129 * bottom + 255
.w4_loop:
    vpbroadcastq    m1, [tlq+hq-8]
    vbroadcasti32x4 m0, [v_weightsq]
    add             v_weightsq, 16
    mova            m2, m6
    vpshufb     m2{k1}, m1, m4 ; left, right
    pmaddubsw       m1, m2, m7 ; 127 * left - 127 * right
    pshufb          m0, m5
    pmaddubsw       m0, m8, m0
    paddw           m1, m2 ; 128 * left + 129 * right
    pmaddubsw       m2, m9
    paddw           m0, m10
    paddw           m1, m2
    pavgw           m0, m1
    vpermb          m0, m11, m0
    vextracti32x4   xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm1, 2
    sub             hd, 8
    jl .ret
    lea             dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+stride3q ], xm1, 3
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.ret:
    RET
.w8:
    vpbroadcastq    m8, [tlq+hq+1]
    movsldup        m4, [smooth_shuf]
    movshdup        m5, [smooth_shuf]
    vbroadcasti32x4 m9, [smooth_weights+8*2]
    mova            ym11, [smooth_endA]
    punpcklbw       m8, m0
    pmaddubsw       m10, m8, m7
    paddw           m1, m8
    paddw           m10, m1
.w8_loop:
    vpbroadcastd    m1, [tlq+hq-4]
    vpbroadcastq    m0, [v_weightsq]
    add             v_weightsq, 8
    mova            m2, m6
    vpshufb     m2{k1}, m1, m4
    pmaddubsw       m1, m2, m7
    pshufb          m0, m5
    pmaddubsw       m0, m8, m0
    paddw           m1, m2
    pmaddubsw       m2, m9
    paddw           m0, m10
    paddw           m1, m2
    pavgw           m0, m1
    vpermb          m0, m11, m0
    vextracti32x4   xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8_loop
    RET
.w16:
    vbroadcasti32x4 m9, [tlq+hq+1]
    movsldup        m5, [smooth_shuf]
    movshdup        m10, [smooth_shuf]
    vbroadcasti32x4 m11, [smooth_weights+16*2]
    vbroadcasti32x4 m12, [smooth_weights+16*3]
    mova            m15, [smooth_endB]
    punpcklbw       m8, m9, m0
    punpckhbw       m9, m0
    pmaddubsw       m13, m8, m7
    pmaddubsw       m14, m9, m7
    paddw           m0, m1, m8
    paddw           m1, m9
    paddw           m13, m0
    paddw           m14, m1
.w16_loop:
    vpbroadcastd    m0, [tlq+hq-4]
    vpbroadcastq    m1, [v_weightsq]
    add             v_weightsq, 8
    mova            m4, m6
    vpshufb     m4{k1}, m0, m5
    pmaddubsw       m2, m4, m7
    pshufb          m1, m10
    pmaddubsw       m0, m8, m1
    pmaddubsw       m1, m9, m1
    paddw           m2, m4
    pmaddubsw       m3, m4, m11
    pmaddubsw       m4, m12
    paddw           m0, m13
    paddw           m1, m14
    paddw           m3, m2
    paddw           m4, m2
    pavgw           m0, m3
    pavgw           m1, m4
    vpermt2b        m0, m15, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w16_loop
    RET
.w32:
    vbroadcasti32x8 m9, [tlq+hq+1]
    movshdup        m10, [smooth_shuf]
    mova            m12, [smooth_weights+32*2]
    vpbroadcastd    ym5, [pb_1]
    mova            m15, [smooth_endB]
    punpcklbw       m8, m9, m0
    punpckhbw       m9, m0
    pmaddubsw       m13, m8, m7
    pmaddubsw       m14, m9, m7
    vshufi32x4      m11, m12, m12, q2020
    vshufi32x4      m12, m12, q3131
    paddw           m0, m1, m8
    paddw           m1, m9
    paddw           m13, m0
    paddw           m14, m1
.w32_loop:
    vpbroadcastd    m0, [tlq+hq-2]
    vpbroadcastd    m1, [v_weightsq]
    add             v_weightsq, 4
    mova            m4, m6
    vpshufb     m4{k1}, m0, m5
    pmaddubsw       m2, m4, m7
    pshufb          m1, m10
    pmaddubsw       m0, m8, m1
    pmaddubsw       m1, m9, m1
    paddw           m2, m4
    pmaddubsw       m3, m4, m11
    pmaddubsw       m4, m12
    paddw           m0, m13
    paddw           m1, m14
    paddw           m3, m2
    paddw           m4, m2
    pavgw           m0, m3
    pavgw           m1, m4
    vpermt2b        m0, m15, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32_loop
    RET
.w64:
    movu            m9, [tlq+hq+1]
    mova            m11, [smooth_weights+64*2]
    mova            m2, [smooth_weights+64*3]
    mova            m14, [smooth_endB]
    punpcklbw       m8, m9, m0
    punpckhbw       m9, m0
    pmaddubsw       m12, m8, m7
    pmaddubsw       m13, m9, m7
    vshufi32x4      m10, m11, m2, q2020
    vshufi32x4      m11, m2, q3131
    paddw           m0, m1, m8
    paddw           m1, m9
    paddw           m12, m0
    paddw           m13, m1
.w64_loop:
    mova            m4, m6
    vpbroadcastb m4{k1}, [tlq+hq-1]
    vpbroadcastw    m1, [v_weightsq]
    add             v_weightsq, 2
    pmaddubsw       m2, m4, m7
    pmaddubsw       m0, m8, m1
    pmaddubsw       m1, m9, m1
    paddw           m2, m4
    pmaddubsw       m3, m4, m10
    pmaddubsw       m4, m11
    paddw           m0, m12
    paddw           m1, m13
    paddw           m3, m2
    paddw           m4, m2
    pavgw           m0, m3
    pavgw           m1, m4
    vpermt2b        m0, m14, m1
    mova            [dstq], m0
    add             dstq, strideq
    dec             hd
    jg .w64_loop
    RET

cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
    movifnidn       wd, wm
    movifnidn       hd, hm
    lea             stride3q, [strideq*3]
    cmp             wd, 8
    jg .w32
    movq            xmm3, [palq]
    je .w8
.w4:
    movq            xmm0, [idxq]
    add             idxq, 8
    psrlw           xmm1, xmm0, 4
    punpcklbw       xmm0, xmm1
    pshufb          xmm0, xmm3, xmm0
    movd   [dstq+strideq*0], xmm0
    pextrd [dstq+strideq*1], xmm0, 1
    pextrd [dstq+strideq*2], xmm0, 2
    pextrd [dstq+stride3q ], xmm0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w4
    RET
.w8:
    movu            xmm2, [idxq]
    add             idxq, 16
    pshufb          xmm1, xmm3, xmm2
    psrlw           xmm2, 4
    pshufb          xmm2, xmm3, xmm2
    punpcklbw       xmm0, xmm1, xmm2
    punpckhbw       xmm1, xmm2
    movq   [dstq+strideq*0], xmm0
    movhps [dstq+strideq*1], xmm0
    movq   [dstq+strideq*2], xmm1
    movhps [dstq+stride3q ], xmm1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8
    RET
.w16:
    pmovzxdq        m0, [idxq]
    add             idxq, 32
    vpmultishiftqb  m0, m3, m0
    pshufb          m0, m5, m0
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w16
    RET
.w32:
    vpbroadcastq    m3, [pal_unpack+0]
    vpbroadcastq    m5, [palq]
    cmp             wd, 32
    jl .w16
    pmovzxbd        m2, [pal_perm]
    vpbroadcastq    m4, [pal_unpack+8]
    jg .w64
.w32_loop:
    vpermd          m1, m2, [idxq]
    add             idxq, 64
    vpmultishiftqb  m0, m3, m1
    vpmultishiftqb  m1, m4, m1
    pshufb          m0, m5, m0
    pshufb          m1, m5, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w32_loop
    RET
.w64:
    vpermd          m1, m2, [idxq]
    add             idxq, 64
    vpmultishiftqb  m0, m3, m1
    vpmultishiftqb  m1, m4, m1
    pshufb          m0, m5, m0
    pshufb          m1, m5, m1
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w64
    RET

%if WIN64
    DECLARE_REG_TMP 4
%else
    DECLARE_REG_TMP 8
%endif

cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
%define base r7-z_filter_t0
    lea             r7, [z_filter_t0]
    tzcnt           wd, wm
    movifnidn       angled, anglem
    lea             t0, [dr_intra_derivative]
    movsxd          wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
    inc             tlq
    mov             dxd, angled
    and             dxd, 0x7e
    add             angled, 165 ; ~90
    movzx           dxd, word [t0+dxq]
    lea             wq, [base+ipred_z1_8bpc_avx512icl_table+wq]
    movifnidn       hd, hm
    xor             angled, 0x4ff ; d = 90 - angle
    mova            m14, [base+z_frac_table]
    vpbroadcastd    m15, [base+pw_512]
    jmp             wq
.w4:
    mova            m9, [pb_0to63]
    pminud          m8, m9, [base+pb_7] {1to16}
    vpbroadcastq    m7, [tlq]
    pshufb          m7, m8
    cmp             angleb, 40
    jae .w4_no_upsample
    lea             r3d, [angleq-1024]
    sar             r3d, 7
    add             r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    pshufb          xmm0, xm7, [base+z_filter_s4]
    mova            xmm1, [tlq-1]
    pshufb          xmm1, [base+z_xpos_off2a]
    vpbroadcastd    xmm2, [base+pb_m4_36]
    vpbroadcastq    m4, [pb_0to63]
    pmaddubsw       xmm0, xmm2
    pmaddubsw       xmm1, xmm2
    add             dxd, dxd
    kxnorw          k1, k1, k1
    paddw           xmm0, xmm1
    pmulhrsw        xm0, xmm0, xm15
    packuswb        xm0, xm0
    punpcklbw   ym7{k1}, ym0
    jmp .w4_main2
.w4_no_upsample:
    test            angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea             r3d, [hq+3]
    vpbroadcastb    xm0, r3d
    vpbroadcastb    xm1, angled
    shr             angled, 8 ; is_sm << 1
    vpcmpeqb        k1, xm0, [base+z_filter_wh]
    vpcmpgtb    k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw           r5d, k1
    test            r5d, r5d
    jz .w4_main
    vbroadcasti32x4 ym0, [tlq-1]
    pshufb          ym0, [base+z_filter4_s1]
    popcnt          r5d, r5d ; filter_strength
    pshufb          ym1, ym7, [z_filter_s4]
    pshufb          ym7, [base+z_filter_s3]
    vpbroadcastd    ym11, [base+z_filter_k+(r5-1)*4+12*0]
    vpbroadcastd    ym12, [base+z_filter_k+(r5-1)*4+12*1]
    pmaddubsw       ym0, ym11
    pmaddubsw       ym1, ym11
    pmaddubsw       ym7, ym12
    paddw           ym0, ym1
    paddw           ym7, ym0
    pmulhrsw        ym7, ym15
    cmp             hd, 4
    je .w4_filter_end
    vpbroadcastd    m8, [base+pb_9]
    pminub          m8, m9
.w4_filter_end:
    paddb           m8, m8
    vpermb          m7, m8, m7
.w4_main:
    vpbroadcastq    m4, [base+z_xpos_off1a]
.w4_main2:
    movsldup        m2, [base+z_xpos_mul]
    vpbroadcastw    m5, dxd
    vbroadcasti32x4 m3, [base+z_xpos_bc]
    lea             r2, [strideq*3]
    pmullw          m2, m5 ; xpos
    psllw           m5, 5  ; dx*8
.w4_loop:
    psrlw           m1, m2, 3
    pshufb          m0, m2, m3
    vpermw          m1, m1, m14 ; 64-frac, frac
    paddsb          m0, m4      ; base, base+1
    vpermb          m0, m0, m7  ; top[base], top[base+1]
    paddsw          m2, m5      ; xpos += dx
    pmaddubsw       m0, m1      ; v
    pmulhrsw        m0, m15
    packuswb        m0, m0
    vextracti32x4   xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+r2       ], xm1, 1
    sub             hd, 8
    jl .w4_end
    vextracti32x4   xm1, m0, 2 ; top[max_base_x]
    lea             dstq, [dstq+strideq*4]
    vextracti32x4   xm0, m0, 3
    movd   [dstq+strideq*0], xm1
    pextrd [dstq+strideq*1], xm1, 1
    movd   [dstq+strideq*2], xm0
    pextrd [dstq+r2       ], xm0, 1
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.w8_filter:
    mova            ym0, [base+z_filter_s1]
    popcnt          r5d, r5d
    vbroadcasti32x4 ym1, [base+z_filter_s2]
    vbroadcasti32x4 ym3, [base+z_filter_s3]
    vbroadcasti32x4 ym4, [base+z_filter_s4]
    vpermi2b        ym0, ym7, ym2 ; al bl
    mova            ym5, [base+z_filter_s5]
    pshufb          ym1, ym7, ym1 ; ah bh
    vpbroadcastd    ym11, [base+z_filter_k+(r5-1)*4+12*0]
    pshufb          ym3, ym7, ym3 ; cl ch
    vpbroadcastd    ym12, [base+z_filter_k+(r5-1)*4+12*1]
    pshufb          ym4, ym7, ym4 ; el dl
    vpbroadcastd    ym13, [base+z_filter_k+(r5-1)*4+12*2]
    vpermb          ym5, ym5, ym7 ; eh dh
    pmaddubsw       ym0, ym11
    pmaddubsw       ym1, ym11
    pmaddubsw       ym2, ym3, ym12
    pmaddubsw       ym3, ym13
    pmaddubsw       ym4, ym11
    pmaddubsw       ym5, ym11
    paddw           ym0, ym2
    paddw           ym1, ym3
    paddw           ym0, ym4
    paddw           ym1, ym5
    pmulhrsw        ym0, ym15
    pmulhrsw        ym1, ym15
    packuswb        ym0, ym1
    ret
.w8:
    lea             r3d, [angleq+216]
    mov             r3b, hb
    cmp             r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea             r3d, [hq-1]
    mova            xm1, [base+z_filter_s4]
    vpbroadcastb    xm2, r3d
    mova            xm7, [tlq-1]
    vinserti32x4    ym7, [tlq+7], 1
    vbroadcasti32x4 ym0, [base+z_xpos_off1a]
    vpbroadcastd    ym3, [base+pb_m4_36]
    pminub          xm2, xm1
    pshufb          ym0, ym7, ym0
    vinserti32x4    ym1, xm2, 1
    psrldq          ym7, 1
    pshufb          ym1, ym7, ym1
    pmaddubsw       ym0, ym3
    pmaddubsw       ym1, ym3
    vbroadcasti32x4 m8, [pb_0to63]
    add             dxd, dxd
    paddw           ym0, ym1
    pmulhrsw        ym0, ym15
    packuswb        ym0, ym0
    punpcklbw       ym7, ym0
    jmp .w8_main2
.w8_no_upsample:
    lea             r3d, [hq+7]
    mova            m9, [pb_0to63]
    vpbroadcastb    ym0, r3d
    and             r3d, 7
    vbroadcasti32x4 m7, [tlq]
    or              r3d, 8 ; imin(h+7, 15)
    vpbroadcastb    m8, r3d
    pminub          m8, m9
    pshufb          m7, m8
    test            angled, 0x400
    jnz .w8_main
    vpbroadcastb    ym1, angled
    shr             angled, 8
    vpcmpeqb        k1, ym0, [base+z_filter_wh]
    mova            xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb    k1{k1}, ym1, ym0
    kmovd           r5d, k1
    test            r5d, r5d
    jz .w8_main
    vpbroadcastd    ym2, [tlq-4]
    call .w8_filter
    cmp             hd, 8
    jle .w8_filter_end
    vpbroadcastd    m8, [base+pb_17]
    add             r3d, 2
    pminub          m8, m9
.w8_filter_end:
    vpermb          m7, m8, m0
.w8_main:
    vbroadcasti32x4 m8, [base+z_xpos_off1a]
.w8_main2:
    movsldup        m4, [base+z_xpos_mul]
    vpbroadcastw    m9, dxd
    shl             r3d, 6
    vpbroadcastd    m5, [base+z_xpos_bc+8*0]
    pmullw          m4, m9 ; xpos
    vpbroadcastd    m6, [base+z_xpos_bc+8*1]
    sub             r3d, dxd
    shl             dxd, 3
    psllw           m9, 5 ; dx*8
    lea             r2, [strideq*3]
.w8_loop:
    psrlw           m3, m4, 3
    pshufb          m0, m4, m5
    pshufb          m1, m4, m6
    vpermw          m3, m3, m14
    paddsb          m0, m8
    paddsb          m1, m8
    vpermb          m0, m0, m7
    vpermb          m1, m1, m7
    paddsw          m4, m9
    punpcklqdq      m2, m3, m3
    pmaddubsw       m0, m2
    punpckhqdq      m3, m3
    pmaddubsw       m1, m3
    pmulhrsw        m0, m15
    pmulhrsw        m1, m15
    packuswb        m0, m1
    vextracti32x4   xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    sub             hd, 8
    jl .w8_end
    vextracti32x8   ym0, m0, 1
    lea             dstq, [dstq+strideq*4]
    vextracti32x4   xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    jz .w8_end
    lea             dstq, [dstq+strideq*4]
    sub             r3d, dxd
    jg .w8_loop
    vextracti32x4   xm7, m7, 3
.w8_end_loop:
    movq   [dstq+strideq*0], xm7
    movq   [dstq+strideq*1], xm7
    movq   [dstq+strideq*2], xm7
    movq   [dstq+r2       ], xm7
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8_end_loop
.w8_end:
    RET
.w16_filter:
    mova            m0, [base+z_filter_s1]
    popcnt          r5d, r5d
    vbroadcasti32x4 m1, [base+z_filter_s2]
    vbroadcasti32x4 m3, [base+z_filter_s3]
    vbroadcasti32x4 m4, [base+z_filter_s4]
    vpermi2b        m0, m7, m2 ; al bl
    mova            m5, [base+z_filter_s5]
    pshufb          m1, m7, m1 ; ah bh
    vpbroadcastd    m11, [base+z_filter_k+(r5-1)*4+12*0]
    pshufb          m3, m7, m3 ; cl ch
    vpbroadcastd    m12, [base+z_filter_k+(r5-1)*4+12*1]
    pshufb          m4, m7, m4 ; el dl
    vpbroadcastd    m13, [base+z_filter_k+(r5-1)*4+12*2]
    vpermb          m5, m5, m7 ; eh dh
    pmaddubsw       m0, m11
    pmaddubsw       m1, m11
    pmaddubsw       m2, m3, m12
    pmaddubsw       m3, m13
    pmaddubsw       m4, m11
    pmaddubsw       m5, m11
    paddw           m0, m2
    paddw           m1, m3
    paddw           m0, m4
    paddw           m1, m5
    pmulhrsw        m0, m15
    pmulhrsw        m1, m15
    packuswb        m0, m1
    ret
.w16:
    lea             r3d, [hq+15]
    mova            m9, [pb_0to63]
    vpbroadcastb    ym0, r3d
    and             r3d, 15
    movu            ym7, [tlq]
    or              r3d, 16 ; imin(h+15, 31)
    vpbroadcastb    m8, r3d
    pminub          m8, m9
    vpermb          m7, m8, m7
    test            angled, 0x400
    jnz .w16_main
    vpbroadcastb    ym1, angled
    shr             angled, 8
    vpcmpeqb        k1, ym0, [base+z_filter_wh]
    mova            xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb    k1{k1}, ym1, ym0
    kmovd           r5d, k1
    test            r5d, r5d
    jz .w16_main
    vpbroadcastd    m2, [tlq-4]
    call .w16_filter
    cmp             hd, 16
    jle .w16_filter_end
    vpbroadcastd    m8, [base+pb_33]
    add             r3d, 2
    pminub          m8, m9
.w16_filter_end:
    vpermb          m7, m8, m0
.w16_main:
    movshdup        m3, [base+z_xpos_mul]
    vpbroadcastw    m8, dxd
    shl             r3d, 6
    vpbroadcastd    m4, [base+z_xpos_bc]
    pmullw          m3, m8 ; xpos
    vbroadcasti32x4 m5, [base+z_xpos_off1a]
    sub             r3d, dxd
    shl             dxd, 2
    vbroadcasti32x4 m6, [base+z_xpos_off1b]
    psllw           m8, 4 ; dx*4
    lea             r2, [strideq*3]
.w16_loop:
    pshufb          m1, m3, m4
    psrlw           m2, m3, 3
    paddsb          m0, m1, m5
    vpermw          m2, m2, m14
    paddsb          m1, m6
    vpermb          m0, m0, m7
    vpermb          m1, m1, m7
    paddsw          m3, m8
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    pmulhrsw        m0, m15
    pmulhrsw        m1, m15
    packuswb        m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    sub             hd, 4
    jz .w16_end
    lea             dstq, [dstq+strideq*4]
    sub             r3d, dxd
    jg .w16_loop
    vextracti32x4   xm7, m7, 3
.w16_end_loop:
    mova   [dstq+strideq*0], xm7
    mova   [dstq+strideq*1], xm7
    mova   [dstq+strideq*2], xm7
    mova   [dstq+r2       ], xm7
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w16_end_loop
.w16_end:
    RET
.w32_filter:
    mova            m0, [base+z_filter_s1]
    vbroadcasti32x4 m1, [base+z_filter_s2]
    vbroadcasti32x4 m3, [base+z_filter_s3]
    vbroadcasti32x4 m4, [base+z_filter_s4]
    vpermi2b        m0, m7, m2 ; al bl
    mova            m5, [base+z_filter_s5]
    pshufb          m1, m7, m1 ; ah bh
    vpbroadcastd    m11, [base+z_filter_k+4*2+12*0]
    pshufb          m3, m7, m3 ; cl ch
    vpbroadcastd    m12, [base+z_filter_k+4*2+12*1]
    pshufb          m4, m7, m4 ; el dl
    vpbroadcastd    m13, [base+z_filter_k+4*2+12*2]
    vpermi2b        m5, m7, m8 ; eh dh
    pmaddubsw       m0, m11
    pmaddubsw       m1, m11
    pmaddubsw       m2, m3, m12
    pmaddubsw       m3, m13
    pmaddubsw       m4, m11
    pmaddubsw       m5, m11
    paddw           m0, m2
    paddw           m1, m3
    paddw           m0, m4
    paddw           m1, m5
    pmulhrsw        m0, m15
    pmulhrsw        m1, m15
    packuswb        m7, m0, m1
    ret
.w32:
    lea             r3d, [hq+31]
    vpbroadcastb    m9, r3d
    and             r3d, 31
    pminub          m10, m9, [pb_0to63]
    or              r3d, 32 ; imin(h+31, 63)
    vpermb          m7, m10, [tlq]
    vpbroadcastb    m8, [tlq+r3]
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz .w32_main
    vpbroadcastd    m2, [tlq-4]
    call .w32_filter
    cmp             hd, 64
    je .w32_h64_filter_end
    vpermb          m8, m9, m7
    vpermb          m7, m10, m7
    jmp .w32_main
.w32_h64_filter_end: ; edge case for 32x64
    movd            xmm0, [tlq+r3-1]
    movd            xmm1, [base+pb_8_56_0_0]
    add             r3d, 2
    pmaddubsw       xmm0, xmm1
    vptestmw        k1, xmm1, xmm1 ; 0x01
    pmulhrsw        xm0, xmm0, xm15
    vmovdqu8    m8{k1}, m0
.w32_main:
    rorx            r2d, dxd, 30
    vpbroadcastd    m4, [base+z_xpos_bc]
    vpbroadcastw    m3, r2d
    vbroadcasti32x8 m5, [base+z_xpos_off2a]
    shl             r3d, 6
    vbroadcasti32x8 m6, [base+z_xpos_off2b]
    sub             r3d, dxd
    paddw           m9, m3, m3
    add             dxd, dxd
    vinserti32x8    m3, ym9, 1
.w32_loop:
    pshufb          m1, m3, m4
    psrlw           m2, m3, 3
    paddsb          m0, m1, m5
    vpermw          m2, m2, m14
    paddsb          m1, m6
    vpermi2b        m0, m7, m8
    vpermi2b        m1, m7, m8
    paddsw          m3, m9
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    pmulhrsw        m0, m15
    pmulhrsw        m1, m15
    packuswb        m0, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    sub             hd, 2
    jz .w32_end
    lea             dstq, [dstq+strideq*2]
    sub             r3d, dxd
    jg .w32_loop
    punpckhqdq      ym8, ym8
.w32_end_loop:
    mova   [dstq+strideq*0], ym8
    mova   [dstq+strideq*1], ym8
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32_end_loop
.w32_end:
    RET
.w64_filter:
    vbroadcasti32x4 m3, [base+z_filter_s2]
    mova            m1, [base+z_filter_s1]
    pshufb          m0, m3 ; al bl
    vpermi2b        m1, m7, m2
    vbroadcasti32x4 m4, [base+z_filter_s4]
    pshufb          m6, m8, m4 ; el dl
    pshufb          m9, m7, m4
    pminub          m10, m13, [base+z_filter_s5]
    pshufb          m2, m8, m3 ; ah bh
    pshufb          m3, m7, m3
    vbroadcasti32x4 m5, [base+z_filter_s3]
    vpermb          m10, m10, m8 ; eh dh
    pshufb          m11, m4
    vpbroadcastd    m4, [base+z_filter_k+4*2+12*0]
    pshufb          m8, m5 ; cl ch
    pshufb          m7, m5
    vpbroadcastd    m5, [base+z_filter_k+4*2+12*1]
    REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11
    pmaddubsw       m4, m8, m5
    pmaddubsw       m5, m7, m5
    paddw           m0, m6
    vpbroadcastd    m6, [base+z_filter_k+4*2+12*2]
    paddw           m1, m9
    pmaddubsw       m7, m6
    pmaddubsw       m8, m6
    paddw           m2, m10
    paddw           m3, m11
    paddw           m0, m4
    paddw           m1, m5
    paddw           m2, m8
    paddw           m3, m7
    REPX {pmulhrsw x, m15}, m0, m2, m1, m3
    packuswb        m0, m2
    packuswb        m7, m1, m3
    vpermb          m8, m12, m0
    ret
.w64:
    lea             r3d, [hq-1]
    movu            m7, [tlq+64*0]
    vpbroadcastb    m13, r3d
    pminub          m12, m13, [pb_0to63]
    or              r3d, 64
    vpermb          m8, m12, [tlq+64*1]
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz .w64_main
    movu            m0, [tlq+56]
    vpbroadcastd    m2, [tlq-4]
    movu            m11, [tlq+8]
    call .w64_filter
.w64_main:
    rorx            r2d, dxd, 30
    vpbroadcastd    m4, [base+z_xpos_bc]
    vpbroadcastw    m3, r2d
    mova            m5, [base+z_xpos_off2a]
    shl             r3d, 6
    mova            m6, [base+z_xpos_off2b]
    sub             r3d, dxd
    mova            m9, m3
.w64_loop:
    pshufb          m1, m3, m4
    psrlw           m2, m3, 3
    paddsb          m0, m1, m5
    vpermw          m2, m2, m14
    paddsb          m1, m6
    vpermi2b        m0, m7, m8
    vpermi2b        m1, m7, m8
    paddsw          m3, m9
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    pmulhrsw        m0, m15
    pmulhrsw        m1, m15
    packuswb        m0, m1
    mova            [dstq], m0
    dec             hd
    jz .w64_end
    add             dstq, strideq
    sub             r3d, dxd
    jg .w64_loop
    vpermb          m8, m13, m8
.w64_end_loop:
    mova            [dstq], m8
    add             dstq, strideq
    dec             hd
    jg .w64_end_loop
.w64_end:
    RET

cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy
    tzcnt           wd, wm
    movifnidn       angled, anglem
    lea             dxq, [dr_intra_derivative-90]
    movzx           dyd, angleb
    xor             angled, 0x400
    mov             r7, dxq
    sub             dxq, dyq
    movifnidn       hd, hm
    and             dyd, ~1
    and             dxq, ~1
    movzx           dyd, word [r7+dyq] ; angle - 90
    lea             r7, [z_filter_t0]
    movzx           dxd, word [dxq+270] ; 180 - angle
    movsxd          wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4]
    mova            m8, [base+pb_63to0]
    neg             dyd
    vpermb          m8, m8, [tlq-64] ; left
    lea             wq, [base+ipred_z2_8bpc_avx512icl_table+wq]
    mova            m14, [base+z_frac_table]
    inc             tlq
    vpbroadcastd    m15, [base+pw_512]
    neg             dxd
    jmp             wq
.w4:
    movd            xm7, [tlq]
    vpbroadcastq    m10, [base+z_xpos_off2a]
    test            angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea             r3d, [hq+2]
    add             angled, 1022
    shl             r3d, 6
    test            r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    vpbroadcastd    xm2, [base+pb_4]
    sub             angled, 1075 ; angle - 53
    call .upsample_above
    lea             r3d, [hq+3]
    vpbroadcastq    m10, [pb_0to63+1]
    punpcklbw       xm7, xm0, xm7
    call .filter_strength
    jmp .w4_filter_left
.w4_upsample_left:
    call .upsample_left
    movsldup        m16, [base+z_ypos_off3]
    vpbroadcastd    m9, [base+pb_16]
    punpcklbw       xm8, xm0, xm8
    jmp .w4_main2
.w4_no_upsample_above:
    lea             r3d, [hq+3]
    sub             angled, 1112 ; angle - 90
    call .filter_strength
    test            r3d, r3d
    jz .w4_no_filter_above
    vpbroadcastd    xm5, [base+pb_3]
    call .filter_top_w16
.w4_no_filter_above:
    lea             r3d, [hq+2]
    add             angled, 973 ; angle + 883
    shl             r3d, 6
    test            r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    vpbroadcastd    ym0, [base+pb_90]
    psubb           ym0, ym17
    vpcmpgtb    k2{k2}, ym0, ym16
    kmovd           r3d, k2
.w4_filter_left:
    test            r3d, r3d
    jz .w4_main
    popcnt          r3d, r3d
    call .filter_left_h16
.w4_main:
    movsldup        m16, [base+z_ypos_off1]
    vpbroadcastd    m9, [base+pb_8]
.w4_main2:
    vpbroadcastq    m3, [base+z_ypos_mul1a]
    vpbroadcastw    m0, dyd
    movsldup        m1, [base+z_xpos_mul]
    vpbroadcastw    m5, dxd
    vinserti32x4    m7, [tlq-16], 3
    vinserti32x4    m8, [tlq-16], 3
    pmullw          m3, m0
    vbroadcasti32x4 m2, [base+z_xpos_bc]
    pmullw          m1, m5 ; xpos0..3
    psllw           m5, 5  ; dx*8
    psraw           m4, m3, 6
    psrlw           m3, 1
    packsswb        m4, m4
    vpermw          m3, m3, m14 ; 64-frac, frac
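    ; the directional loops below blend adjacent source pixels as
    ; (a*(64-frac) + b*frac + 32) >> 6: pmaddubsw forms the weighted sum of
    ; each (base, base+1) pixel pair and pmulhrsw with pw_512 performs the
    ; rounding right-shift by 6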
    punpcklbw       m4, m4
    lea             r2, [strideq*3]
    paddb           m4, m16 ; base, base+1
.w4_loop:
    pshufb          m16, m1, m2
    psrlw           m0, m1, 3
    paddb           m16, m10
    vpermw          m0, m0, m14
    vpmovw2m        k1, m16 ; base_x < 0
    vpermb          m16, m16, m7
    pmaddubsw       m16, m0
    vpermb          m0, m4, m8
    pmaddubsw m16{k1}, m0, m3
    pmulhrsw        m16, m15
    vpmovwb         ym16, m16
    movd   [dstq+strideq*0], xm16
    pextrd [dstq+strideq*1], xm16, 1
    pextrd [dstq+strideq*2], xm16, 2
    pextrd [dstq+r2       ], xm16, 3
    sub             hd, 8
    jl .w4_end
    paddsw          m1, m5
    vextracti128    xm16, ym16, 1
    lea             dstq, [dstq+strideq*4]
    paddb           m4, m9
    movd   [dstq+strideq*0], xm16
    pextrd [dstq+strideq*1], xm16, 1
    pextrd [dstq+strideq*2], xm16, 2
    pextrd [dstq+r2       ], xm16, 3
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.upsample_above: ; w4/w8
    mova            xm0, [tlq-1]
    xor             angled, 0x7f ; 180 - angle
    add             dxd, dxd
    jmp .upsample
.upsample_left: ; h4/h8
    palignr         xm0, xm8, [tlq-16], 15
    vpbroadcastb    xm2, hd
    add             dyd, dyd
.upsample:
    pshufb          xm1, xm0, [base+z_filter4_s1]
    pminub          xm2, [base+z_filter_s4]
    vpbroadcastd    xm3, [base+pb_m4_36]
    pshufb          xm0, xm2
    pmaddubsw       xm1, xm3
    pmaddubsw       xm0, xm3
    paddw           xm0, xm1
    pmulhrsw        xm0, xm15
    packuswb        xm0, xm0
    ret
.filter_strength:
    vpbroadcastb    ym16, r3d
    mov             r3d, angled
    vpbroadcastd    m2, [tlq-4]
    vpbroadcastb    ym17, angled
    shr             r3d, 8
    vpcmpeqb        k2, ym16, [base+z_filter_wh]
    mova            xm16, [base+z_filter_t0+r3*8]
    vpcmpgtb    k1{k2}, ym17, ym16
    mova            m9, [pb_0to63]
    kmovd           r3d, k1
    ret
.w8:
    movq            xm7, [tlq]
    vbroadcasti32x4 m10, [base+z_xpos_off2a]
    test            angled, 0x400
    jnz .w8_main
    lea             r3d, [angleq+126]
    mov             r3b, hb
    cmp             r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    vpbroadcastd    xm2, [base+pb_8]
    sub             angled, 53 ; angle - 53
    call .upsample_above
    lea             r3d, [hq+7]
    vbroadcasti32x4 m10, [pb_0to63+1]
    punpcklbw       xm7, xm0, xm7
    call .filter_strength
    jmp .w8_filter_left
.w8_upsample_left:
    call .upsample_left
    movshdup        m16, [base+z_ypos_off3]
    vpbroadcastd    m9, [base+pb_8]
    punpcklbw       xm8, xm0, xm8
    jmp .w8_main2
.w8_no_upsample_above:
    lea             r3d, [hq+7]
    sub             angled, 90 ; angle - 90
    call .filter_strength
    test            r3d, r3d
    jz .w8_no_filter_above
    vpbroadcastd    xm5, [base+pb_7]
    call .filter_top_w16
.w8_no_filter_above:
    lea             r3d, [angleq-51]
    mov             r3b, hb
    cmp             r3d, 8
    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
    vpbroadcastd    ym0, [base+pb_90]
    psubb           ym0, ym17
    vpcmpgtb    k2{k2}, ym0, ym16
    kmovd           r3d, k2
.w8_filter_left:
    test            r3d, r3d
    jz .w8_main
    cmp             hd, 32
    je .w8_filter_left_h32
    popcnt          r3d, r3d
    call .filter_left_h16
    jmp .w8_main
.w8_filter_left_h32:
    call .filter_left_h64
.w8_main:
    movshdup        m16, [base+z_ypos_off2]
    vpbroadcastd    m9, [base+pb_4]
.w8_main2:
    vbroadcasti32x4 m3, [base+z_ypos_mul1a]
    vpbroadcastw    m0, dyd
    movshdup        m1, [base+z_xpos_mul]
    vpbroadcastw    m5, dxd
    vinserti32x4    m7, [tlq-16], 3
    vinserti32x4    m8, [tlq-16], 3
    pmullw          m3, m0
    vpbroadcastd    m2, [base+pb_1]
    pmullw          m1, m5 ; xpos0..3
    psllw           m5, 4  ; dx*4
    psraw           m4, m3, 6
    psrlw           m3, 1
    packsswb        m4, m4
    vpermw          m3, m3, m14 ; 64-frac, frac
    lea             r3d, [dxq+(8<<6)]
    paddsb          m4, m16
    shl             dxd, 2
    paddsb          m0, m4, m2
    lea             r2, [strideq*3]
    punpcklbw       m4, m0 ; base, base+1
.w8_loop:
    pshufb          m16, m1, m2
    psrlw           m0, m1, 3
    paddb           m16, m10
    vpermw          m0, m0, m14
    vpmovw2m        k1, m16 ; base_x < 0
    vpermb          m16, m16, m7
    pmaddubsw       m16, m0
    vpermb          m0, m4, m8
    pmaddubsw m16{k1}, m0, m3
    pmulhrsw        m16, m15
    vpmovwb         ym16, m16
    vextracti128    xm17, ym16, 1
    movq   [dstq+strideq*0], xm16
    movhps [dstq+strideq*1], xm16
    movq   [dstq+strideq*2], xm17
    movhps [dstq+r2       ], xm17
    sub             hd, 4
    jz .w8_end
    paddw           m1, m5
    lea             dstq, [dstq+strideq*4]
    paddb           m4, m9
    add             r3d, dxd
    jge .w8_loop
.w8_leftonly_loop:
    vpermb          m16, m4, m8
    pmaddubsw       m16, m3
    paddb           m4, m9
    pmulhrsw        m16, m15
    vpmovwb         ym16, m16
vextracti128 xm17, ym16, 1 movq [dstq+strideq*0], xm16 movhps [dstq+strideq*1], xm16 movq [dstq+strideq*2], xm17 movhps [dstq+r2 ], xm17 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_leftonly_loop .w8_end: RET .filter_top_w16: mova xm0, [base+z_filter_s1] popcnt r3d, r3d pminub xm4, xm5, [base+z_filter_s4] vpermi2b xm0, xm7, xm2 pminub xm5, [base+z_filter_s5] pshufb xm1, xm7, [base+z_filter_s2] vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] pshufb xm3, xm7, [base+z_filter_s3] vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] pshufb xm4, xm7, xm4 vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] pshufb xm5, xm7, xm5 pmaddubsw xm0, xm11 pmaddubsw xm1, xm11 pmaddubsw xm6, xm3, xm12 vpbroadcastd xm12, r7m ; max_width pmaddubsw xm3, xm13 pmaddubsw xm4, xm11 pmaddubsw xm5, xm11 packssdw xm12, xm12 paddw xm0, xm6 paddw xm1, xm3 paddw xm0, xm4 paddw xm1, xm5 packsswb xm12, xm12 pmulhrsw xm0, xm15 pmulhrsw xm1, xm15 vpcmpgtb k1, xm12, xm9 ; x < max_width packuswb xm7{k1}, xm0, xm1 ret .filter_left_h16: lea r5d, [hq-1] mova xm0, [base+z_filter_s1] vpbroadcastb xm5, r5d vpermi2b xm0, xm8, xm2 pminub xm4, xm5, [base+z_filter_s4] pshufb xm1, xm8, [base+z_filter_s2] pminub xm5, [base+z_filter_s5] pshufb xm3, xm8, [base+z_filter_s3] vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] pshufb xm4, xm8, xm4 vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] pshufb xm5, xm8, xm5 vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] pmaddubsw xm0, xm11 pmaddubsw xm1, xm11 pmaddubsw xm6, xm3, xm12 vpbroadcastd xm12, r8m ; max_height pmaddubsw xm3, xm13 pmaddubsw xm4, xm11 pmaddubsw xm5, xm11 packssdw xm12, xm12 paddw xm0, xm6 paddw xm1, xm3 paddw xm0, xm4 paddw xm1, xm5 packsswb xm12, xm12 pmulhrsw xm0, xm15 pmulhrsw xm1, xm15 vpcmpgtb k1, xm12, xm9 ; y < max_height packuswb xm8{k1}, xm0, xm1 ret .w16: movu xm7, [tlq] ; top test angled, 0x400 jnz .w16_main lea r3d, [hq+15] sub angled, 90 call .filter_strength test r3d, r3d jz .w16_no_filter_above vpbroadcastd xm5, [base+pb_15] call .filter_top_w16 .w16_no_filter_above: cmp hd, 16 jg .w16_filter_left_h64 vpbroadcastd ym0, [base+pb_90] psubb ym0, ym17 vpcmpgtb k2{k2}, ym0, ym16 kmovd r3d, k2 test r3d, r3d jz .w16_main popcnt r3d, r3d call .filter_left_h16 jmp .w16_main .w16_filter_left_h64: call .filter_left_h64 .w16_main: vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 
8 vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15 vpbroadcastw m0, dyd vinserti32x4 m7, [tlq-16], 3 vpbroadcastd m2, [base+pb_1] vpbroadcastw m12, dxd movshdup m1, [base+z_xpos_mul] pmullw m6, m0 vbroadcasti32x4 m3, [base+z_xpos_off2a] pmullw m5, m0 vbroadcasti32x4 m4, [base+z_xpos_off2b] pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3 vpbroadcastd m9, [base+pb_4] psllw m12, 4 ; dx*4 movshdup m16, [base+z_ypos_off2] psrlw m10, m6, 1 psrlw m11, m5, 1 vpermw m10, m10, m14 ; 64-frac, frac psraw m6, 6 vpermw m11, m11, m14 psraw m5, 6 mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft packsswb m6, m5 mov r3d, 1<<6 paddsb m6, m16 sub r5d, dxd ; left-only threshold paddsb m0, m6, m2 shl dxd, 2 punpcklbw m5, m6, m0 ; base, base+1 lea r2, [strideq*3] punpckhbw m6, m0 .w16_loop: pshufb m17, m1, m2 psrlw m0, m1, 3 paddb m16, m3, m17 vpermw m0, m0, m14 paddb m17, m4 vpmovw2m k1, m16 vpermb m16, m16, m7 vpmovw2m k2, m17 vpermb m17, m17, m7 pmaddubsw m16, m0 pmaddubsw m17, m0 add r3d, dxd jge .w16_toponly mova m0, m8 vpermt2b m0, m5, m7 pmaddubsw m16{k1}, m0, m10 mova m0, m8 vpermt2b m0, m6, m7 pmaddubsw m17{k2}, m0, m11 .w16_toponly: pmulhrsw m16, m15 pmulhrsw m17, m15 packuswb m16, m17 mova [dstq+strideq*0], xm16 vextracti128 [dstq+strideq*1], ym16, 1 vextracti32x4 [dstq+strideq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 sub hd, 4 jz .w16_end paddw m1, m12 lea dstq, [dstq+strideq*4] paddb m5, m9 paddb m6, m9 cmp r3d, r5d jge .w16_loop .w16_leftonly_loop: vpermb m16, m5, m8 vpermb m17, m6, m8 pmaddubsw m16, m10 pmaddubsw m17, m11 paddb m5, m9 paddb m6, m9 pmulhrsw m16, m15 pmulhrsw m17, m15 packuswb m16, m17 mova [dstq+strideq*0], xm16 vextracti128 [dstq+strideq*1], ym16, 1 vextracti32x4 [dstq+strideq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_leftonly_loop .w16_end: RET .w32: movu ym7, [tlq] test angled, 0x400 jnz .w32_main vpbroadcastd m2, [tlq-4] mova ym0, [base+z_filter_s1] vbroadcasti32x4 ym1, [base+z_filter_s2] vbroadcasti32x4 ym3, [base+z_filter_s3] vbroadcasti32x4 ym4, [base+z_filter_s4] vpermi2b ym0, ym7, ym2 ; al bl vpbroadcastd ym5, [base+pb_31] pminub ym5, [base+z_filter_s5] pshufb ym1, ym7, ym1 ; ah bh vpbroadcastd ym11, [base+z_filter_k+4*2+12*0] pshufb ym3, ym7, ym3 ; cl ch vpbroadcastd ym12, [base+z_filter_k+4*2+12*1] pshufb ym4, ym7, ym4 ; el dl vpbroadcastd ym13, [base+z_filter_k+4*2+12*2] vpermb ym5, ym5, ym7 ; eh dh pmaddubsw ym0, ym11 pmaddubsw ym1, ym11 pmaddubsw ym6, ym3, ym12 vpbroadcastd ym12, r6m pmaddubsw ym3, ym13 pmaddubsw ym4, ym11 pmaddubsw ym5, ym11 mova m9, [pb_0to63] packssdw ym12, ym12 paddw ym0, ym6 paddw ym1, ym3 paddw ym0, ym4 paddw ym1, ym5 packsswb ym12, ym12 pmulhrsw ym0, ym15 pmulhrsw ym1, ym15 vpcmpgtb k1, ym12, ym9 ; x < max_width packuswb ym7{k1}, ym0, ym1 cmp hd, 16 jg .w32_filter_h64 mov r3d, 3 call .filter_left_h16 jmp .w32_main .w32_filter_h64: call .filter_left_h64 .w32_main: vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 
8 vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15 vpbroadcastw m0, dyd vinserti32x4 m7, [tlq-16], 3 rorx r2q, dxq, 62 ; dx << 2 vpbroadcastd m2, [base+pb_1] vpbroadcastw m1, r2d pmullw m6, m0 vbroadcasti32x8 m3, [base+z_xpos_off2a] pmullw m5, m0 vbroadcasti32x8 m4, [base+z_xpos_off2b] mova ym0, ym1 paddw m12, m1, m1 vpbroadcastd m9, [base+pb_2] paddw m1, m0 ; xpos1 xpos0 mova ym0, ym2 psrlw m10, m6, 1 psrlw m11, m5, 1 vpermw m10, m10, m14 ; 64-frac, frac psraw m6, 6 vpermw m11, m11, m14 psraw m5, 6 mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft packsswb m6, m5 mov r3d, 1<<6 paddsb m6, m0 sub r5d, dxd ; left-only threshold paddsb m0, m6, m2 add dxd, dxd punpcklbw m5, m6, m0 ; base, base+1 punpckhbw m6, m0 .w32_loop: pshufb m17, m1, m2 psrlw m0, m1, 3 paddb m16, m3, m17 vpermw m0, m0, m14 paddb m17, m4 vpmovw2m k1, m16 vpermb m16, m16, m7 vpmovw2m k2, m17 vpermb m17, m17, m7 pmaddubsw m16, m0 pmaddubsw m17, m0 add r3d, dxd jge .w32_toponly mova m0, m8 vpermt2b m0, m5, m7 pmaddubsw m16{k1}, m0, m10 mova m0, m8 vpermt2b m0, m6, m7 pmaddubsw m17{k2}, m0, m11 .w32_toponly: pmulhrsw m16, m15 pmulhrsw m17, m15 packuswb m16, m17 vextracti32x8 [dstq+strideq*0], m16, 1 mova [dstq+strideq*1], ym16 sub hd, 2 jz .w32_end paddw m1, m12 lea dstq, [dstq+strideq*2] paddb m5, m9 paddb m6, m9 cmp r3d, r5d jge .w32_loop .w32_leftonly_loop: vpermb m16, m5, m8 vpermb m17, m6, m8 pmaddubsw m16, m10 pmaddubsw m17, m11 paddb m5, m9 paddb m6, m9 pmulhrsw m16, m15 pmulhrsw m17, m15 packuswb m16, m17 vextracti32x8 [dstq+strideq*0], m16, 1 mova [dstq+strideq*1], ym16 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_leftonly_loop .w32_end: RET .filter_left_h64: mova m0, [base+z_filter_s1] lea r3d, [hq-1] vbroadcasti32x4 m4, [base+z_filter_s4] vpbroadcastb m5, r3d vbroadcasti32x4 m1, [base+z_filter_s2] vbroadcasti32x4 m3, [base+z_filter_s3] vpermi2b m0, m8, m2 ; al bl pminub m5, [base+z_filter_s5] pshufb m1, m8, m1 ; ah bh vpbroadcastd m11, [base+z_filter_k+4*2+12*0] pshufb m3, m8, m3 ; cl ch vpbroadcastd m12, [base+z_filter_k+4*2+12*1] pshufb m4, m8, m4 ; el dl vpbroadcastd m13, [base+z_filter_k+4*2+12*2] vpermb m5, m5, m8 ; eh dh pmaddubsw m0, m11 pmaddubsw m1, m11 pmaddubsw m6, m3, m12 vpbroadcastd m12, r8m ; max_height pmaddubsw m3, m13 pmaddubsw m4, m11 pmaddubsw m5, m11 packssdw m12, m12 paddw m0, m6 paddw m1, m3 paddw m0, m4 paddw m1, m5 packsswb m12, m12 pmulhrsw m0, m15 pmulhrsw m1, m15 vpcmpgtb k1, m12, m9 ; y < max_height packuswb m8{k1}, m0, m1 ret .w64: movu m7, [tlq] test angled, 0x400 jnz .w64_main vpbroadcastd m2, [tlq-4] mova m0, [base+z_filter_s1] vbroadcasti32x4 m1, [base+z_filter_s2] vbroadcasti32x4 m3, [base+z_filter_s3] vbroadcasti32x4 m4, [base+z_filter_s4] vpermi2b m0, m7, m2 ; al bl vpbroadcastd m5, [base+pb_63] pminub m5, [base+z_filter_s5] pshufb m1, m7, m1 ; ah bh vpbroadcastd m11, [base+z_filter_k+4*2+12*0] pshufb m3, m7, m3 ; cl ch vpbroadcastd m12, [base+z_filter_k+4*2+12*1] pshufb m4, m7, m4 ; el dl vpbroadcastd m13, [base+z_filter_k+4*2+12*2] vpermb m5, m5, m7 ; eh dh pmaddubsw m0, m11 pmaddubsw m1, m11 pmaddubsw m6, m3, m12 vpbroadcastd m12, r6m pmaddubsw m3, m13 pmaddubsw m4, m11 pmaddubsw m5, m11 mova m9, [pb_0to63] packssdw m12, m12 paddw m0, m6 paddw m1, m3 paddw m0, m4 paddw m1, m5 packsswb m12, m12 pmulhrsw m0, m15 pmulhrsw m1, m15 vpcmpgtb k1, m12, m9 ; x < max_width packuswb m7{k1}, m0, m1 call .filter_left_h64 ; always filter the full 64 pixels for simplicity .w64_main: vpbroadcastw m5, dyd vpbroadcastd m9, [tlq-4] rorx r2q, dxq, 62 ; dx << 2 pmullw m6, m5, 
.filter_left_h64:
    mova                 m0, [base+z_filter_s1]
    lea                  r3d, [hq-1]
    vbroadcasti32x4      m4, [base+z_filter_s4]
    vpbroadcastb         m5, r3d
    vbroadcasti32x4      m1, [base+z_filter_s2]
    vbroadcasti32x4      m3, [base+z_filter_s3]
    vpermi2b             m0, m8, m2 ; al bl
    pminub               m5, [base+z_filter_s5]
    pshufb               m1, m8, m1 ; ah bh
    vpbroadcastd         m11, [base+z_filter_k+4*2+12*0]
    pshufb               m3, m8, m3 ; cl ch
    vpbroadcastd         m12, [base+z_filter_k+4*2+12*1]
    pshufb               m4, m8, m4 ; el dl
    vpbroadcastd         m13, [base+z_filter_k+4*2+12*2]
    vpermb               m5, m5, m8 ; eh dh
    pmaddubsw            m0, m11
    pmaddubsw            m1, m11
    pmaddubsw            m6, m3, m12
    vpbroadcastd         m12, r8m ; max_height
    pmaddubsw            m3, m13
    pmaddubsw            m4, m11
    pmaddubsw            m5, m11
    packssdw             m12, m12
    paddw                m0, m6
    paddw                m1, m3
    paddw                m0, m4
    paddw                m1, m5
    packsswb             m12, m12
    pmulhrsw             m0, m15
    pmulhrsw             m1, m15
    vpcmpgtb             k1, m12, m9 ; y < max_height
    packuswb             m8{k1}, m0, m1
    ret
.w64:
    movu                 m7, [tlq]
    test                 angled, 0x400
    jnz                  .w64_main
    vpbroadcastd         m2, [tlq-4]
    mova                 m0, [base+z_filter_s1]
    vbroadcasti32x4      m1, [base+z_filter_s2]
    vbroadcasti32x4      m3, [base+z_filter_s3]
    vbroadcasti32x4      m4, [base+z_filter_s4]
    vpermi2b             m0, m7, m2 ; al bl
    vpbroadcastd         m5, [base+pb_63]
    pminub               m5, [base+z_filter_s5]
    pshufb               m1, m7, m1 ; ah bh
    vpbroadcastd         m11, [base+z_filter_k+4*2+12*0]
    pshufb               m3, m7, m3 ; cl ch
    vpbroadcastd         m12, [base+z_filter_k+4*2+12*1]
    pshufb               m4, m7, m4 ; el dl
    vpbroadcastd         m13, [base+z_filter_k+4*2+12*2]
    vpermb               m5, m5, m7 ; eh dh
    pmaddubsw            m0, m11
    pmaddubsw            m1, m11
    pmaddubsw            m6, m3, m12
    vpbroadcastd         m12, r6m ; max_width
    pmaddubsw            m3, m13
    pmaddubsw            m4, m11
    pmaddubsw            m5, m11
    mova                 m9, [pb_0to63]
    packssdw             m12, m12
    paddw                m0, m6
    paddw                m1, m3
    paddw                m0, m4
    paddw                m1, m5
    packsswb             m12, m12
    pmulhrsw             m0, m15
    pmulhrsw             m1, m15
    vpcmpgtb             k1, m12, m9 ; x < max_width
    packuswb             m7{k1}, m0, m1
    call                 .filter_left_h64 ; always filter the full 64 pixels for simplicity
.w64_main:
    vpbroadcastw         m5, dyd
    vpbroadcastd         m9, [tlq-4]
    rorx                 r2q, dxq, 62 ; dx << 2
    pmullw               m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as
    pmullw               m5, [base+z_ypos_mul1b]     ; such pixels aren't selected from the left edge
    vpbroadcastw         m1, r2d ; xpos
    mova                 m3, [base+z_xpos_off2a]
    mova                 m4, [base+z_xpos_off2b]
    mova                 m12, m1
    vpbroadcastd         m2, [base+pb_1]
    psrlw                m10, m6, 1
    psrlw                m11, m5, 1
    vpermw               m10, m10, m14 ; 64-frac, frac
    psraw                m6, 6
    vpermw               m11, m11, m14
    psraw                m5, 6
    mov                  r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft
    packsswb             m6, m5
    mov                  r3d, 1<<6
    paddsb               m0, m6, m2
    sub                  r5d, dxd ; left-only threshold
    punpcklbw            m5, m6, m0 ; base, base+1
    punpckhbw            m6, m0
.w64_loop:
    pshufb               m17, m1, m2
    psrlw                m0, m1, 3
    paddb                m16, m3, m17
    vpermw               m0, m0, m14
    paddb                m17, m4
    vpmovw2m             k1, m16 ; base_x < 0
    vpermi2b             m16, m7, m9
    vpmovw2m             k2, m17
    vpermi2b             m17, m7, m9
    pmaddubsw            m16, m0
    pmaddubsw            m17, m0
    add                  r3d, dxd
    jge                  .w64_toponly
    mova                 m0, m8
    vpermt2b             m0, m5, m9
    pmaddubsw            m16{k1}, m0, m10
    mova                 m0, m8
    vpermt2b             m0, m6, m9
    pmaddubsw            m17{k2}, m0, m11
.w64_toponly:
    pmulhrsw             m16, m15
    pmulhrsw             m17, m15
    packuswb             m16, m17
    mova                 [dstq], m16
    dec                  hd
    jz                   .w64_end
    paddw                m1, m12
    add                  dstq, strideq
    paddb                m5, m2
    paddb                m6, m2
    cmp                  r3d, r5d
    jge                  .w64_loop
.w64_leftonly_loop:
    vpermb               m16, m5, m8
    vpermb               m17, m6, m8
    pmaddubsw            m16, m10
    pmaddubsw            m17, m11
    paddb                m5, m2
    paddb                m6, m2
    pmulhrsw             m16, m15
    pmulhrsw             m17, m15
    packuswb             m16, m17
    mova                 [dstq], m16
    add                  dstq, strideq
    dec                  hd
    jg                   .w64_leftonly_loop
.w64_end:
    RET

cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea                  r7, [z_filter_t0]
    tzcnt                wd, wm
    movifnidn            angled, anglem
    lea                  t0, [dr_intra_derivative+45*2-1]
    movsxd               wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
    sub                  angled, 180
    mov                  dyd, angled
    neg                  dyd
    xor                  angled, 0x400
    or                   dyq, ~0x7e
    mova                 m0, [base+pb_63to0]
    movzx                dyd, word [t0+dyq]
    lea                  wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
    movifnidn            hd, hm
    mova                 m14, [base+z_frac_table]
    shl                  dyd, 6
    vpbroadcastd         m15, [base+pw_512]
    jmp                  wq
.w4:
    cmp                  angleb, 40
    jae                  .w4_no_upsample
    lea                  r3d, [angleq-1024]
    sar                  r3d, 7
    add                  r3d, hd
    jg                   .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    lea                  r3d, [hq+4]
    call                 .upsample
    movshdup             m1, [base+z_ypos_off1]
    vpbroadcastd         m6, [base+pb_16]
    jmp                  .w4_main2
.w4_no_upsample:
    lea                  r3d, [hq+3]
    vpbroadcastb         m9, r3d
    vpxord               m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
    pmaxub               m1, m0
    vpermb               m7, m1, [tlq-64*1]
    test                 angled, 0x400 ; !enable_intra_edge_filter
    jnz                  .w4_main
    vpbroadcastb         xm1, angled
    shr                  angled, 8
    vpcmpeqb             k1, xm9, [base+z_filter_wh]
    vpbroadcastd         m2, [tlq-3]
    vpcmpgtb             k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw                r5d, k1
    test                 r5d, r5d
    jz                   .w4_main
    pminub               m9, [pb_0to63]
    call                 mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
    vpermb               m7, m9, m0
.w4_main:
    movsldup             m1, [base+z_ypos_off1]
    vpbroadcastd         m6, [base+pb_8]
.w4_main2:
    vpbroadcastw         m0, dyd
    vpbroadcastq         m2, [base+z_ypos_mul2a] ; 1..4
    pmulhuw              m2, m0 ; ypos >> 1
    lea                  r2, [strideq*3]
    vpermw               m3, m2, m14 ; 64-frac, frac
    psrlw                m2, 5
    packsswb             m2, m2
    punpcklbw            m2, m2
    paddsb               m2, m1 ; base, base+1
.w4_loop:
    vpermb               m0, m2, m7
    pmaddubsw            m0, m3
    paddsb               m2, m6
    pmulhrsw             m0, m15
    vpmovwb              ym0, m0
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    sub                  hd, 8
    jl                   .w4_end
    vextracti32x4        xm0, ym0, 1
    lea                  dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    lea                  dstq, [dstq+strideq*4]
    jg                   .w4_loop
.w4_end:
    RET
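; 2x upsampling of the left edge: interpolates midpoints with the
; (-4, 36, 36, -4) >> 6 intra edge upsample filter (pb_m4_36 tap pairs),
; interleaves them with the original pixels, and doubles dy to match the
; doubled sample density. r3d = h + imin(w, h) on entry.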
.upsample:
    xor                  r3d, 31 ; 31 - (h + imin(w, h))
    vbroadcasti32x4      ym0, [base+z_xpos_off2a]
    vpbroadcastb         ym7, r3d
    pmaxub               ym7, [base+z3_upsample]
    vbroadcasti32x4      ym1, [base+z_filter_s4]
    vpermb               ym7, ym7, [tlq-31]
    vpbroadcastd         ym2, [base+pb_m4_36]
    pshufb               ym0, ym7, ym0
    psrldq               ym7, 1
    pshufb               ym1, ym7, ym1
    pmaddubsw            ym0, ym2
    pmaddubsw            ym1, ym2
    add                  dyd, dyd
    paddw                ym0, ym1
    pmulhrsw             ym0, ym15
    packuswb             ym0, ym0
    punpcklbw            ym7, ym0
    ret
.w8:
    lea                  r3d, [angleq+216]
    mov                  r3b, hb
    cmp                  r3d, 8
    ja                   .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea                  r3d, [hq*2]
    call                 .upsample
    pshufd               m1, [base+z_ypos_off1], q0000
    vpbroadcastd         m6, [base+pb_8]
    jmp                  .w8_main2
.w8_no_upsample:
    mov                  r3d, 8
    cmp                  hd, 4
    cmove                r3d, hd
    lea                  r3d, [r3+hq-1]
    xor                  r3d, 63 ; 63 - (h + imin(w, h))
    vpbroadcastb         m1, r3d
    pmaxub               m1, m0
    vpermb               m7, m1, [tlq-64*1]
    test                 angled, 0x400 ; !enable_intra_edge_filter
    jnz                  .w8_main
    lea                  r3d, [hq+7]
    call                 .filter_strength
    test                 r5d, r5d
    jz                   .w8_main
    call                 mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    vpermb               m7, m10, m0
.w8_main:
    movsldup             m1, [base+z_ypos_off2]
    vpbroadcastd         m6, [base+pb_4]
.w8_main2:
    vpbroadcastw         m0, dyd
    vbroadcasti32x4      m2, [base+z_ypos_mul2a] ; 1..8
    pmulhuw              m2, m0 ; ypos >> 1
    lea                  r2, [strideq*3]
    vpermw               m3, m2, m14 ; 64-frac, frac
    psrlw                m2, 5
    packsswb             m2, m2
    punpcklbw            m2, m2
    paddsb               m2, m1 ; base, base+1
.w8_loop:
    vpermb               m0, m2, m7
    pmaddubsw            m0, m3
    paddsb               m2, m6
    pmulhrsw             m0, m15
    vpmovwb              ym0, m0
    vextracti32x4        xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    lea                  dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg                   .w8_loop
    RET
.filter_strength:
    vpbroadcastd         m2, [tlq-3]
.filter_strength2:
    vpbroadcastb         m9, r3d
    vpbroadcastb         ym1, angled
    shr                  angled, 8
    vpcmpeqb             k1, ym9, [base+z_filter_wh]
    mova                 xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb             k1{k1}, ym1, ym0
    pminub               m10, m9, [pb_0to63]
    kmovd                r5d, k1
    ret
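; Left-edge loader shared by w16/w32: r3d = block width on entry. m7
; receives the edge in reversed order with out-of-range positions
; clamped so that reads beyond h + imin(w, h) replicate the bottommost
; pixel; m8 is either filled with that padding pixel or, for h == 64,
; holds the second half of the edge from [tlq-64*2].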
.w16_load:
    cmp                  r3d, hd
    cmovae               r3d, hd
    add                  r3d, hd
    mova                 m7, [tlq-64*1]
    neg                  r3d ; -(h + imin(w, h))
    and                  r3d, 63
    vpbroadcastb         m1, r3d
    pmaxub               m2, m0, m1
    cmp                  hd, 64
    je                   .w16_load_h64
    vpermb               m8, m1, m7
    vpermb               m7, m2, m7
    ret
.w16_load_h64:
    vpermb               m7, m0, m7
    vpermb               m8, m2, [tlq-64*2]
    ret
.w16:
    mov                  r3d, 16
    call                 .w16_load
    test                 angled, 0x400 ; !enable_intra_edge_filter
    jnz                  .w16_main
    vpbroadcastd         m2, [tlq-3]
    cmp                  hd, 64
    je                   .w16_filter64
    lea                  r3d, [hq+15]
    call                 .filter_strength2
    test                 r5d, r5d
    jz                   .w16_main
    call                 mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    pminub               m10, m9, [pb_0to63]
    vpermb               m8, m9, m0
    vpermb               m7, m10, m0
    jmp                  .w16_main
.w16_filter64:
    vpbroadcastd         m13, [base+pb_15]
    valignq              m0, m8, m7, 7
    pminub               m12, m13, [pb_0to63]
    valignq              m11, m8, m7, 1
    call                 mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w16_main:
    vbroadcasti32x4      m3, [base+z_ypos_mul2a] ; 1.. 8
    vbroadcasti32x4      m2, [base+z_ypos_mul2b] ; 9..15
    vpbroadcastw         m0, dyd
    vpbroadcastd         m6, [base+pb_4]
    pmulhuw              m3, m0 ; ypos >> 1
    pmulhuw              m2, m0
    movshdup             m0, [base+z_ypos_off2]
    lea                  r2, [strideq*3]
    vpbroadcastd         m1, [base+pb_1]
    vpermw               m4, m3, m14 ; 64-frac, frac
    psrlw                m3, 5
    vpermw               m5, m2, m14
    psrlw                m2, 5
    packsswb             m3, m2
    paddsb               m3, m0
    paddsb               m1, m3
    punpcklbw            m2, m3, m1 ; base, base+1
    punpckhbw            m3, m1
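; Z3_PERM2 produces one iteration's worth of output: m2/m3 hold
; interleaved base/base+1 byte indices into the combined m7:m8 edge
; table, m4/m5 the matching (64-frac, frac) weights, and m6 the index
; increment applied for the next iteration.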
.w16_loop:
%macro Z3_PERM2 0
    mova                 m0, m7
    vpermt2b             m0, m2, m8
    mova                 m1, m7
    vpermt2b             m1, m3, m8
    pmaddubsw            m0, m4
    pmaddubsw            m1, m5
    paddsb               m2, m6
    paddsb               m3, m6
    pmulhrsw             m0, m15
    pmulhrsw             m1, m15
    packuswb             m0, m1
%endmacro
    Z3_PERM2
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    lea                  dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg                   .w16_loop
    RET
.w32:
    mov                  r3d, 32
    call                 .w16_load
    test                 angled, 0x400 ; !enable_intra_edge_filter
    jnz                  .w32_main
    vpbroadcastd         m2, [tlq-3]
    cmp                  hd, 64
    je                   .w32_filter64
    lea                  r3d, [hq+31]
    vpbroadcastb         m9, r3d
    call                 mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
    vpermb               m8, m9, m7
    jmp                  .w32_main
.w32_filter64:
    vpbroadcastd         m13, [base+pb_31]
    valignq              m0, m8, m7, 7
    pminub               m12, m13, [pb_0to63]
    valignq              m11, m8, m7, 1
    call                 mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w32_main:
    vbroadcasti32x8      m3, [base+z_ypos_mul2a] ; 1.. 8
    vbroadcasti32x8      m2, [base+z_ypos_mul2b] ; 9..15
    vpbroadcastw         m0, dyd
    vpbroadcastd         m1, [base+pb_1]
    pmulhuw              m3, m0 ; ypos >> 1
    pmulhuw              m2, m0
    vpbroadcastd         m6, [base+pb_2]
    mova                 ym0, ym1
    vpermw               m4, m3, m14 ; 64-frac, frac
    psrlw                m3, 5
    vpermw               m5, m2, m14
    psrlw                m2, 5
    packsswb             m3, m2
    paddsb               m3, m0
    paddsb               m1, m3
    punpcklbw            m2, m3, m1 ; base, base+1
    punpckhbw            m3, m1
.w32_loop:
    Z3_PERM2
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova          [dstq+strideq*1], ym0
    lea                  dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg                   .w32_loop
    RET
.w64:
    mova                 m7, [tlq-64*1]
    cmp                  hd, 64
    je                   .w64_h64
    lea                  r3d, [hq*2-1]
    xor                  r3d, 63 ; -(h + imin(w, h)) & 63
    vpbroadcastb         m1, r3d
    pmaxub               m0, m1
    vpermb               m8, m1, m7
    jmp                  .w64_filter
.w64_h64:
    vpermb               m8, m0, [tlq-64*2]
.w64_filter:
    vpermb               m7, m0, m7
    test                 angled, 0x400 ; !enable_intra_edge_filter
    jnz                  .w64_main
    lea                  r3d, [hq-1]
    vpbroadcastd         m2, [tlq-3]
    vpbroadcastb         m13, r3d
    valignq              m0, m8, m7, 7
    pminub               m12, m13, [pb_0to63]
    valignq              m11, m8, m7, 1
    call                 mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w64_main:
    vpbroadcastw         m2, dyd
    pmulhuw              m3, m2, [base+z_ypos_mul2a]
    pmulhuw              m2, [base+z_ypos_mul2b]
    vpbroadcastd         m6, [base+pb_1]
    vpermw               m4, m3, m14 ; 64-frac, frac
    psrlw                m3, 5
    vpermw               m5, m2, m14
    psrlw                m2, 5
    packsswb             m3, m2
    paddsb               m1, m3, m6
    punpcklbw            m2, m3, m1 ; base, base+1
    punpckhbw            m3, m1
.w64_loop:
    Z3_PERM2
    mova                 [dstq], m0
    add                  dstq, strideq
    dec                  hd
    jg                   .w64_loop
    RET

; The ipred_filter code processes 4x2 blocks in the following order
; which increases parallelism compared to doing things row by row.
; Some redundant blocks are calculated for w > 4.
;    w4      w8       w16          w32
;     1      1 2      1 2 3 4      1 2 3 4 9 a b c
;     2      2 3      2 3 4 5      2 3 4 5 a b c d
;     3      3 4      3 4 5 6      3 4 5 6 b c d e
;     4      4 5      4 5 6 7      4 5 6 7 c d e f
;     5      5 6      5 6 7 8      5 6 7 8 d e f g
;     6      6 7      6 7 8 9      6 7 8 9 e f g h
;     7      7 8      7 8 9 a      7 8 9 a f g h i
; ___ 8 ___  8 9 ___  8 9 a b ___  8 9 a b g h i j ___
;            9        9 a b        h i j
;                     a b          i j
;                     b            j

cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt
%define base r6-filter_taps
    lea                  r6, [filter_taps]
%ifidn fltd, fltm
    movzx                fltd, fltb
%else
    movzx                fltd, byte fltm
%endif
    vpbroadcastd         xmm2, [tlq+1] ; t0 t0 t0 t0
    movifnidn            hd, hm
    shl                  fltd, 6
    vpbroadcastd         m6, [base+pd_8]
    vpbroadcastd         xmm3, [tlq-2] ; l1 l0 tl __
    vbroadcasti32x4      m7, [r6+fltq+16*0] ; p1 p2 p3 p4
    vbroadcasti32x4      m8, [r6+fltq+16*1]
    vbroadcasti32x4      m9, [r6+fltq+16*2] ; p6 p5 p0 __
    vbroadcasti32x4      m10, [r6+fltq+16*3]
    mova                 xmm0, xm6
    vpdpbusd             xmm0, xmm2, xm7
    mova                 xmm1, xm6
    vpdpbusd             xmm1, xmm2, xm8
    vpdpbusd             xmm0, xmm3, xm9
    vpdpbusd             xmm1, xmm3, xm10
    packssdw             xmm0, xmm1
    cmp                  wd, 8
    jb                   .w4
    vpbroadcastd         ym2, [tlq+5]
    mova                 m11, [base+filter_perm]
    mov                  r5, 0xffffffffffff000f
    psrldq               xmm2, 1 ; __ t0
    kmovq                k1, r5 ; 0x000f
    psraw                xm5, xmm0, 4
    packuswb             xmm2, xm5 ; __ t0 a0 b0
    pshufd               ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1
    je                   .w8
    kxnorb               k3, k3, k3 ; 0x00ff
    vpbroadcastd         xm3, [tlq-4]
    kandnq               k2, k3, k1 ; 0xffffffffffff0000
    vpermb               ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __
    mova                 ym0, ym6
    vpdpbusd             ym0, ym2, ym7
    mova                 ym1, ym6
    vpdpbusd             ym1, ym2, ym8
    pshufb               ym5{k2}, ym2, ym11 ; a0 b0 __ t0
    vpbroadcastd         m2, [tlq+9]
    vpdpbusd             ym0, ym3, ym9
    vpdpbusd             ym1, ym3, ym10
    vpbroadcastd         xm3, [tlq-6] ; l5 l4 l3 __
    kunpckbw             k4, k1, k3 ; 0x0fff
    packssdw             ym0, ym1
    psraw                ym0, 4 ; c0 d0 a1 b1
    packuswb             ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1
    pshufd               m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2
    vpermb               m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __
    mova                 m4, m6
    vpdpbusd             m4, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    psrldq               m0, m2, 1 ; __ d0 __ b1 __ t2
    vpbroadcastd         m2, [tlq+13]
    vpdpbusd             m4, m3, m9
    vpdpbusd             m1, m3, m10
    mova                 m12, [base+filter_end]
    lea                  r5d, [hq-6]
    mov                  r6, dstq
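; PF is still set by the 'cmp wd, 8' above (16-8 = 8 has odd parity,
; 32-8 = 24 has even parity) and none of the intervening instructions
; write EFLAGS, so cmovp selects the reduced height only for w == 32.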
    cmovp                hd, r5d ; w == 16 ? h : h - 6
    packssdw             m4, m1
    psraw                m4, 4 ; e0 f0 c1 d1 a2 b2
    packuswb             m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2
    pshufd               m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3
.w16_loop:
    vpbroadcastd         xm3, [tlq-8]
    vpermb               m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __
    mova                 m1, m6
    vpdpbusd             m1, m2, m7
    mova                 m0, m6
    vpdpbusd             m0, m2, m8
    sub                  tlq, 2
    vpdpbusd             m1, m3, m9
    vpdpbusd             m0, m3, m10
    packssdw             m1, m0
    mova                 m0, m4
    psraw                m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3
    packuswb             m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3
    pshufd               m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
    vpermt2d             m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3
    vextracti32x4 [dstq+strideq*0], m5, 2
    vextracti32x4 [dstq+strideq*1], m5, 3
    lea                  dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg                   .w16_loop
    cmp                  wd, 16
    je                   .ret
    mova                 xm13, [filter_perm+16]
    mova                 xmm3, [r6+strideq*0]
    punpckhdq            xmm3, [r6+strideq*1]
    vpbroadcastd         m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
    pinsrb               xm3, xmm3, [tlq+r5+16], 7
    pshufb               xm3, xm13
    vpermb               m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __
    mova                 m0, m6
    vpdpbusd             m0, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    kunpckbw             k5, k3, k1 ; 0xff0f
    lea                  r3, [strideq*3]
    vpdpbusd             m0, m3, m9
    vpdpbusd             m1, m3, m10
    packssdw             m0, m1
    psraw                m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3
    packuswb             m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
    vpblendmb            m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
    vpbroadcastd         ym2, [tlq+r5+21]
    pshufd               m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3
    vpermt2d             m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3
    vextracti32x4 [dstq+strideq*0], m5, 2
    vextracti32x4 [dstq+strideq*1], m5, 3
    punpckhqdq           xmm3, [r6+r3]
    pinsrb               xmm3, [r6+strideq*2+15], 11
    pshufb               xm3, xmm3, xm13
    vpermb               m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __
    mova                 m4, m6
    vpdpbusd             m4, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    kxnord               k3, k3, k4 ; 0xfffff0ff
    lea                  r4, [strideq*5]
    vpdpbusd             m4, m3, m9
    vpdpbusd             m1, m3, m10
    packssdw             m4, m1
    psraw                m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3
    packuswb             m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3
    vpblendmw            m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3
    vpbroadcastd         m2, [tlq+r5+25]
    pshufd               m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3
    vpermt2d             m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3
    vextracti32x4 [dstq+strideq*2], m5, 2
    vextracti32x4 [dstq+r3       ], m5, 3
    punpckhqdq           xmm3, [r6+r4]
    pinsrb               xmm3, [r6+strideq*4+15], 11
    pshufb               xm3, xmm3, xm13
    vpermb               m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb gb fb __
    mova                 m0, m6
    vpdpbusd             m0, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    kunpckwd             k1, k1, k2 ; 0x000f0000
    vpdpbusd             m0, m3, m9
    vpdpbusd             m1, m3, m10
    packssdw             m0, m1
    psraw                m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3
    packuswb             m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3
    vpblendmw            m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3
    vpbroadcastd         m2, [tlq+r5+29]
    pshufd               m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7
    vpermt2d             m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3
    vextracti32x4 [dstq+strideq*4], m5, 2
    vextracti32x4 [dstq+r4       ], m5, 3
    lea                  r0, [strideq+r3*2]
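; In the w32 steady-state loop the leftmost input pixels for the right
; half of the block are reloaded from rows already written to the
; destination (through r6, which trails dstq), i.e. the second 16
; columns recurse on filtered output rather than on the original edge.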
.w32_loop:
    punpckhqdq           xmm3, [r6+r0]
    pinsrb               xmm3, [r6+r3*2+15], 11
    pshufb               xm3, xmm3, xm13
    vpermb               m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __
.w32_loop_tail:
    mova                 m4, m6
    vpdpbusd             m4, m2, m7
    mova                 m1, m6
    vpdpbusd             m1, m2, m8
    vpdpbusd             m4, m3, m9
    vpdpbusd             m1, m3, m10
    packssdw             m4, m1
    mova                 m1, m0
    psraw                m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7
    packuswb             m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7
    pshufd               m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7
    vpermt2d             m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7
    vextracti32x4 [r6+strideq*0+16], m5, 2
    vextracti32x4 [r6+strideq*1+16], m5, 3
    lea                  r6, [r6+strideq*2]
    sub                  r5d, 2
    jg                   .w32_loop
    vpermb               m3, m11, m1
    cmp                  r5d, -6
    jg                   .w32_loop_tail
.ret:
    RET
.w8:
    vpermb               ym3, ym11, ymm2
.w8_loop:
    vpbroadcastd         ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __
    mova                 ym0, ym6
    vpdpbusd             ym0, ym2, ym7
    mova                 ym1, ym6
    vpdpbusd             ym1, ym2, ym8
    sub                  tlq, 2
    vpdpbusd             ym0, ym3, ym9
    vpdpbusd             ym1, ym3, ym10
    mova                 ym3, ym5
    packssdw             ym0, ym1
    psraw                ym5, ym0, 4 ; c0 d0 a1 b1
    packuswb             ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1
    pshufd               ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1
    vpermb               ym3, ym11, ym3 ; a0 a1 b0 b1
    movq   [dstq+strideq*0], xm3
    movhps [dstq+strideq*1], xm3
    lea                  dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg                   .w8_loop
    RET
.w4_loop:
    vpbroadcastd         xmm3, [tlq-4] ; l3 l2 l1 __
    mova                 xmm0, xm6
    vpdpbusd             xmm0, xmm2, xm7
    mova                 xmm1, xm6
    vpdpbusd             xmm1, xmm2, xm8
    sub                  tlq, 2
    vpdpbusd             xmm0, xmm3, xm9
    vpdpbusd             xmm1, xmm3, xm10
    packssdw             xmm0, xmm1
.w4:
    psraw                xmm0, 4 ; a0 b0
    packuswb             xmm0, xmm0
    movd   [dstq+strideq*0], xmm0
    pshufd               xmm2, xmm0, q1111 ; b0 b0 b0 b0
    movd   [dstq+strideq*1], xmm2
    lea                  dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg                   .w4_loop
    RET

%endif ; ARCH_X86_64