summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/x86/ipred16_avx2.asm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
commit43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/dav1d/src/x86/ipred16_avx2.asm
parentInitial commit. (diff)
downloadfirefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz
firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/ipred16_avx2.asm')
-rw-r--r--third_party/dav1d/src/x86/ipred16_avx2.asm4996
1 files changed, 4996 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm
new file mode 100644
index 0000000000..72300c2a4c
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx2.asm
@@ -0,0 +1,4996 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHTS 1-* ; takes the list of sm_weights[] values and emits the two tables below from it
+const smooth_weights_1d_16bpc ; sm_weights[] << 7
+ %rep %0
+ dw %1*128 ; one word per weight, pre-scaled by 128 for use with pmulhrsw
+ %rotate 1 ; step to the next weight in the argument list
+ %endrep
+const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
+ %rep %0
+ dw %1, 256-%1 ; interleaved word pair (weight, 256 - weight) for use with pmaddwd
+ %rotate 1
+ %endrep
+%endmacro
+
+SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4 ; 128 values; the smooth predictors below read the weights for a dimension of n starting at entry n
+
+%if ARCH_X86_64
+
+ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 ; pshufb byte indices; the paeth/smooth code below loads this through movsldup/movshdup
+ db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15
+filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1
+filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
+filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ; even byte indices first, then the odd ones
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 ; i*64 for i = 0..15
+ dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 ; compared against in ipred_z1's .filter_strength helper
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 ; matched byte-wise against the broadcast maxbase value in .filter_strength
+ db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+pw_m1024: times 2 dw -1024
+pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ; ascending word ramp
+pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ; descending word ramp
+z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4
+z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8 ; the 16, 16 pair at byte offset 20 doubles as pw_16
+pb_90: times 4 db 90
+z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4
+z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 ; pshufb mask: even words to the low half, odd words to the high half
+z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11
+z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9
+z_filter_k: dw 4, 4, 5, 5, 4, 4 ; the leading 4, 4 pair doubles as pw_4
+ dw 8, 8, 6, 6, 4, 4
+ dw 0, 0, 0, 0, 2, 2 ; the trailing 2, 2 pair (byte offset 32) doubles as pw_2
+
+%define pw_2 (z_filter_k+32)
+%define pw_4 (z_filter_k+ 0)
+%define pw_16 (z2_ymul8 +20)
+
+pw_1: times 2 dw 1
+pw_3: times 2 dw 3
+pw_62: times 2 dw 62 ; ipred_z1 uses this as the mask that extracts frac from xpos
+pw_512: times 2 dw 512 ; pw_512 and pw_2048 are adjacent: ipred_dc_128 indexes them as a 2-entry dword table
+pw_2048: times 2 dw 2048
+pd_8: dd 8
+
+%macro JMP_TABLE 3-* ; function name, cpu suffix, then one local label name per table entry
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4) ; 32-bit offset of the label from the exported table symbol; that symbol sits two entries below the first one, so index 2 selects the first label
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
+%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)
+
+JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 ; the -10*4 makes these offsets relative to ipred_dc_splat_16bpc_avx2_table, which skips the ten h/w entries
+JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32
+JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4 ; same rebasing, here onto ipred_cfl_splat_16bpc_avx2_table (eight h/w entries skipped)
+JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
+JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64
+
+cextern dr_intra_derivative ; external word table; ipred_z1 reads dx from it using the angle as index
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h ; DC prediction from the top edge only: fills the block with the rounded mean of the w top pixels
+ movifnidn hd, hm
+ add tlq, 2 ; skip the top-left pixel (2 bytes) so tl points at the first top pixel
+ movd xm4, wd
+ pxor xm3, xm3
+ pavgw xm4, xm3 ; xm4 = (w + 1) >> 1, the rounding term added before the final shift
+ tzcnt wd, wd ; jump-table index; equals log2(w) for a power-of-two w
+ movd xm5, wd ; xm5 = shift count that performs the divide by w
+ movu m0, [tlq] ; first 32 bytes (16 pixels) of the top row
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+wq*4] ; summation entry for w pixels (.h4-.h64 of ipred_dc_left_16bpc)
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ movsxd wq, [r5+wq*4] ; store loop for this width (.s4-.s64 of ipred_dc_16bpc)
+ add wq, r5
+ jmp r6 ; continue in the shared summation code, which ends with jmp wq
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 ; DC prediction from the left edge only: fills the block with the rounded mean of the h left pixels
+ mov hd, hm
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq ; tl has now moved back 2*h bytes, to the start of the h left pixels
+ pxor xm3, xm3
+ pavgw xm4, xm3 ; xm4 = (h + 1) >> 1, rounding term
+ tzcnt r6d, hd
+ movd xm5, r6d ; shift count that performs the divide by h
+ movu m0, [tlq] ; first 32 bytes (16 pixels) of the edge
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+r6*4]
+ add r6, r5 ; r6 = .hN summation entry for this height
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5 ; wq = .sN store loop of ipred_dc_16bpc for this width
+ jmp r6
+.h64: ; each entry adds the extra input its size needs, then falls through to the smaller sizes
+ paddw m0, [tlq+96]
+ paddw m0, [tlq+64]
+.h32:
+ paddw m0, [tlq+32]
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1 ; fold the upper 128 bits onto the lower
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1 ; fold the upper four words onto the lower four
+.h4:
+ punpcklwd xm0, xm3 ; zero-extend the four partial sums to dwords (xm3 is zero)
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1 ; dword 0 = sum of all pixels
+ paddd xm0, xm4 ; + rounding term
+ psrld xm0, xm5 ; >> tzcnt(pixel count)
+ lea stride3q, [strideq*3]
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0 ; m0-m3 all hold the DC value; the widest store loop (.s64) reads all four
+ jmp wq
+
+cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 ; DC prediction: fills the block with the rounded mean of the h left and w top pixels
+ movifnidn hd, hm
+ tzcnt r6d, hd ; index of the .hN entry
+ lea r5d, [wq+hq]
+ movd xm4, r5d ; xm4 = w + h (halved below into the rounding term)
+ tzcnt r5d, r5d
+ movd xm5, r5d ; xm5 = tzcnt(w + h): shift for the power-of-two part of the divisor
+ lea r5, [ipred_dc_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4] ; +5*4: the .wN entries follow the five .hN entries in the table
+ pxor m3, m3
+ psrlw xm4, 1 ; rounding term (w + h) >> 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6 ; .hN loads/sums the left pixels, then jumps to .wN through wq
+.h4:
+ movq xm0, [tlq-8] ; the 4 left pixels (8 bytes just below tl)
+ jmp wq
+.w4:
+ movq xm1, [tlq+2] ; the 4 top pixels
+ paddw m0, m4 ; + rounding term
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrld m1, m0, 16
+ paddw m0, m1 ; word 0 of every qword = sum of that qword's four words
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3 ; h == 4: 8 pixels in total, divide by 8
+ jmp .w4_end
+.w4_mul: ; h > 4: w + h is not a power of two (e.g. 12 or 20), so finish with a reciprocal multiply
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ lea r2d, [hq*2]
+ mov r6d, 0xAAAB6667 ; two packed reciprocals: 0xAAAB ~ 2^17/3, 0x6667 ~ 2^17/5
+ shrx r6d, r6d, r2d ; shift by 2*h mod 32: h = 8 leaves 0xAAAB in the low word, h = 16 leaves 0x6667
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1 ; dword 0 = sum over all left and top pixels plus rounding
+ movd xm1, r6d
+ psrld xm0, 2
+ pmulhuw xm0, xm1
+ psrlw xm0, 1 ; with the multiply: (x * recip) >> 17, i.e. x / 3 or x / 5
+.w4_end:
+ vpbroadcastw xm0, xm0
+.s4: ; store loop: 4 rows of 4 pixels per iteration (also entered via the splat table)
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16] ; the 8 left pixels
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1
+ paddw xm0, [tlq+2] ; + the 8 top pixels
+ paddw xm0, xm4
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1
+ pblendw xm0, xm3, 0xAA ; clear the odd words so each dword holds one pair sum
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 8
+ je .w8_end ; square block: the shift alone is the exact divide
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmovz r6d, r2d ; h == 32 (w + h = 40) divides by 5, other heights by 3
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w8_end:
+ vpbroadcastw xm0, xm0
+.s8: ; store loop: 4 rows of 8 pixels per iteration
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32] ; the 16 left pixels
+ jmp wq
+.w16:
+ paddw m0, [tlq+2] ; + the 16 top pixels
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hb, 8|32
+ cmovz r6d, r2d ; h with neither bit 8 nor bit 32 set (e.g. 4 or 64) divides by 5, otherwise by 3
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16: ; store loop: 4 rows of 16 pixels per iteration
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-32] ; word-wise partial sums of the 32 left pixels
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2]
+ paddw m0, [tlq+34] ; + the 32 top pixels
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d ; shift by 2*h mod 32: h = 8 leaves 0x6667 (divide by 5), h = 16 or 64 leaves 0xAAAB (divide by 3)
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w32_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+.s32: ; store loop: 4 rows of 32 pixels per iteration, m0 = left half, m1 = right half
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq- 96]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 32]
+ paddw m0, m1 ; word-wise partial sums of the 64 left pixels
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ paddw m0, [tlq+34]
+ paddw m1, [tlq+66]
+ paddw m0, [tlq+98]
+ paddw m0, m1 ; + the 64 top pixels
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm1, xm4 ; rounding term is added after widening to dwords on this path
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, hd ; shift by h mod 32: h = 16 leaves 0x6667 (divide by 5), h = 32 leaves 0xAAAB (divide by 3)
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w64_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64: ; store loop: 2 rows of 64 pixels per iteration, one register per 16-pixel quarter
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 ; fills the block with a fixed value (512 or 2048) chosen from the r8m argument; no edge pixels are read
+ mov r6d, r8m ; argument slot that ipred_z1 in this file labels pixel_max
+ shr r6d, 11 ; NOTE(review): presumably 0 for a 10-bit maximum (1023) and 1 for a 12-bit maximum (4095) - confirm against the caller
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4] ; store loop for this width (.s4-.s64 of ipred_dc_16bpc)
+ vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4] ; index 0 reads pw_512, index 1 reads pw_2048, which is stored directly after it
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 ; vertical prediction: every row is a copy of the top row
+ movifnidn hd, hm
+ movu m0, [tlq+ 2] ; 128 bytes (64 pixels) starting at the first top pixel are loaded regardless of w
+ movu m1, [tlq+34]
+ movu m2, [tlq+66]
+ movu m3, [tlq+98]
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq ; the .sN store loop of ipred_dc_16bpc writes only as many of m0-m3 as the width needs
+
+%macro IPRED_H 2 ; w, store_type
+ vpbroadcastw m0, [tlq-2] ; left pixel for the first row; each following row reads the next lower address
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8 ; four left pixels consumed
+ mov%2 [dstq+strideq*0], m0 ; one register-wide store per row
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1 ; loop back to the label the macro was placed under
+ RET
+ALIGN function_align
+%endmacro
+
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 ; horizontal prediction: every row is filled with its left neighbour pixel
+ movifnidn hd, hm
+ lea r5, [ipred_h_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq ; dispatch on width
+INIT_XMM avx2
+.w4: ; 128-bit registers; the q store type writes 8 bytes (4 pixels) per row
+ IPRED_H 4, q
+.w8: ; 128-bit registers; the a store type writes 16 bytes (8 pixels) per row
+ IPRED_H 8, a
+INIT_YMM avx2
+.w16: ; 256-bit registers; 32 bytes (16 pixels) per row
+ IPRED_H 16, a
+.w32: ; 4 rows per iteration, two 32-byte stores per row
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+.w64: ; 2 rows per iteration, four 32-byte stores per row
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ sub tlq, 4
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*0+32*2], m0
+ mova [dstq+strideq*0+32*3], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m1
+ mova [dstq+strideq*1+32*3], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64
+ RET
+
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+ paddw m0, m%2, m1 ; base = left (m1) + signed_ldiff; callers pass top - topleft, so base = left + top - topleft
+ psubw m7, m3, m0 ; tldiff
+ psubw m0, m%1 ; tdiff
+ pabsw m7, m7
+ pabsw m0, m0
+ pminsw m7, m0 ; m7 = min(tldiff, tdiff)
+ pcmpeqw m0, m7 ; mask set where tdiff <= tldiff
+ pcmpgtw m7, m%3, m7 ; mask set where ldiff > min(tldiff, tdiff)
+ vpblendvb m0, m3, m%1, m0 ; pick top where tdiff <= tldiff, otherwise topleft (m3)
+ vpblendvb m0, m1, m0, m7 ; keep that pick only where ldiff is larger; everywhere else the result is left (m1)
+%endmacro
+
+cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h ; Paeth prediction: per pixel, whichever of left / top / topleft the PAETH macro selects
+%define base r5-ipred_paeth_16bpc_avx2_table
+ movifnidn hd, hm
+ lea r5, [ipred_paeth_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ vpbroadcastw m3, [tlq] ; topleft
+ add wq, r5
+ jmp wq
+.w4: ; 4 rows per iteration, one row per 64-bit quarter of the register
+ vpbroadcastq m2, [tlq+2] ; top
+ movsldup m6, [base+ipred_hv_shuf] ; even dwords of the table: pshufb mask that spreads one left pixel across each 64-bit quarter
+ lea r3, [strideq*3]
+ psubw m4, m2, m3 ; signed ldiff = top - topleft
+ pabsw m5, m4 ; ldiff
+.w4_loop:
+ sub tlq, 8
+ vpbroadcastq m1, [tlq] ; next four left pixels
+ pshufb m1, m6 ; left
+ PAETH 2, 4, 5
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8: ; 2 rows per iteration, one row per 128-bit lane
+ vbroadcasti128 m2, [tlq+2]
+ movsldup m6, [base+ipred_hv_shuf]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w8_loop:
+ sub tlq, 4
+ vpbroadcastd m1, [tlq] ; next two left pixels
+ pshufb m1, m6
+ PAETH 2, 4, 5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16: ; 1 row per iteration
+ movu m2, [tlq+2]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w16_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq] ; left pixel of this row
+ PAETH 2, 4, 5
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32: ; 1 row per iteration, two PAETH evaluations (16 pixels each)
+ movu m2, [tlq+2]
+ movu m6, [tlq+34]
+%if WIN64
+ movaps r4m, xmm8 ; m8/m9 are used below but lie outside the 8 vector registers declared in cglobal, so they are saved by hand
+ movaps r6m, xmm9
+%endif
+ psubw m4, m2, m3
+ psubw m8, m6, m3
+ pabsw m5, m4
+ pabsw m9, m8
+.w32_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq+32*0], m0
+ PAETH 6, 8, 9
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ movaps xmm8, r4m ; restore the hand-saved registers before returning
+ movaps xmm9, r6m
+%endif
+ RET
+ALIGN function_align
+.w64: ; 1 row per iteration, four PAETH evaluations; every 16-pixel quarter keeps its own top / signed ldiff / ldiff registers
+ WIN64_SPILL_XMM 16
+ movu m2, [tlq+ 2]
+ movu m6, [tlq+34]
+ movu m10, [tlq+66]
+ movu m13, [tlq+98]
+ psubw m4, m2, m3
+ psubw m8, m6, m3
+ psubw m11, m10, m3
+ psubw m14, m13, m3
+ pabsw m5, m4
+ pabsw m9, m8
+ pabsw m12, m11
+ pabsw m15, m14
+.w64_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq+32*0], m0
+ PAETH 6, 8, 9
+ mova [dstq+32*1], m0
+ PAETH 10, 11, 12
+ mova [dstq+32*2], m0
+ PAETH 13, 14, 15
+ mova [dstq+32*3], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights ; vertical smooth: pred = bottom + (((top - bottom) * weight[y] + 128) >> 8)
+%define base r6-ipred_smooth_v_16bpc_avx2_table
+ lea r6, [ipred_smooth_v_16bpc_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] ; end of the h-entry weight run; it is indexed below with the negated h
+ neg hq ; h now counts up from -h to 0
+ vpbroadcastw m5, [tlq+hq*2] ; bottom
+ add wq, r6
+ jmp wq
+.w4: ; 4 rows per iteration, one row per 64-bit quarter
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m3, [base+ipred_hv_shuf] ; pshufb mask that spreads one weight across each 64-bit quarter
+ lea r6, [strideq*3]
+ psubw m4, m5 ; top - bottom
+.w4_loop:
+ vpbroadcastq m0, [weightsq+hq*2] ; weights of the next four rows
+ pshufb m0, m3
+ pmulhrsw m0, m4 ; weights are stored << 7, so this is ((top - bottom) * w + 128) >> 8
+ paddw m0, m5
+ vextracti128 xm1, m0, 1
+ movhps [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movq [dstq+r6 ], xm0
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+.ret:
+ RET
+.w8: ; 4 rows per iteration, one row per 128-bit lane
+ vbroadcasti128 m4, [tlq+2]
+ movsldup m3, [base+ipred_hv_shuf]
+ lea r6, [strideq*3]
+ psubw m4, m5
+.w8_loop:
+ vpbroadcastd m0, [weightsq+hq*2+0] ; weights of rows 0-1
+ vpbroadcastd m1, [weightsq+hq*2+4] ; weights of rows 2-3
+ pshufb m0, m3
+ pshufb m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ vextracti128 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], xm0
+ vextracti128 [dstq+strideq*2], m1, 1
+ mova [dstq+r6 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16: ; 4 rows per iteration, one register per row
+ movu m4, [tlq+2]
+ lea r6, [strideq*3]
+ psubw m4, m5
+.w16_loop:
+ vpbroadcastw m0, [weightsq+hq*2+0]
+ vpbroadcastw m1, [weightsq+hq*2+2]
+ vpbroadcastw m2, [weightsq+hq*2+4]
+ vpbroadcastw m3, [weightsq+hq*2+6]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r6 ], m3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32: ; 2 rows per iteration, two registers per row
+ WIN64_SPILL_XMM 7
+ movu m4, [tlq+ 2]
+ movu m6, [tlq+34]
+ psubw m4, m5
+ psubw m6, m5
+.w32_loop:
+ vpbroadcastw m1, [weightsq+hq*2+0]
+ vpbroadcastw m3, [weightsq+hq*2+2]
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m6
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m6
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64: ; 1 row per iteration, four registers per row
+ WIN64_SPILL_XMM 8
+ movu m3, [tlq+ 2]
+ movu m4, [tlq+34]
+ movu m6, [tlq+66]
+ movu m7, [tlq+98]
+ REPX {psubw x, m5}, m3, m4, m6, m7
+.w64_loop:
+ vpbroadcastw m2, [weightsq+hq*2] ; weight of this row
+ pmulhrsw m0, m3, m2
+ pmulhrsw m1, m4, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*0], m0
+ pmulhrsw m0, m6, m2
+ mova [dstq+32*1], m1
+ pmulhrsw m1, m7, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 ; horizontal smooth: pred = right + (((left - right) * weight[x] + 128) >> 8)
+%define base r6-ipred_smooth_h_16bpc_avx2_table
+ lea r6, [ipred_smooth_h_16bpc_avx2_table]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m5, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd ; h is kept in bytes (2 per pixel) from here on
+ movsxd wq, [r6+wq*4]
+ sub tlq, hq ; tl = start of the left edge; [tlq+hq-2] is the left pixel of the current row
+ lea stride3q, [strideq*3]
+ add wq, r6
+ jmp wq
+.w4: ; 4 rows per iteration, one row per 64-bit quarter
+ vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] ; the four weights for w = 4
+ movsldup m3, [base+ipred_hv_shuf] ; pshufb mask that spreads one left pixel across each 64-bit quarter
+.w4_loop:
+ vpbroadcastq m0, [tlq+hq-8] ; left
+ pshufb m0, m3
+ psubw m0, m5 ; left - right
+ pmulhrsw m0, m4 ; weights are stored << 7, so this is ((left - right) * w + 128) >> 8
+ paddw m0, m5
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8: ; 4 rows per iteration, one row per 128-bit lane
+ vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2]
+ movsldup m3, [base+ipred_hv_shuf]
+.w8_loop:
+ vpbroadcastd m0, [tlq+hq-4] ; left pixels of rows 0-1
+ vpbroadcastd m1, [tlq+hq-8] ; left pixels of rows 2-3
+ pshufb m0, m3
+ pshufb m1, m3
+ psubw m0, m5
+ psubw m1, m5
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w8_loop
+ RET
+.w16: ; 4 rows per iteration, one register per row
+ movu m4, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+ vpbroadcastq m3, [tlq+hq-8] ; four left pixels
+ punpcklwd m3, m3 ; duplicate each pixel so a dword broadcast below fills the register with it
+ psubw m3, m5
+ pshufd m0, m3, q3333 ; row 0 uses the pixel at the highest address
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w16_loop
+ RET
+.w32: ; 2 rows per iteration, two registers per row
+ WIN64_SPILL_XMM 7
+ movu m4, [base+smooth_weights_1d_16bpc+32*2] ; weights for columns 0-15
+ movu m6, [base+smooth_weights_1d_16bpc+32*3] ; weights for columns 16-31
+.w32_loop:
+ vpbroadcastw m1, [tlq+hq-2]
+ vpbroadcastw m3, [tlq+hq-4]
+ psubw m1, m5
+ psubw m3, m5
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m6
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m6
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w32_loop
+ RET
+.w64: ; 1 row per iteration, four registers per row
+ WIN64_SPILL_XMM 8
+ movu m3, [base+smooth_weights_1d_16bpc+32*4] ; the 64 weights for w = 64, 16 per register
+ movu m4, [base+smooth_weights_1d_16bpc+32*5]
+ movu m6, [base+smooth_weights_1d_16bpc+32*6]
+ movu m7, [base+smooth_weights_1d_16bpc+32*7]
+.w64_loop:
+ vpbroadcastw m2, [tlq+hq-2] ; left pixel of this row
+ psubw m2, m5
+ pmulhrsw m0, m3, m2
+ pmulhrsw m1, m4, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*0], m0
+ pmulhrsw m0, m6, m2
+ mova [dstq+32*1], m1
+ pmulhrsw m1, m7, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ sub hq, 1*2
+ jg .w64_loop
+ RET
+
+%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
+ pmaddwd m0, m%1, m%3 ; dword sums of word-pair products for the low half
+ pmaddwd m1, m%2, m%4 ; same for the high half
+ paddd m0, m%5 ; add the caller's precomputed dword terms
+ paddd m1, m%6
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1 ; back to words
+ pavgw m0, m5 ; callers keep m5 at zero, so this is (x + 1) >> 1: the last bit of the shift, with rounding
+%endmacro
+
+cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights ; 2-D smooth: weighted blend of (top, bottom) by row and (left, right) by column, shifted by 8 and then averaged with zero
+%define base r6-ipred_smooth_16bpc_avx2_table
+ lea r6, [ipred_smooth_16bpc_avx2_table]
+ mov wd, wm
+ vpbroadcastw m4, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ sub tlq, hq
+ sub tlq, hq ; tl -= 2*h bytes: [tlq] is now the bottom pixel of the left edge
+ movsxd wq, [r6+wq*4]
+ pxor m5, m5 ; zero, needed by the pavgw rounding step
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] ; (weight, 256 - weight) pairs for a height of h
+ jmp wq
+.w4: ; 4 rows per iteration
+ WIN64_SPILL_XMM 11
+ vpbroadcastw m0, [tlq] ; bottom
+ vpbroadcastq m6, [tlq+hq*2+2]
+ movsldup m7, [base+ipred_hv_shuf] ; even dwords: mask that spreads one left pixel across each 64-bit quarter
+ movshdup m9, [base+ipred_hv_shuf] ; odd dwords: used below to shuffle the vertical weight pairs
+ vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4] ; horizontal weight pairs for w = 4
+ punpcklwd m6, m0 ; top, bottom
+ punpcklqdq m8, m9, m9
+ punpckhqdq m9, m9
+ lea r3, [strideq*3]
+.w4_loop:
+ vpbroadcastq m3, [tlq+hq*2-8] ; next four left pixels
+ vbroadcasti128 m1, [v_weightsq] ; vertical weight pairs of the next four rows
+ pshufb m3, m7
+ punpcklwd m2, m3, m4 ; left, right
+ punpckhwd m3, m4
+ pmaddwd m2, m10 ; horizontal term
+ pmaddwd m3, m10
+ pshufb m0, m1, m8
+ pshufb m1, m9
+ SMOOTH_2D_END 0, 1, 6, 6, 2, 3
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add v_weightsq, 16
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8: ; 2 rows per iteration; the assign below is assembly-time bookkeeping that lets this branch issue its own WIN64_SPILL_XMM
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ vpbroadcastw m0, [tlq] ; bottom
+ vbroadcasti128 m7, [tlq+hq*2+2]
+ movsldup m8, [base+ipred_hv_shuf]
+ movshdup m9, [base+ipred_hv_shuf]
+ vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0] ; horizontal weight pairs for w = 8
+ vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
+ punpcklwd m6, m7, m0 ; top, bottom
+ punpckhwd m7, m0
+.w8_loop:
+ vpbroadcastd m3, [tlq+hq*2-4] ; next two left pixels
+ vpbroadcastq m1, [v_weightsq] ; vertical weight pairs of the next two rows
+ pshufb m3, m8
+ punpcklwd m2, m3, m4 ; left, right
+ punpckhwd m3, m4
+ pmaddwd m2, m10
+ pmaddwd m3, m11
+ pshufb m1, m9
+ SMOOTH_2D_END 1, 1, 6, 7, 2, 3
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add v_weightsq, 8
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16: ; 2 rows per iteration, one register per row
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 11
+ vpbroadcastw m0, [tlq] ; bottom
+ movu m7, [tlq+hq*2+2]
+ mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0] ; horizontal weight pairs for w = 16, laid out to line up with the lane-wise punpckl/punpckh split of the top row
+ mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
+ vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
+ vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
+ punpcklwd m6, m7, m0 ; top, bottom
+ punpckhwd m7, m0
+.w16_loop:
+ vpbroadcastd m3, [tlq+hq*2-4] ; next two left pixels
+ vpbroadcastd m1, [v_weightsq+0] ; vertical weight pair of the first row
+ punpcklwd m3, m4 ; left, right
+ pshufd m2, m3, q1111 ; (left, right) pair of the first row
+ pmaddwd m10, m8, m2
+ pmaddwd m2, m9
+ pshufd m3, m3, q0000 ; (left, right) pair of the second row
+ SMOOTH_2D_END 1, 1, 6, 7, 10, 2
+ vpbroadcastd m1, [v_weightsq+4] ; vertical weight pair of the second row
+ pmaddwd m2, m8, m3
+ pmaddwd m3, m9
+ mova [dstq+strideq*0], m0
+ SMOOTH_2D_END 1, 1, 6, 7, 2, 3
+ mova [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ add v_weightsq, 8
+ sub hq, 2
+ jg .w16_loop
+ RET
+.w32: ; 1 row per iteration, two registers per row
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastw m0, [tlq] ; bottom
+ movu m7, [tlq+hq*2+ 2]
+ movu m9, [tlq+hq*2+34]
+ mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0] ; horizontal weight pairs for w = 32
+ mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
+ vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
+ vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
+ mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
+ mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
+ vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
+ vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
+ punpcklwd m6, m7, m0 ; top, bottom pairs, columns 0-15
+ punpckhwd m7, m0
+ punpcklwd m8, m9, m0 ; top, bottom pairs, columns 16-31
+ punpckhwd m9, m0
+.w32_loop:
+ vpbroadcastw m3, [tlq+hq*2-2] ; left pixel of this row
+ vpbroadcastd m14, [v_weightsq] ; vertical weight pair of this row
+ punpcklwd m3, m4 ; left, right
+ pmaddwd m1, m10, m3
+ pmaddwd m2, m11, m3
+ pmaddwd m0, m6, m14
+ paddd m0, m1
+ pmaddwd m1, m7, m14
+ paddd m1, m2
+ pmaddwd m2, m12, m3
+ pmaddwd m3, m13
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m5 ; the same tail as SMOOTH_2D_END, written out inline for the first half
+ mova [dstq+32*0], m0
+ SMOOTH_2D_END 14, 14, 8, 9, 2, 3
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ add v_weightsq, 4
+ dec hd
+ jg .w32_loop
+ RET
+.w64: ; processed as two 32-pixel-wide column passes; PROLOGUE re-declares the register names for this branch
+%assign stack_offset stack_offset - stack_size_padded
+ PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
+ mov dst_baseq, dstq
+ mov tl_baseq, tlq ; tl_base = the bottom pixel of the left edge
+ mov v_weights_baseq, v_weightsq
+ xor xq, xq ; x = first column of the current pass
+.w64_loop_x:
+ mov yq, hq
+ lea tlq, [tl_baseq+hq*2] ; back to the topleft position
+ vpbroadcastw m0, [tl_baseq] ; bottom
+ movu m7, [tlq+xq*2+ 2] ; 32 top pixels starting at column x
+ movu m9, [tlq+xq*2+34]
+ mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0] ; horizontal weight pairs for this pass; base (r6) is advanced between passes
+ mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
+ vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
+ vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
+ mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
+ mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
+ vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
+ vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
+ punpcklwd m6, m7, m0
+ punpckhwd m7, m0
+ punpcklwd m8, m9, m0
+ punpckhwd m9, m0
+ lea tlq, [tl_baseq-2] ; so that [tlq+yq*2] is the left pixel of the current row (y counts down from h)
+.w64_loop_y:
+ vpbroadcastw m3, [tlq+yq*2]
+ vpbroadcastd m1, [v_weightsq]
+ punpcklwd m3, m4 ; left, right
+ pmaddwd m14, m10, m3
+ pmaddwd m15, m11, m3
+ pmaddwd m2, m12, m3
+ pmaddwd m3, m13
+ pmaddwd m0, m6, m1
+ paddd m0, m14
+ pmaddwd m14, m7, m1
+ paddd m14, m15
+ psrld m0, 8
+ psrld m14, 8
+ packssdw m0, m14
+ pavgw m0, m5
+ mova [dstq+32*0], m0
+ SMOOTH_2D_END 8, 9, 1, 1, 2, 3
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ add v_weightsq, 4
+ dec yq
+ jg .w64_loop_y
+ lea dstq, [dst_baseq+32*2] ; second pass starts 64 bytes (32 pixels) to the right
+ add r6, 16*8 ; advance base by 128 bytes = 32 weight pairs, so the loads above fetch the weights for columns 32-63
+ mov v_weightsq, v_weights_baseq ; rewind the vertical weights
+ add xq, 32
+ test xb, 64
+ jz .w64_loop_x ; x goes 0 -> 32 -> 64, so exactly two passes run
+ RET
+
+cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z1_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea r7, [dr_intra_derivative]
+ movsxd wq, [r6+wq*4]
+ add tlq, 2
+ add wq, r6
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [r7+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m5, [pw_62]
+ jmp wq
+.w4:
+ ALLOC_STACK -64, 7
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ vpbroadcastw xm3, [tlq+14]
+ movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
+ paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ add dxd, dxd
+ palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
+ paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d
+ psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
+ psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
+ pxor xm4, xm4
+ paddw xm2, xm0
+ vpbroadcastw xm0, r8m ; pixel_max
+ mova [rsp+32], xm3
+ movd xm3, dxd
+ pmaxsw xm2, xm4
+ mov r3d, dxd
+ pavgw xm2, xm4
+ vpbroadcastw m3, xm3
+ pminsw xm2, xm0
+ punpcklwd xm0, xm1, xm2
+ punpckhwd xm1, xm2
+ lea r5, [strideq*3]
+ pslldq m2, m3, 8
+ mova [rsp+ 0], xm0
+ mova [rsp+16], xm1
+ paddw m6, m3, m3
+ paddw m3, m2
+ vpblendd m4, m6, 0xf0
+ paddw m6, m6
+ paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3
+ vbroadcasti128 m4, [z_upsample]
+.w4_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu xm2, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base2
+ vinserti128 m1, [rsp+r3*2], 1 ; 0 2
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base3
+ vinserti128 m2, [rsp+r2*2], 1 ; 1 3
+ pshufb m1, m4
+ pshufb m2, m4
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m5, m3 ; frac
+ psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6
+ psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6)
+ pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
+ paddw m3, m6 ; xpos += dx
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r5 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+%define base r3-z_filter_t0
+ movd xm0, maxbased
+ lea r3, [z_filter_t0]
+ movd xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m1, xm1
+ pcmpeqb m0, [base+z_filter_wh]
+ mova xm2, [r3+angleq*8]
+ pand m0, m1
+ pcmpgtb m0, m2
+ pmovmskb r5d, m0
+ ret
+.w4_no_upsample:
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea maxbased, [hq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastw xm3, [tlq+14]
+ mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
+ palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8
+ pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ paddw xm2, xm0
+ pmullw xm2, xm4
+ movd [rsp+16], xm3
+ cmp r5d, 3
+ jne .w4_3tap
+ paddw xm1, xm2
+ palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8
+ pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6
+ movzx r3d, word [tlq+14]
+ movzx r2d, word [tlq+12]
+ inc maxbased
+ paddw xm2, xm0
+ sub r2d, r3d
+ paddw xm2, xm2
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
+ mov [rsp+16], r2w
+.w4_3tap:
+ pxor xm0, xm0
+ paddw xm1, xm2
+ mov tlq, rsp
+ psrlw xm1, 3
+ cmp hd, 8
+ sbb maxbased, -1
+ pavgw xm0, xm1
+ mova [tlq], xm0
+.w4_main:
+ movd xm3, dxd
+ vpbroadcastq m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x]
+ shl maxbased, 6 ; same scale as xpos (base = xpos >> 6)
+ vpbroadcastw m3, xm3
+ movd xm0, maxbased
+ mov r3d, dxd ; xpos
+ vpbroadcastw m0, xm0
+ paddw m4, m3, m3 ; 2*dx
+ psubw m1, m0 ; -max_base_x
+ vpblendd m3, m4, 0xcc
+ paddw m0, m4, m3
+ vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
+ paddw m4, m4 ; 4*dx: vector xpos step per iteration (four rows)
+ paddw m3, m1 ; biased xpos: its sign bit is the "xpos < max_base_x" test used below
+.w4_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [tlq+r3*2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ movu xm2, [tlq+r5*2]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ vinserti128 m1, [tlq+r3*2], 1 ; 0 2
+ lea r3d, [r5+dxq] ; scalar xpos of the next iteration's first row
+ shr r5d, 6 ; base3
+ vinserti128 m2, [tlq+r5*2], 1 ; 1 3
+ punpcklqdq m0, m1, m2 ; pixels at base (a) for the four rows
+ psrldq m1, 2
+ pslldq m2, 6
+ vpblendd m1, m2, 0xcc ; pixels at base+1 (b) in the same row layout
+ pand m2, m5, m3 ; m5 is set before this section - presumably the frac mask; TODO confirm
+ psllw m2, 9 ; frac << 9 so that pmulhrsw implements the rounded >> 6
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; xpos < max_base_x
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2 ; interpolated value where the mask is set, top[max_base_x] elsewhere
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w4_loop ; keep interpolating while the next row's xpos is below max_base_x << 6
+ lea r6, [strideq*3]
+.w4_end_loop:
+ movq [dstq+strideq*0], xm6 ; remaining rows are filled with top[max_base_x]
+ movq [dstq+strideq*1], xm6
+ movq [dstq+strideq*2], xm6
+ movq [dstq+r6 ], xm6
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 7
+ lea r3d, [angleq+216]
+ mov r3b, hb ; low byte replaced by h so that one unsigned compare covers every condition
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _
+ movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _
+ movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ cmp hd, 4
+ jne .w8_upsample_h8 ; awkward single-pixel edge case
+ vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _
+.w8_upsample_h8:
+ paddw m2, m1 ; sum of the two centre pixels
+ paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ add dxd, dxd ; dx is doubled for the interleaved (upsampled) edge
+ psubw m0, m2, m0 ; centre sum minus outer sum
+ psraw m0, 3
+ pxor m4, m4
+ paddw m2, m0
+ vpbroadcastw m0, r8m ; upper clamp for the pminsw below
+ movd xm3, dxd
+ pmaxsw m2, m4 ; clamp at zero
+ mov r3d, dxd ; scalar xpos
+ pavgw m2, m4 ; m4 is zero: (x + 1) >> 1
+ vpbroadcastw m3, xm3
+ pminsw m2, m0
+ punpcklwd m0, m1, m2 ; interleave original and interpolated pixels
+ punpckhwd m1, m2
+ vbroadcasti128 m4, [z_upsample]
+ mova [rsp+ 0], xm0 ; the four 128-bit pieces go to the stack in pixel order
+ mova [rsp+16], xm1
+ paddw m6, m3, m3 ; 2*dx: vector xpos step per iteration (two rows)
+ vextracti128 [rsp+32], m0, 1
+ vextracti128 [rsp+48], m1, 1
+ vpblendd m3, m6, 0xf0 ; xpos0 xpos1
+.w8_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [rsp+r3*2]
+ movu xm2, [rsp+r3*2+16]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ vinserti128 m1, [rsp+r2*2], 1
+ vinserti128 m2, [rsp+r2*2+16], 1
+ pshufb m1, m4 ; z_upsample shuffle
+ pshufb m2, m4
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m5, m3 ; m5 is set before this section - presumably the frac mask; TODO confirm
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m3, m6
+ paddw m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(h+7, 15)
+ jmp .w8_main
+.w8_no_upsample:
+ lea maxbased, [hq+7] ; also the value .filter_strength matches against z_filter_wh
+ test angled, 0x400
+ jnz .w8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main ; empty mask: use the edge unfiltered
+ popcnt r5d, r5d ; number of set mask bits, used to index z_filter_k
+ vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m2
+ cmp hd, 8 ; flags feed the jl below and the je after the blend (SIMD ops leave them alone)
+ jl .w8_filter_h4
+ punpckhwd m2, m2
+ vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ je .w8_filter_end ; 8x4 and 8x8 are always 3-tap
+ movzx r3d, word [tlq+30]
+ mov maxbased, 16
+ mov [rsp+32], r3d ; stored right behind the 32-byte filtered row written at [rsp] below
+ cmp r5d, 3
+ jne .w8_filter_end
+ punpcklwd xm6, xm0, xm0
+ vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
+ vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movzx r5d, word [tlq+28]
+ mov [rsp+34], r3w
+ paddw m2, m6
+ sub r5d, r3d
+ inc maxbased ; 16 -> 17 on this path
+ paddw m2, m2 ; the outer-tap sum is added with weight 2
+ lea r3d, [r5+r3*8+4]
+ paddw m1, m2
+ shr r3d, 3 ; (1 * [tlq+28] + 7 * [tlq+30] + 4) >> 3
+ mov [rsp+32], r3w
+ jmp .w8_filter_end
+.w8_filter_h4:
+ pshuflw m3, m2, q3321
+ vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _
+.w8_filter_end:
+ paddw m0, m3 ; sum of the left and right neighbours
+ pmullw m0, m4
+ mov tlq, rsp ; from here on the edge is read from the filtered copy on the stack
+ pxor m2, m2
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2 ; m2 is zero: (x + 1) >> 1
+ mova [tlq], m0
+.w8_main:
+ movd xm3, dxd
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; edge pixel at index maxbase, used as the fill value
+ shl maxbased, 6 ; same scale as xpos (base = xpos >> 6)
+ vpbroadcastw m3, xm3
+ movd xm0, maxbased
+ mov r3d, dxd ; scalar xpos
+ vpbroadcastw m0, xm0
+ paddw m4, m3, m3 ; 2*dx: vector xpos step per iteration (two rows)
+ psubw m1, m0 ; z_base_inc - (maxbase << 6)
+ vpblendd m3, m4, 0xf0 ; xpos0 xpos1
+ paddw m3, m1 ; biased xpos: the sign bit is sampled by the psraw below
+.w8_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base of the first row
+ movu xm0, [tlq+r3*2]
+ movu xm1, [tlq+r3*2+2]
+ lea r3d, [r5+dxq] ; scalar xpos of the next iteration's first row
+ shr r5d, 6 ; base of the second row
+ vinserti128 m0, [tlq+r5*2], 1
+ vinserti128 m1, [tlq+r5*2+2], 1
+ pand m2, m5, m3 ; m5 is set before this section - presumably the frac mask; TODO confirm
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; all-ones where the biased xpos is negative
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2 ; interpolated value where the mask is set, fill value elsewhere
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w8_loop ; keep interpolating while the next row's xpos is below maxbase << 6
+.w8_end_loop:
+ mova [dstq+strideq*0], xm6 ; remaining rows are filled with the value in m6
+ mova [dstq+strideq*1], xm6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(h+15, 31)
+ jmp .w16_main
+.w16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 7
+ lea maxbased, [hq+15] ; also the value .filter_strength matches against z_filter_wh
+ test angled, 0x400
+ jnz .w16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w16_main ; empty mask: use the edge unfiltered
+ popcnt r5d, r5d ; number of set mask bits
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ cmp r5d, 3
+ jne .w16_filter_3tap
+ vpbroadcastd m2, [base+pw_3]
+ punpcklwd xm0, xm0
+ vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m0, m2
+ pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m0, m1
+ psrlw m0, 2 ; m0 = filtered first 16 pixels
+ movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ cmp hd, 8 ; flags feed the jl below and the je after the unpack
+ jl .w16_filter_5tap_h4
+ punpckhwd m3, m3
+ je .w16_filter_5tap_h8
+ vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movzx r3d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ pavgw m2, m4
+ sub r2d, r3d
+ paddw m1, m3
+ lea r2d, [r2+r3*8+4]
+ paddw m1, m2
+ shr r2d, 3 ; (1 * [tlq+60] + 7 * [tlq+62] + 4) >> 3
+ psrlw m1, 2
+ mov [rsp+66], r3w ; two extra words right behind the 64 bytes stored at .w16_filter_end2
+ mov [rsp+64], r2w
+ mov tlq, rsp ; NOTE(review): .w16_filter_end2 performs the same mov again
+ mov r3d, 33
+ cmp hd, 16
+ cmovg maxbased, r3d ; maxbased = 33 when h > 16
+ jmp .w16_filter_end2
+.w16_filter_5tap_h8:
+ vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
+ vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
+ pavgw xm2, xm4
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2
+ jmp .w16_filter_end2
+.w16_filter_5tap_h4:
+ pshuflw xm4, xm3, q3332 ; 4 5 5 5
+ pshuflw xm3, xm3, q3321 ; 3 4 5 5
+ pavgw xm2, xm4
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2
+ jmp .w16_filter_end2
+.w16_filter_3tap:
+ vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m4
+ pmullw m3, m2
+ paddw m0, m1
+ cmp hd, 8
+ je .w16_filter_3tap_h8
+ jl .w16_filter_3tap_h4
+ punpckhwd m2, m2
+ vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ jmp .w16_filter_end
+.w16_filter_3tap_h4:
+ pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _
+ jmp .w16_filter_end
+.w16_filter_3tap_h8:
+ psrldq xm2, 2
+ pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8
+.w16_filter_end:
+ paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ pmullw m2, m4
+ psrlw m0, 3
+ pxor m1, m1
+ paddw m2, m3
+ psrlw m2, 3
+ pavgw m0, m1 ; m1 is zero: (x + 1) >> 1
+ pavgw m1, m2 ; same rounding for the second half, result in m1
+.w16_filter_end2:
+ mov tlq, rsp ; from here on the edge is read from the filtered copy on the stack
+ mova [tlq+ 0], m0
+ mova [tlq+32], m1
+.w16_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; edge pixel at index maxbase, used as the fill value
+ shl maxbased, 6 ; same scale as xpos (base = xpos >> 6)
+ vpbroadcastw m4, xm4 ; dx in every word: vector xpos step per row
+ movd xm0, maxbased
+ mov r3d, dxd ; scalar xpos
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ psubw m3, m0 ; biased xpos: the sign bit is sampled by the psraw below
+.w16_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base of the first row
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ lea r3d, [r5+dxq] ; scalar xpos of the next iteration's first row
+ shr r5d, 6 ; base of the second row
+ pand m2, m5, m3 ; m5 is set before this section - presumably the frac mask; TODO confirm
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; all-ones where the biased xpos is negative
+ paddw m3, m4
+ paddw m1, m0
+ movu m0, [tlq+r5*2] ; second-row loads are interleaved with the first row's blend and store
+ vpblendvb m2, m6, m1, m2 ; interpolated value where the mask is set, fill value elsewhere
+ movu m1, [tlq+r5*2+2]
+ mova [dstq+strideq*0], m2
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w16_loop ; keep interpolating while the next row's xpos is below maxbase << 6
+.w16_end_loop:
+ mova [dstq+strideq*0], m6 ; remaining rows are filled with the value in m6
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -160, 8
+ lea maxbased, [hq+31]
+ mov r3d, 63
+ cmp hd, 32
+ cmova maxbased, r3d ; maxbased = 63 when h > 32
+ test angled, 0x400
+ jnz .w32_main ; bit 10 set: use the edge unfiltered
+ vpbroadcastd m2, [pw_3]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m1, m2
+ paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ mov r3, rsp ; r3 = write pointer into the filtered copy on the stack
+ paddw m0, m1
+ lea r5d, [maxbaseq-31] ; loop counter, reduced by 16 per filtered vector
+ psrlw m0, 2
+ mova [r3], m0
+.w32_filter_loop:
+ mova m0, [tlq+30]
+ paddw m1, m2, [tlq+28]
+ add tlq, 32 ; advance the source by 16 pixels; the offsets below are relative to the new tlq
+ paddw m0, [tlq+0]
+ pavgw m1, [tlq+4]
+ paddw m0, [tlq+2]
+ add r3, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r3], m0
+ sub r5d, 16
+ jg .w32_filter_loop
+ movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ punpckhwd m1, m0, m0
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ jl .w32_filter_h8 ; still the flags of the last sub r5d, 16 (only SIMD ops in between)
+ vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movzx r5d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ pavgw m2, m3
+ sub r2d, r5d
+ paddw m0, m1
+ lea r2d, [r2+r5*8+4]
+ paddw m0, m2
+ shr r2d, 3 ; (1 * [tlq+60] + 7 * [tlq+62] + 4) >> 3
+ psrlw m0, 2
+ mova [r3+32], m0
+ mov [r3+66], r5w ; two extra words right behind the vector just stored
+ mov [r3+64], r2w
+ mov tlq, rsp ; from here on the edge is read from the filtered copy on the stack
+ mov r3d, 65
+ cmp hd, 64
+ cmove maxbased, r3d ; maxbased = 65 when h == 64
+ jmp .w32_main
+.w32_filter_h8:
+ vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
+ vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
+ pavgw xm2, xm3
+ paddw xm0, xm1
+ mov tlq, rsp ; from here on the edge is read from the filtered copy on the stack
+ paddw xm0, xm2
+ psrlw xm0, 2
+ mova [r3+32], xm0 ; only 8 more pixels are stored on this path
+.w32_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; edge pixel at index maxbase, used as the fill value
+ shl maxbased, 6 ; same scale as xpos (base = xpos >> 6)
+ vpbroadcastw m4, xm4 ; dx in every word: vector xpos step per row
+ movd xm0, maxbased
+ mov r5d, dxd ; scalar xpos
+ vpbroadcastd m7, [pw_m1024] ; -16 * 64
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ psubw m3, m0 ; biased xpos for the first 16 pixels of the row
+.w32_loop:
+ mov r3d, r5d
+ shr r3d, 6 ; base for this row
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m5, m3 ; m5 is set before this section - presumably the frac mask; TODO confirm
+ psllw m2, 9 ; the same frac vector is reused for both halves of the row
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ psraw m1, m3, 15 ; first half: all-ones where the biased xpos is negative
+ vpblendvb m0, m6, m0, m1 ; interpolated value where the mask is set, fill value elsewhere
+ mova [dstq+32*0], m0
+ movu m0, [tlq+r3*2+32]
+ movu m1, [tlq+r3*2+34]
+ add r5d, dxd ; scalar xpos of the next row
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m7, m3 ; second half: all-ones where the biased xpos is below -16 * 64
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+32*1], m0
+ dec hd
+ jz .w32_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w32_loop ; keep interpolating while the next row's xpos is below maxbase << 6
+.w32_end_loop:
+ mova [dstq+32*0], m6 ; remaining rows are filled with the value in m6
+ mova [dstq+32*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -256, 10
+ lea maxbased, [hq+63]
+ test angled, 0x400
+ jnz .w64_main ; bit 10 set: use the edge unfiltered
+ vpbroadcastd m2, [pw_3]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m1, m2
+ paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ mov r3, rsp ; r3 = write pointer into the filtered copy on the stack
+ paddw m0, m1
+ lea r5d, [hq+32] ; loop counter, reduced by 16 per filtered vector
+ psrlw m0, 2
+ mova [r3], m0
+.w64_filter_loop:
+ mova m0, [tlq+30]
+ paddw m1, m2, [tlq+28]
+ add tlq, 32 ; advance the source by 16 pixels; the offsets below are relative to the new tlq
+ paddw m0, [tlq+0]
+ pavgw m1, [tlq+4]
+ paddw m0, [tlq+2]
+ add r3, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r3], m0
+ sub r5d, 16
+ jg .w64_filter_loop
+ movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ punpckhwd m1, m0, m0
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ pavgw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ mov tlq, rsp ; from here on the edge is read from the filtered copy on the stack
+ psrlw m0, 2
+ mova [r3+32], m0 ; last filtered vector, with the final pixel replicated as padding
+.w64_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; edge pixel at index maxbase, used as the fill value
+ shl maxbased, 6 ; same scale as xpos (base = xpos >> 6)
+ vpbroadcastw m4, xm4 ; dx in every word: vector xpos step per row
+ movd xm0, maxbased
+ mov r5d, dxd ; scalar xpos
+ vpbroadcastd m7, [pw_m1024] ; -16 * 64
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ paddw m8, m7, m7 ; -32 * 64
+ psubw m3, m0 ; biased xpos for the first 16 pixels of the row
+ paddw m9, m8, m7 ; -48 * 64
+.w64_loop:
+ mov r3d, r5d
+ shr r3d, 6 ; base for this row
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m5, m3 ; m5 is set before this section - presumably the frac mask; TODO confirm
+ psllw m2, 9 ; the same frac vector is reused for all four quarters of the row
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ psraw m1, m3, 15 ; quarter 0: all-ones where the biased xpos is negative
+ vpblendvb m0, m6, m0, m1 ; interpolated value where the mask is set, fill value elsewhere
+ mova [dstq+32*0], m0
+ movu m0, [tlq+r3*2+32]
+ movu m1, [tlq+r3*2+34]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m7, m3 ; quarter 1: biased xpos below -16 * 64
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*1], m0
+ movu m0, [tlq+r3*2+64]
+ movu m1, [tlq+r3*2+66]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m8, m3 ; quarter 2: biased xpos below -32 * 64
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*2], m0
+ movu m0, [tlq+r3*2+96]
+ movu m1, [tlq+r3*2+98]
+ add r5d, dxd ; scalar xpos of the next row
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m9, m3 ; quarter 3: biased xpos below -48 * 64
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+32*3], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w64_loop ; keep interpolating while the next row's xpos is below maxbase << 6
+.w64_end_loop:
+ mova [dstq+32*0], m6 ; remaining rows are filled with the value in m6
+ mova [dstq+32*1], m6
+ mova [dstq+32*2], m6
+ mova [dstq+32*3], m6
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+
+cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+ lea r9, [ipred_z2_16bpc_avx2_table]
+ tzcnt wd, wm ; trailing-zero count of the width selects the jump-table entry
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea dxq, [dr_intra_derivative-90]
+ movsxd wq, [r9+wq*4] ; 32-bit table entry, turned into an address by the add below
+ mova m1, [tlq- 0]
+ movzx dyd, angleb ; low byte of angle
+ xor angled, 0x400 ; flip bit 10; the width paths test it as !enable_intra_edge_filter
+ mova m2, [tlq- 32]
+ mov r8, dxq
+ sub dxq, dyq
+ mova m3, [tlq- 64]
+ add wq, r9 ; wq = jump target for this width
+ add r9, z_filter_t0-ipred_z2_16bpc_avx2_table ; r9 = z_filter_t0, as the base define expects
+ mova m4, [tlq- 96]
+ and dyd, ~1
+ mova m5, [tlq-128]
+ and dxq, ~1
+ movzx dyd, word [r8+dyq] ; angle - 90
+ movzx dxd, word [dxq+270] ; 180 - angle
+ vpbroadcastd m11, [base+pw_62]
+ mova [rsp+128], m1 ; stack copy of [tlq-128, tlq+32): rsp+128 corresponds to tlq
+ mova [rsp+ 96], m2
+ mova [rsp+ 64], m3
+ neg dxd ; both table values are negated before use
+ mova [rsp+ 32], m4
+ neg dyq
+ mova [rsp+ 0], m5
+ jmp wq ; dispatch to .w4/.w8/.w16/.w32/.w64
+.w4:
+ vbroadcasti128 m10, [base+z2_x_shuf]
+ vpbroadcastq m6, [base+z_base_inc+2]
+ lea r8d, [dxq+(65<<6)] ; xpos
+ mov r10d, (63-4)<<6 ; xpos threshold below which the row loop hands over to .w4_leftonly_loop
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ movq xm0, [tlq+2] ; 1 2 3 4
+ movq xm1, [tlq+0] ; 0 1 2 3
+ pshuflw xm2, xm0, q3321 ; 2 3 4 4
+ pshuflw xm3, xm1, q2100 ; 0 0 1 2
+ vpbroadcastw xm4, r8m ; pixel_max
+ vbroadcasti128 m10, [base+z_upsample] ; replaces the z2_x_shuf loaded at the top of .w4
+ paddw xm1, xm0 ; sum of the two centre pixels
+ paddw xm2, xm3 ; sum of the two outer pixels
+ lea r8d, [r8+dxq+(1<<6)] ; xpos re-based for the upsampled top row
+ psubw xm2, xm1, xm2
+ add dxd, dxd ; dx is doubled for the upsampled top row
+ psraw xm2, 3
+ pxor xm3, xm3
+ sub r10d, 3<<6 ; the left-only threshold is adjusted for the upsampled row as well
+ paddw xm1, xm2
+ paddw m6, m6
+ pmaxsw xm1, xm3 ; clamp at zero
+ sub angled, 1075 ; angle - 53
+ pavgw xm1, xm3 ; xm3 is zero: (x + 1) >> 1
+ lea r3d, [hq+3] ; argument for .filter_strength
+ pminsw xm1, xm4 ; clamp at pixel_max
+ xor angled, 0x7f ; 180 - angle
+ punpcklwd xm1, xm0 ; interleave interpolated and original pixels
+ movu [rsp+130], xm1 ; overwrites the stack copy starting at tl+2
+ call .filter_strength
+ jmp .w4_filter_left
+ALIGN function_align
+.filter_strength:
+ movd xm8, r3d ; r3d (set by the caller) = value to match against z_filter_wh
+ mov r3d, angled
+ movd xm7, angled
+ vpbroadcastb m8, xm8
+ shr r3d, 8 ; is_sm << 1
+ vpbroadcastb m7, xm7 ; low byte of angled in every byte lane
+ pcmpeqb m8, [base+z_filter_wh] ; byte mask of the z_filter_wh entries equal to that value
+ mova xm9, [r9+r3*8] ; 16 bytes of z_filter_t0 (r9), offset by (angled >> 8) * 8
+ pand m0, m8, m7 ; angle byte kept only in the matching lanes
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0 ; result mask in r3d; m7/m8/m9 stay live and are reused by some callers
+ ret
+ALIGN function_align
+.upsample_left: ; h4/h8
+ mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1
+ movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0
+%if STACK_ALIGNMENT < 32
+ vpbroadcastw xm4, r8m ; pixel_max
+%else
+ vpbroadcastw xm4, r9m ; r8m -> r9m due to call
+%endif
+ cmp hd, 8
+ je .upsample_left_h8
+ pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2
+ pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0
+ jmp .upsample_left_end
+.upsample_left_h8:
+ pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2
+ pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0
+.upsample_left_end:
+ paddw xm1, xm0 ; sum of the two centre pixels
+ paddw xm2, xm3 ; sum of the two outer pixels
+ psubw xm2, xm1, xm2
+ add dyq, dyq ; dy is doubled for the upsampled left edge
+ psraw xm2, 3
+ pxor xm3, xm3
+ paddw xm1, xm2
+ pmaxsw xm1, xm3 ; clamp at zero
+ pavgw xm1, xm3 ; xm3 is zero: (x + 1) >> 1
+ pminsw xm1, xm4 ; clamp at pixel_max
+ punpcklwd xm2, xm0, xm1 ; interleave original and interpolated pixels
+ punpckhwd xm0, xm1
+ mova [rsp+ 96+gprsize], xm2 ; +gprsize skips the return address pushed by the call
+ mova [rsp+112+gprsize], xm0
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3] ; argument for .filter_strength
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above ; empty mask: leave the top edge unfiltered
+ popcnt r3d, r3d ; number of set mask bits, used to index z_filter_k
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0]
+ psrldq xm0, xm1, 2 ; 1 2 3 4
+ pshuflw xm2, xm1, q2100 ; 0 0 1 2
+ pmullw xm4, xm0
+ pshuflw xm3, xm0, q3321 ; 2 3 4 4
+ paddw xm1, xm3
+ pshuflw xm3, xm0, q3332 ; 3 4 4 4
+ pmullw xm1, xm5
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2]
+ paddw xm2, xm3
+ vpbroadcastd xm3, r6m ; max_width
+ pmullw xm2, xm5
+ packssdw xm3, xm3
+ paddw xm1, xm4
+ paddw xm1, xm2
+ psubw xm3, [base+pw_1to16] ; max_width minus the 1-based pixel index
+ pxor xm4, xm4
+ psrlw xm1, 3
+ pminsw xm3, xm11 ; clip to byte range since there's no variable word blend
+ pavgw xm1, xm4 ; xm4 is zero: (x + 1) >> 1
+ vpblendvb xm1, xm0, xm3 ; unfiltered pixel (xm0) wherever the xm3 mask is negative
+ movq [rsp+130], xm1 ; overwrites the stack copy starting at tl+2
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm0, [base+pb_90]
+ psubb xm0, xm7 ; 180 - angle
+ pand xm0, xm8 ; reuse from previous filter_strength call
+ pcmpgtb xm0, xm9
+ pmovmskb r3d, xm0 ; left-edge mask, consumed at .w4_filter_left
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main ; empty mask: leave the left edge unfiltered
+ popcnt r3d, r3d ; number of set mask bits
+ mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ vpbroadcastd m5, r7m ; max_height
+ cmp r3d, 3
+ je .w4_filter_left_s3
+ vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pmullw m2, m0
+ cmp hd, 8 ; flags feed the jl below and the je after the blend
+ jl .w4_filter_left_h4
+ movu m4, [tlq-34]
+ punpcklwd m1, m0, m0
+ vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ je .w4_filter_left_end
+ vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ jmp .w4_filter_left_end
+.w4_upsample_left:
+ call .upsample_left
+ mov r11, -16 ; left-edge pointer step per iteration on the upsampled path
+ vbroadcasti128 m9, [base+z_upsample]
+ jmp .w4_main_upsample_left
+.w4_filter_left_s3: ; can only be h16
+ movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m4, [base+pw_3]
+ paddw m1, m0, m2
+ punpckhwd m2, m2
+ vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ punpcklwd xm3, xm0, xm0
+ paddw m2, m4
+ vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d
+ paddw m1, m4
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m1, 2
+ jmp .w4_filter_left_end2
+.w4_filter_left_h4:
+ pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e
+.w4_filter_left_end:
+ paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m3
+ paddw m1, m2
+ pxor m2, m2
+ psrlw m1, 3
+ pavgw m1, m2 ; m2 is zero: (x + 1) >> 1
+.w4_filter_left_end2:
+ packssdw m5, m5
+ psubw m5, [base+pw_16to1] ; max_height minus the per-lane value from pw_16to1
+ pminsw m5, m11 ; capped at 62 (m11 = pw_62) before the byte-wise blend
+ vpblendvb m1, m0, m5 ; unfiltered pixel (m0) wherever the m5 mask is negative
+ mova [rsp+96], m1 ; overwrites the stack copy of [tlq-32, tlq)
+.w4_main:
+ vbroadcasti128 m9, [base+z2_x_shuf]
+ mov r11, -8 ; left-edge pointer step per iteration (added to dyq in the loops)
+.w4_main_upsample_left:
+ movd xm5, dyd
+ mova m4, [base+z2_y_shuf_h4]
+ mov r2d, r8d ; scalar xpos
+ movd xm0, dxd
+ vpbroadcastw m5, xm5
+ rorx r5, dyq, 5
+ lea r8d, [dyq*3]
+ pmullw m5, [base+z2_ymul]
+ rorx r9, dyq, 4
+ sar dyd, 6
+ vpbroadcastw m0, xm0
+ sar r8d, 6
+ pand m5, m11 ; frac_y
+ neg dyd
+ psllw m5, 9
+ add r5d, dyd ; r5/r8/r9 end up as offsets relative to the address in dyq
+ add r8d, dyd
+ add r9d, dyd
+ paddw m7, m0, m0 ; 2*dx
+ lea dyq, [rsp+dyq*2+126] ; dyq now points into the stack copy of the left edge
+ vpblendd m0, m7, 0xcc
+ add dyq, r11
+ neg r5d
+ paddw m1, m0, m7
+ neg r8d
+ vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3
+ neg r9d
+ paddw m7, m7 ; 4*dx: vector xpos step per iteration (four rows)
+ paddw m6, m0
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm1, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movu xm3, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ vinserti128 m1, [rsp+r2*2], 1
+ lea r2d, [r3+dxq] ; scalar xpos of the next iteration's first row
+ shr r3d, 6 ; base_x3
+ vinserti128 m3, [rsp+r3*2], 1
+ pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3
+ pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3
+ pand m2, m11, m6 ; horizontal fraction bits (m11 = pw_62)
+ punpcklqdq m0, m1, m3
+ punpckhqdq m1, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1 ; m0 = prediction from the top edge
+ cmp r3d, 64
+ jge .w4_toponly ; base_x3 >= 64: skip the left-edge computation for these rows
+ movu xm2, [dyq]
+ vinserti128 m2, [dyq+r8*2], 1
+ movu xm3, [dyq+r5*2]
+ vinserti128 m3, [dyq+r9*2], 1
+ pshufb m2, m9
+ pshufb m3, m9
+ punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m5
+ psraw m3, m6, 15 ; base_x < topleft
+ paddw m1, m2
+ vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3
+ vpblendvb m0, m1, m3 ; left-edge result where base_x < topleft
+.w4_toponly:
+ paddw m6, m7 ; xpos += dx
+ lea r3, [strideq*3]
+ add dyq, r11 ; move the left-edge pointer on for the next four rows
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r10d
+ jge .w4_loop ; below the threshold in r10d the rows are handled by the left-only loop
+.w4_leftonly_loop:
+ movu xm1, [dyq]
+ vinserti128 m1, [dyq+r8*2], 1
+ movu xm2, [dyq+r5*2]
+ vinserti128 m2, [dyq+r9*2], 1
+ add dyq, r11
+ pshufb m1, m9
+ pshufb m2, m9
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ psubw m1, m0
+ pmulhrsw m1, m5 ; vertical interpolation with frac_y << 9
+ paddw m0, m1
+ vpermd m0, m4, m0 ; reorder into row-major output order
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ RET
+.w8:
+ mov r10d, hd ; low byte = h; bits 8 and up count further 8-column strips (see .w8_end)
+ test angled, 0x400
+ jnz .w8_main ; bit 10 set: skip all edge filtering and upsampling
+ lea r3d, [angleq+126]
+ xor r8d, r8d ; r8d = 0: top edge not upsampled (tested after the left-edge filter)
+ mov r3b, hb ; low byte replaced by h so that one unsigned compare covers every condition
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8
+ mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7
+ pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8
+ pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6
+ vpbroadcastw xm4, r8m ; pixel_max
+ paddw xm1, xm0 ; sum of the two centre pixels
+ paddw xm2, xm3 ; sum of the two outer pixels
+ not r8d ; r8d != 0 marks the upsampled-top path
+ psubw xm2, xm1, xm2
+ add dxd, dxd ; dx is doubled for the upsampled top row
+ psraw xm2, 3
+ sub angled, 53 ; angle - 53
+ pxor xm3, xm3
+ paddw xm2, xm1
+ lea r3d, [hq+7] ; argument for .filter_strength
+ pmaxsw xm2, xm3 ; clamp at zero
+ xor angled, 0x7f ; 180 - angle
+ pavgw xm2, xm3 ; xm3 is zero: (x + 1) >> 1
+ pminsw xm2, xm4 ; clamp at pixel_max
+ punpcklwd xm1, xm2, xm0 ; interleave interpolated and original pixels
+ punpckhwd xm2, xm0
+ movu [rsp+130], xm1 ; overwrites the stack copy starting at tl+2
+ movu [rsp+146], xm2
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7] ; argument for .filter_strength
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above ; empty mask: leave the top edge unfiltered
+ popcnt r3d, r3d ; number of set mask bits, used to index z_filter_k
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2]
+ movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x
+ pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x
+ pmullw xm4, xm0
+ pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x
+ paddw xm1, xm3
+ vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x
+ paddw xm2, xm3
+ vpbroadcastd xm3, r6m ; max_width
+ pmullw xm1, xm5
+ pmullw xm2, xm6
+ packssdw xm3, xm3
+ paddw xm1, xm4
+ paddw xm1, xm2
+ psubw xm3, [base+pw_1to16] ; max_width minus the 1-based pixel index
+ pxor xm4, xm4
+ psrlw xm1, 3
+ pminsw xm3, xm11 ; capped at 62 (m11 = pw_62) before the byte-wise blend
+ pavgw xm1, xm4 ; xm4 is zero: (x + 1) >> 1
+ vpblendvb xm1, xm0, xm3 ; unfiltered pixel (xm0) wherever the xm3 mask is negative
+ movu [rsp+130], xm1 ; overwrites the stack copy starting at tl+2
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb ; same combined-compare trick as at the top of .w8
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7 ; m7/m8/m9 are still those left by .filter_strength
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0 ; left-edge mask, consumed at .w8_filter_left
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main ; empty mask: leave the left edge unfiltered
+ popcnt r3d, r3d ; number of set mask bits
+ cmp r3d, 3
+ jne .w8_filter_left_s12
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m7, [base+pw_16]
+ cmp hd, 16 ; flags needed for later
+ jmp .filter_left_s3b
+.w8_upsample_left:
+ call .upsample_left
+ vbroadcasti128 m7, [base+z2_y_shuf_us]
+ lea r11, [rsp+118] ; left-edge base pointer for the upsampled path
+ mov r8, -8 ; left-edge pointer step per two rows on the upsampled path
+ jmp .w8_main_upsample_left
+.w16_filter_left_s12:
+ xor r8d, r8d ; r8d = 0 so the test at the end of this block falls through to .w8_main
+.w8_filter_left_s12:
+ mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ vpbroadcastd m5, r7m ; max_height
+ vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pmullw m2, m0
+ cmp hd, 8 ; flags feed the jl below and the je after the blend
+ jl .w8_filter_left_h4
+ movu m4, [tlq-34]
+ punpcklwd m1, m0, m0
+ vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ je .w8_filter_left_end
+ vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ jmp .w8_filter_left_end
+.w8_filter_left_h4:
+ pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e
+.w8_filter_left_end:
+ paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m3
+ paddw m1, m2
+ pxor m2, m2
+ psrlw m1, 3
+ pavgw m1, m2 ; m2 is zero: (x + 1) >> 1
+ packssdw m5, m5
+ psubw m5, [base+pw_16to1] ; max_height minus the per-lane value from pw_16to1
+ pminsw m5, m11 ; capped at 62 (m11 = pw_62) before the byte-wise blend
+ vpblendvb m1, m0, m5 ; unfiltered pixel (m0) wherever the m5 mask is negative
+ mova [rsp+96], m1 ; overwrites the stack copy of [tlq-32, tlq)
+ test r8d, r8d ; non-zero only when .w8 upsampled the top edge
+ jz .w8_main
+; upsample_main
+ vbroadcasti128 m10, [base+z_upsample]
+ vbroadcasti128 m7, [base+z2_y_shuf]
+ lea r5, [rsp+120] ; left-edge base pointer
+ movd xm1, dyd
+ vbroadcasti128 m4, [base+z_base_inc+2]
+ movd xm2, dxd
+ vpbroadcastw m1, xm1
+ vpbroadcastw m2, xm2
+ mov r7, dstq ; NOTE(review): r7 is only read in .w8_end, which this path never reaches
+ paddw m4, m4
+ pmullw m0, m1, [base+z2_ymul8]
+ paddw m5, m2, m2 ; 2*dx: vector xpos step per iteration (two rows)
+ psllw xm1, 3 ; NOTE(review): xm1 is cleared by the pxor below before any read - looks dead here
+ vpblendd m2, m5, 0xf0
+ lea r2d, [dxq+(66<<6)] ; xpos
+ paddw m4, m2
+ pshufd m6, m0, q2020
+ psraw xm0, 6
+ pxor xm1, xm1
+ psubw xm8, xm1, xm0 ; negated (dy * ymul) >> 6, used as gather indices
+ pand m6, m11 ; vertical fraction bits (m11 = pw_62)
+ punpckhwd xm9, xm8, xm1 ; zero-extend the indices to dwords (upper four)
+ psllw m6, 9
+ punpcklwd xm8, xm1 ; zero-extend the indices to dwords (lower four)
+.w8_upsample_above_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x of the first row
+ movu xm1, [rsp+r2*2]
+ movu xm2, [rsp+r2*2+16]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x of the second row
+ vinserti128 m1, [rsp+r3*2], 1
+ vinserti128 m2, [rsp+r3*2+16], 1
+ pshufb m1, m10 ; z_upsample shuffle
+ pshufb m2, m10
+ punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
+ punpckhqdq m1, m2
+ pand m2, m11, m4
+ psubw m1, m0
+ psllw m2, 9
+ pmulhrsw m1, m2
+ paddw m0, m1 ; m0 = prediction from the top edge
+ cmp r3d, 64
+ jge .w8_upsample_above_toponly ; second row's base_x >= 64: skip the left-edge gathers
+ mova m1, m5 ; m5 doubles as the gather mask; the gather overwrites it, so keep a copy
+ vpgatherdq m3, [r5+xm9*2], m5
+ mova m5, m1 ; restore m5 after the gather
+ vpgatherdq m2, [r5+xm8*2], m1
+ pshufb m3, m7
+ pshufb m2, m7
+ punpckldq m1, m2, m3
+ punpckhdq m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m6
+ paddw m1, m2
+ vpermq m1, m1, q3120
+ psraw m2, m4, 15 ; all-ones where the biased xpos in m4 is negative
+ vpblendvb m0, m1, m2 ; left-edge result in those lanes
+.w8_upsample_above_toponly:
+ paddw m4, m5
+ sub r5, 4 ; left-edge pointer moves back 4 bytes per two rows
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_ret
+ lea dstq, [dstq+strideq*2]
+ jmp .w8_upsample_above_loop
+.w8_main:
+ vbroadcasti128 m7, [base+z2_y_shuf]
+ lea r11, [rsp+120] ; left-edge base pointer, reloaded into r5 for every strip
+ mov r8, -4 ; left-edge pointer step per two rows
+.w8_main_upsample_left:
+ movd xm1, dyd
+ vbroadcasti128 m4, [base+z_base_inc+2]
+ movd xm2, dxd
+ vpbroadcastw m1, xm1
+ vpbroadcastw m2, xm2
+ mov r7, dstq ; r7 = start of the current 8-column strip
+ pmullw m0, m1, [base+z2_ymul8]
+ paddw m5, m2, m2 ; 2*dx: vector xpos step per iteration (two rows)
+ psllw xm1, 3 ; 8*dy
+ vpblendd m2, m5, 0xf0 ; xpos0 xpos1
+ lea r9d, [dxq+(65<<6)] ; xpos
+ paddw m4, m2
+ movd [rsp+284], xm1 ; 8*dy kept on the stack for the strip advance in .w8_end
+.w8_loop0:
+ mov r2d, r9d ; scalar xpos restarts for every strip
+ mova [rsp+288], m0 ; saved so .w8_end can derive the next strip's values
+ mov r5, r11
+ mova [rsp+320], m4 ; saved so .w8_end can derive the next strip's values
+ pshufd m6, m0, q2020
+ psraw xm0, 6
+ pxor xm1, xm1
+ psubw xm8, xm1, xm0 ; base_y
+ pand m6, m11 ; frac_y
+ punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7
+ psllw m6, 9
+ punpcklwd xm8, xm1 ; base_y 0 1 4 5
+.w8_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2*2]
+ movu xm1, [rsp+r2*2+2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vinserti128 m0, [rsp+r3*2], 1
+ vinserti128 m1, [rsp+r3*2+2], 1
+ pand m2, m11, m4 ; horizontal fraction bits (m11 = pw_62)
+ psubw m1, m0
+ psllw m2, 9
+ pmulhrsw m1, m2
+ paddw m0, m1 ; m0 = prediction from the top edge
+ cmp r3d, 64
+ jge .w8_toponly ; base_x1 >= 64: skip the left-edge gathers
+ mova m1, m5 ; m5 doubles as the gather mask; the gather overwrites it, so keep a copy
+ vpgatherdq m3, [r5+xm9*2], m5
+ mova m5, m1 ; restore m5 after the gather
+ vpgatherdq m2, [r5+xm8*2], m1
+ pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1
+ pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1
+ punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m6
+ paddw m1, m2
+ vpermq m1, m1, q3120
+ psraw m2, m4, 15 ; base_x < topleft
+ vpblendvb m0, m1, m2 ; left-edge result where base_x < topleft
+.w8_toponly:
+ paddw m4, m5 ; xpos += dx
+ add r5, r8 ; move the left-edge pointer on for the next two rows
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, (63-8)<<6
+ jge .w8_loop ; below this xpos the rows are handled by the left-only loop
+.w8_leftonly_loop:
+ mova m0, m5 ; same mask save/restore around the gathers as in .w8_loop
+ vpgatherdq m4, [r5+xm9*2], m5
+ mova m5, m0
+ vpgatherdq m3, [r5+xm8*2], m0
+ add r5, r8
+ pshufb m2, m4, m7
+ pshufb m1, m3, m7
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ psubw m1, m0
+ pmulhrsw m1, m6 ; vertical interpolation with frac_y << 9
+ paddw m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_leftonly_loop
+.w8_end:
+ sub r10d, 1<<8 ; one 8-column strip finished
+ jl .w8_ret ; no strips left
+ vpbroadcastd m0, [rsp+284]
+ add r7, 16 ; next strip starts 8 pixels (16 bytes) to the right
+ paddw m0, [rsp+288] ; base_y += 8*dy
+ add r9d, 8<<6 ; scalar xpos start moves 8 pixels as well
+ vpbroadcastd m4, [pw_512]
+ movzx hd, r10b ; reload h from the low byte of r10d
+ paddw m4, [rsp+320] ; base_x += 8*64
+ mov dstq, r7
+ jmp .w8_loop0
+.w8_ret:
+ RET
+.w16:
+ movd xm0, [tlq+32]
+ lea r10d, [hq+(1<<8)] ; h in the low byte, plus one extra 8-column strip for .w8_end
+ movd [rsp+160], xm0 ; extends the stack copy of the top edge by four bytes
+ test angled, 0x400
+ jnz .w8_main ; bit 10 set: skip all edge filtering
+ lea r3d, [hq+15] ; argument for .filter_strength
+ sub angled, 90 ; angle - 90, as in .w8_no_upsample_above
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above ; empty mask: leave the top edge unfiltered
+ popcnt r3d, r3d ; number of set mask bits, used to index z_filter_k
+ vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2]
+ movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ punpcklwd xm2, xm1, xm1 ; m1 still holds [tlq] from the prologue
+ vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ punpckhwd m3, m0, m0
+ pmullw m4, m0
+ vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ paddw m1, m3
+ vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
+ paddw m2, m3
+ vpbroadcastd m3, r6m ; max_width
+ pmullw m1, m5
+ pmullw m2, m6
+ packssdw m3, m3
+ paddw m1, m4
+ paddw m1, m2
+ psubw m3, [base+pw_1to16] ; max_width minus the 1-based pixel index
+ pxor m4, m4
+ psrlw m1, 3
+ pminsw m3, m11 ; capped at 62 (m11 = pw_62) before the byte-wise blend
+ pavgw m1, m4 ; m4 is zero: (x + 1) >> 1
+ vpblendvb m1, m0, m3 ; unfiltered pixel (m0) wherever the m3 mask is negative
+ movu [rsp+130], m1 ; overwrites the stack copy starting at tl+2
+.w16_no_filter_above:
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7 ; m7/m8/m9 are still those left by .filter_strength
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0 ; left-edge mask
+ test r3d, r3d
+ jz .w8_main ; empty mask: leave the left edge unfiltered
+ popcnt r3d, r3d ; number of set mask bits
+ cmp r3d, 3
+ jne .w16_filter_left_s12
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m7, [base+pw_16]
+ cmp hd, 4
+ jne .filter_left_s3
+ movq xm0, [tlq-8] ; 0 1 2 3
+ movq xm1, [tlq-6] ; 1 2 3 4
+ vpbroadcastd xm5, r7m ; max_height
+ movq xm4, [base+pw_16to1+24] ; 4to1
+ pshuflw xm2, xm0, q2100 ; 0 0 1 2
+ pshuflw xm3, xm1, q3321 ; 2 3 4 4
+ paddw xm1, xm0
+ paddw xm1, xm2
+ pshuflw xm2, xm0, q1000 ; 0 0 0 1
+ paddw xm3, xm6
+ packssdw xm5, xm5
+ pavgw xm2, xm3
+ psubw xm5, xm4
+ paddw xm1, xm2
+ pminsw xm5, xm11 ; capped at 62 (m11 = pw_62) before the byte-wise blend
+ psrlw xm1, 2
+ vpblendvb xm1, xm0, xm5 ; unfiltered pixel (xm0) wherever the xm5 mask is negative
+ movq [rsp+120], xm1 ; overwrites the stack copy of [tlq-8, tlq)
+ jmp .w8_main
+.w32:
+ mova m2, [tlq+32]
+ movd xm0, [tlq+64]
+ lea r10d, [hq+(3<<8)] ; h in the low byte, plus three extra 8-column strips for .w8_end
+ mova [rsp+160], m2 ; extends the stack copy of the top edge
+ movd [rsp+192], xm0
+ test angled, 0x400
+ jnz .w8_main ; bit 10 set: skip all edge filtering
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m0, r6m ; max_width
+ vpbroadcastd m7, [base+pw_16]
+ mov r3d, 32 ; byte offset of the last 16 top pixels
+ packssdw m0, m0
+ psubw m0, [base+pw_1to16]
+ pminsw m8, m0, m11 ; blend mask for the first 16 top pixels
+ psubw m9, m8, m7 ; same mask shifted by 16 for the last 16
+.w32_filter_above:
+ movu m0, [tlq+2]
+ punpcklwd xm4, xm1, xm1 ; m1 still holds [tlq] from the prologue
+ paddw m2, m6, [tlq+6]
+ paddw m1, m0
+ vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m1, [tlq+4]
+ movu m3, [tlq+r3+2]
+ paddw m5, m6, [tlq+r3-2]
+ pavgw m2, m4
+ punpckhwd m4, m3, m3
+ paddw m1, m2
+ vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ pavgw m2, m5
+ paddw m5, m3, [tlq+r3]
+ paddw m4, m5
+ psrlw m1, 2
+ paddw m2, m4
+ vpblendvb m1, m0, m8 ; unfiltered pixel (m0) wherever the m8 mask is negative
+ psrlw m2, 2
+ vpblendvb m2, m3, m9
+ movu [rsp+130], m1 ; overwrites the stack copy starting at tl+2
+ movu [rsp+r3+130], m2
+.filter_left_s3:
+ cmp hd, 16
+ jl .filter_left_s3_h8 ; h8
+.filter_left_s3b:
+ mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ vpbroadcastd m5, r7m ; max_height
+ paddw m1, m0, m2
+ punpckhwd m2, m2
+ mov r3d, hd
+ vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ packssdw m5, m5
+ not r3 ; r3 = -h - 1; not leaves the flags of the earlier cmp hd, 16 intact
+ psubw m5, [base+pw_16to1]
+ paddw m2, m6
+ pminsw m8, m11, m5 ; blend mask for the 16 left pixels nearest the corner
+ je .filter_left_s3_end ; h16
+ paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m2
+ psrlw m1, 2
+ vpblendvb m3, m1, m0, m8
+ mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j
+ psubw m8, m7 ; mask shifted by 16 for the next 16 pixels
+ mova [rsp+96], m3
+ jnp .filter_left_s3_end ; h32 (PF of cmp hd, 16: 32-16 = 0x10 has odd bit parity, 64-16 = 0x30 even)
+ mova m5, [tlq-96]
+ paddw m1, [tlq-66]
+ pavgw m2, [tlq-68]
+ paddw m1, m2
+ paddw m4, m5, [tlq-94]
+ paddw m2, m6, [tlq-92]
+ psrlw m1, 2
+ paddw m4, [tlq- 98]
+ pavgw m2, [tlq-100]
+ vpblendvb m3, m1, m0, m8
+ mova m0, [tlq-128]
+ psubw m8, m7
+ paddw m4, m2
+ paddw m1, m0, [tlq-126]
+ paddw m2, m6, [tlq-124]
+ psrlw m4, 2
+ mova [rsp+64], m3
+ vpblendvb m4, m5, m8
+ psubw m8, m7
+ mova [rsp+32], m4
+.filter_left_s3_end:
+ punpcklwd xm3, xm0, xm0
+ vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m4
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m1, 2
+ vpblendvb m1, m0, m8 ; unfiltered pixel (m0) wherever the m8 mask is negative
+ mova [rsp+r3*2+130], m1 ; rsp+128-2*h: the 16 left pixels farthest from the corner
+ jmp .w8_main
+.filter_left_s3_h8:
+ mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7
+ movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8
+ pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6
+ vpbroadcastd xm5, r7m ; max_height
+ paddw xm1, xm0, xm3
+ pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8
+ paddw xm1, xm2
+ vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5
+ paddw xm3, xm6
+ packssdw xm5, xm5
+ pavgw xm2, xm3
+ psubw xm5, [base+pw_16to1+16] ; 8to1
+ paddw xm1, xm2
+ pminsw xm5, xm11 ; capped at 62 (m11 = pw_62) before the byte-wise blend
+ psrlw xm1, 2
+ vpblendvb xm1, xm0, xm5 ; unfiltered pixel (xm0) wherever the xm5 mask is negative
+ mova [rsp+112], xm1 ; overwrites the stack copy of [tlq-16, tlq)
+ jmp .w8_main
+.w64:
+ mova m2, [tlq+ 32]
+ mova m3, [tlq+ 64]
+ mova m4, [tlq+ 96]
+ movd xm0, [tlq+128]
+ lea r10d, [hq+(7<<8)]
+ mova [rsp+160], m2
+ mova [rsp+192], m3
+ mova [rsp+224], m4
+ movd [rsp+256], xm0
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastd m6, [base+pw_3]
+ movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h
+ paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movu m4, [tlq+66]
+ paddw m3, m6, [tlq+62]
+ paddw m7, m4, [tlq+64]
+ pavgw m3, [tlq+70]
+ paddw m7, [tlq+68]
+ paddw m2, m5
+ vpbroadcastd m5, r6m ; max_width
+ mov r3d, 96
+ packssdw m5, m5
+ paddw m3, m7
+ psubw m5, [base+pw_1to16]
+ psrlw m2, 2
+ vpbroadcastd m7, [base+pw_16]
+ psrlw m3, 2
+ pminsw m8, m11, m5
+ psubw m9, m8, m7
+ vpblendvb m2, m0, m9
+ psubw m9, m7
+ vpblendvb m3, m4, m9
+ psubw m9, m7
+ movu [rsp+162], m2
+ movu [rsp+194], m3
+ jmp .w32_filter_above
+
+cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase ; directional intra predictor reading the edge at negative offsets from tl; jumps to a per-height path (.h4 ... .h64)
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z3_16bpc_avx2_table]
+ tzcnt hd, hm ; index of the lowest set bit of h, used as the jump-table index
+ movifnidn angled, anglem
+ lea r7, [dr_intra_derivative+45*2-1] ; biased table pointer; indexed below with the offset built in dyq
+ sub tlq, 2
+ movsxd hq, [r6+hq*4] ; signed 32-bit table entry, relative to r6
+ sub angled, 180
+ add hq, r6 ; hq = address of the selected .hN entry point
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400 ; flip bit 10; the paths below test it as !enable_intra_edge_filter
+ or dyq, ~0x7e ; keep bits 1..6 of the negated angle, force every other bit to 1
+ movzx dyd, word [r7+dyq] ; dy = 16-bit entry loaded from dr_intra_derivative
+ vpbroadcastd m5, [pw_62] ; m5 = mask and-ed with ypos before the <<9 that forms the pmulhrsw weight
+ mov org_wd, wd ; keep the original width; w itself is counted down by the column loops
+ jmp hq
+.h4: ; h4 entry: takes the edge-upsampling path unless one of the two checks below rejects it
+ ALLOC_STACK -64, 7
+ lea r7, [strideq*3] ; r7 = stride*3, used for the fourth output row
+ cmp angleb, 40
+ jae .h4_no_upsample
+ lea r4d, [angleq-1024]
+ sar r4d, 7
+ add r4d, wd
+ jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+ mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7
+ pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
+ vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
+ pshufd xm3, xm1, q0000 ; replicate the lowest dword of xm1
+ paddw xm1, xm2 ; sum of the two inner taps
+ paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8
+ vpbroadcastw xm4, r8m ; pixel_max
+ add dyd, dyd ; dy is doubled on this path
+ psubw xm0, xm1, xm0 ; inner sum - outer sum
+ mova [rsp+ 0], xm3
+ movd xm3, dyd
+ psraw xm0, 3
+ neg dyd
+ paddw xm1, xm0 ; inner + ((inner - outer) >> 3)
+ pxor xm0, xm0
+ lea r2d, [dyq+(16<<6)+63] ; ypos
+ pmaxsw xm1, xm0 ; clamp below at 0
+ pavgw xm1, xm0 ; (x + 1) >> 1
+ vpbroadcastw m3, xm3
+ pminsw xm1, xm4 ; clamp above at pixel_max
+ punpckhwd xm0, xm1, xm2 ; interleave the new samples with the original ones
+ punpcklwd xm1, xm2
+ paddw m2, m3, m3
+ mova [rsp+32], xm0
+ punpcklwd m3, m2
+ mova [rsp+16], xm1
+ paddw m4, m2, m2 ; m4 = amount added to the ypos vector once per loop pass
+ paddw m2, m3
+ vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3
+.h4_upsample_loop: ; each pass writes 4 rows of 4 pixels and consumes 4 from w
+ lea r4d, [r2+dyq] ; ypos of the following column
+ shr r2d, 6 ; integer part of ypos = word index into the stack copy
+ movu xm1, [rsp+r2*2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ movu xm2, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ vinserti128 m1, [rsp+r2*2], 1
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ vinserti128 m2, [rsp+r4*2], 1
+ psrld m0, m1, 16
+ pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
+ pslld m2, 16
+ pblendw m1, m2, 0xaa
+ pand m2, m5, m3
+ psllw m2, 9 ; masked ypos bits scaled into a pmulhrsw weight
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m3, m4 ; advance the ypos vector
+ paddw m1, m0
+ vextracti128 xm2, m1, 1
+ punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
+ movhps [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r7 ], xm1
+ add dstq, 8 ; 4 pixels * 2 bytes
+ sub wd, 4
+ jg .h4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16 -- local subroutine: in maxbased/angled, out r5d; overwrites r4, angled, m0, m1
+%define base r4-z_filter_t0
+ lea r4, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0 ; low byte of maxbase in every byte lane
+ vpbroadcastb m1, xm1 ; low byte of the (unshifted) angle in every byte lane
+ pcmpeqb m0, [base+z_filter_wh]
+ pand m0, m1 ; the angle byte survives only where z_filter_wh matched
+ mova xm1, [r4+angleq*8] ; z_filter_t0 entry selected by the shifted angle
+ pcmpgtb m0, m1
+ pmovmskb r5d, m0 ; r5d = byte-compare mask; callers test it for zero and popcnt it
+ ret
+.h4_no_upsample: ; h4 without upsampling: optionally filter the edge into the stack, then predict in .h4_main
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea maxbased, [wq+3] ; input to .filter_strength
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ popcnt r5d, r5d ; strength = number of bits set in the mask
+ mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7
+ movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8
+ vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw xm2, xm0 ; centre tap
+ pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
+ paddw xm1, xm0, xm3
+ movd [rsp+12], xm0
+ pmullw xm1, xm4 ; the two adjacent taps
+ cmp r5d, 3
+ jne .h4_filter_3tap
+ pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
+ vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
+ movzx r4d, word [tlq-14]
+ movzx r2d, word [tlq-12]
+ inc maxbased
+ paddw xm1, xm2
+ paddw xm0, xm3 ; sum of the two outer taps
+ sub r2d, r4d
+ paddw xm2, xm0, xm0 ; outer taps are weighted by 2
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3 ; (7*a + b + 4) >> 3 with a = [tlq-14], b = [tlq-12]
+ mov [rsp+14], r2w
+.h4_filter_3tap:
+ pxor xm0, xm0
+ paddw xm1, xm2
+ lea tlq, [rsp+30] ; from here on the edge is read from the filtered stack copy
+ psrlw xm1, 3
+ cmp wd, 8
+ sbb maxbased, -1 ; maxbase += 1 - carry, i.e. +1 unless w < 8
+ pavgw xm0, xm1 ; (x + 1) >> 1
+ mova [rsp+16], xm0
+.h4_main:
+ movd xm3, dyd
+ neg maxbaseq
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; m6 = edge pixel at index -maxbase, used as the fill value
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ lea r4d, [maxbaseq+3*64]
+ neg dyq
+ movd xm2, r4d
+ sub tlq, 8
+ lea r4, [dyq+63] ; ypos
+ punpcklwd m1, m1
+ paddw m0, m3, m3
+ vpbroadcastw m2, xm2
+ punpcklwd m3, m0
+ paddw m4, m0, m0 ; m4 = amount added to the ypos vector once per loop pass
+ paddw m0, m3
+ psubw m2, m1
+ vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3
+ or maxbased, 63 ; bound that the scalar ypos in r4d is compared against
+ paddw m3, m2
+.h4_loop: ; each pass writes 4 rows of 4 pixels and consumes 4 from w
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu xm1, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ movu xm2, [tlq+r5*2]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ vinserti128 m1, [tlq+r4*2], 1
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vinserti128 m2, [tlq+r5*2], 1
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ pand m2, m5, m3
+ palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
+ psllw m2, 9 ; masked ypos bits scaled into a pmulhrsw weight
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; ypos < max_base_y
+ paddw m3, m4
+ paddw m1, m0
+ vpblendvb m1, m6, m1, m2 ; interpolated value where the mask is set, m6 elsewhere
+ vextracti128 xm2, m1, 1
+ punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
+ movhps [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r7 ], xm1
+ sub wd, 4
+ jz .h4_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h4_loop
+.h4_end_loop: ; the remaining columns are filled with m6, 4 at a time
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ movq [dstq+strideq*2], xm6
+ movq [dstq+r7 ], xm6
+ add dstq, 8
+ sub wd, 4
+ jg .h4_end_loop
+.h4_end:
+ RET
+.h8: ; h8 entry: upsample path, or filtered / unfiltered edge followed by .h8_main
+ lea r4d, [angleq+216]
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 8
+ mov r4b, wb ; overwrite the low byte with w so one unsigned compare covers every condition listed below
+ lea r7, [strideq*3]
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d
+ cmp wd, 8
+ je .h8_upsample_w8
+ pshufhw xm3, xm2, q1000
+ vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d
+.h8_upsample_w8:
+ paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastw m4, r8m ; pixel_max
+ add dyd, dyd ; dy is doubled on this path
+ psubw m0, m1, m0 ; inner sum - outer sum
+ movd xm6, dyd
+ psraw m0, 3
+ neg dyd
+ paddw m1, m0 ; inner + ((inner - outer) >> 3)
+ pxor m0, m0
+ pmaxsw m1, m0 ; clamp below at 0
+ lea r4d, [dyq+(16<<6)+63] ; ypos
+ pavgw m1, m0 ; (x + 1) >> 1
+ vpbroadcastw m6, xm6
+ pminsw m1, m4 ; clamp above at pixel_max
+ punpckhwd m0, m1, m2 ; interleave the new samples with the original ones
+ punpcklwd m1, m2
+ vextracti128 [rsp+48], m0, 1
+ vextracti128 [rsp+32], m1, 1
+ paddw m7, m6, m6 ; m7 = amount added to the ypos vector after each column pair
+ mova [rsp+16], xm0
+ mova [rsp+ 0], xm1
+ punpcklwd m6, m7 ; ypos0 ypos1
+.h8_upsample_loop: ; each pass writes 8 rows of 4 pixels and consumes 4 from w
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base0
+ movu m1, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base1
+ movu m2, [rsp+r2*2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base2
+ movu m3, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base3
+ movu m4, [rsp+r2*2]
+ psrld m0, m1, 16
+ pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
+ pslld m2, 16
+ pblendw m1, m2, 0xaa
+ psrld m2, m3, 16
+ pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0
+ pslld m4, 16
+ pblendw m3, m4, 0xaa
+ pand m4, m5, m6
+ paddw m6, m7
+ psllw m4, 9 ; weight for columns a/b
+ psubw m1, m0
+ pmulhrsw m1, m4
+ pand m4, m5, m6
+ psllw m4, 9 ; weight for columns c/d
+ psubw m3, m2
+ pmulhrsw m3, m4
+ paddw m6, m7
+ lea r2, [dstq+strideq*4] ; r2 = rows 4..7
+ paddw m1, m0
+ paddw m3, m2
+ punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ movhps [r2 +strideq*0], xm0
+ movq [r2 +strideq*1], xm0
+ movhps [r2 +strideq*2], xm1
+ movq [r2 +r7 ], xm1
+ movhps [dstq+strideq*0], xm2
+ movq [dstq+strideq*1], xm2
+ movhps [dstq+strideq*2], xm3
+ movq [dstq+r7 ], xm3
+ add dstq, 8
+ sub wd, 4
+ jg .h8_upsample_loop
+ RET
+.h8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(w+7, 15)
+ jmp .h8_main
+.h8_no_upsample:
+ lea maxbased, [wq+7] ; input to .filter_strength
+ test angled, 0x400
+ jnz .h8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h8_main
+ popcnt r5d, r5d ; strength = number of bits set in the mask
+ mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m2, m0 ; centre tap
+ cmp wd, 8
+ jl .h8_filter_w4
+ punpcklwd xm0, xm0
+ vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movd [rsp+28], xm0
+ paddw m1, m3
+ mov r4d, 16
+ pmullw m1, m4 ; the two adjacent taps
+ cmovg maxbased, r4d ; flags are still those of "cmp wd, 8": maxbase = 16 when w > 8
+ cmp r5d, 3
+ jne .h8_filter_3tap
+ punpckhwd m3, m3
+ vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ movzx r4d, word [tlq-30]
+ movzx r2d, word [tlq-28]
+ inc maxbased
+ paddw m1, m2
+ paddw m0, m3 ; sum of the two outer taps
+ sub r2d, r4d
+ paddw m2, m0, m0 ; outer taps are weighted by 2
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3 ; (7*a + b + 4) >> 3 with a = [tlq-30], b = [tlq-28]
+ mov [rsp+30], r2w
+ jmp .h8_filter_3tap
+.h8_filter_w4:
+ pshufhw xm1, xm0, q2100
+ vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e
+ paddw m1, m3
+ pmullw m1, m4
+.h8_filter_3tap:
+ pxor m0, m0
+ paddw m1, m2
+ lea tlq, [rsp+62] ; from here on the edge is read from the filtered stack copy
+ psrlw m1, 3
+ pavgw m0, m1 ; (x + 1) >> 1
+ mova [rsp+32], m0
+.h8_main:
+ movd xm4, dyd
+ neg maxbaseq
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m7, [tlq+maxbaseq*2] ; m7 = edge pixel at index -maxbase, used as the fill value
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ lea r4d, [maxbaseq+7*64]
+ neg dyq
+ movd xm2, r4d
+ sub tlq, 16
+ lea r4, [dyq+63] ; scalar ypos
+ paddw m6, m4, m4 ; m6 = amount added to the ypos vector after each column pair
+ vpbroadcastw m2, xm2
+ vpblendd m4, m6, 0xf0 ; ypos0 ypos1
+ psubw m2, m1
+ or maxbased, 63 ; bound that the scalar ypos in r4d is compared against
+ paddw m4, m2
+.h8_loop: ; each pass writes 8 rows of 4 pixels and consumes 4 from w
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu xm0, [tlq+r4*2+2]
+ movu xm1, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ vinserti128 m0, [tlq+r5*2+2], 1
+ vinserti128 m1, [tlq+r5*2], 1
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ pand m3, m5, m4
+ psllw m3, 9 ; masked ypos bits scaled into a pmulhrsw weight
+ psubw m1, m0
+ pmulhrsw m1, m3
+ psraw m3, m4, 15 ; sign of the ypos vector = blend mask
+ paddw m4, m6
+ paddw m0, m1
+ movu xm1, [tlq+r4*2+2]
+ movu xm2, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vpblendvb m0, m7, m0, m3 ; interpolated value where the mask is set, m7 elsewhere
+ vinserti128 m1, [tlq+r5*2+2], 1
+ vinserti128 m2, [tlq+r5*2], 1
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m2, m1
+ pmulhrsw m2, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ lea r5, [dstq+strideq*4] ; r5 = rows 4..7
+ paddw m1, m2
+ vpblendvb m1, m7, m1, m3
+ punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0
+ vextracti128 xm3, m2, 1
+ punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c4 b7 d7 b6 d6 b5 d5 b4 d4
+ punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2
+ vextracti128 xm3, m0, 1
+ movhps [dstq+strideq*0], xm1
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r7 ], xm2
+ punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4
+ punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6
+ movhps [r5 +strideq*0], xm1
+ movq [r5 +strideq*1], xm1
+ movhps [r5 +strideq*2], xm0
+ movq [r5 +r7 ], xm0
+ sub wd, 4
+ jz .h8_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h8_loop
+ lea r6, [strideq*5] ; the remaining columns are filled with m7
+ lea r2, [strideq+r7*2] ; stride*7
+ test wd, 4
+ jz .h8_end_loop
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ movq [dstq+strideq*2], xm7
+ movq [dstq+r7 ], xm7
+ movq [dstq+strideq*4], xm7
+ movq [dstq+r6 ], xm7
+ movq [dstq+r7*2 ], xm7
+ movq [dstq+r2 ], xm7
+ add dstq, 8
+ sub wd, 4
+ jz .h8_end
+.h8_end_loop: ; fill 8 columns per pass
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ mova [dstq+strideq*2], xm7
+ mova [dstq+r7 ], xm7
+ mova [dstq+strideq*4], xm7
+ mova [dstq+r6 ], xm7
+ mova [dstq+r7*2 ], xm7
+ mova [dstq+r2 ], xm7
+ add dstq, 16
+ sub wd, 8
+ jg .h8_end_loop
+.h8_end:
+ RET
+.h16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(w+15, 31)
+ jmp .h16_main
+ALIGN function_align
+.h16: ; h16 entry: optionally filter the edge into the stack, then predict in .h16_main
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 10
+ lea maxbased, [wq+15] ; input to .filter_strength
+ lea r7, [strideq*3]
+ test angled, 0x400
+ jnz .h16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ popcnt r5d, r5d ; strength = number of bits set in the mask
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pmullw m1, m7
+ paddw m1, m2
+ cmp wd, 8
+ jg .h16_filter_w16
+ mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7
+ pmullw xm6, xm3
+ jl .h16_filter_w4 ; flags are still those of "cmp wd, 8"
+ pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
+ cmp r5d, 3
+ jne .h16_filter_w8_3tap
+ vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
+.h16_filter_w8_5tap:
+ punpckhwd m0, m0
+ vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9
+ paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw xm4, xm4 ; outer taps are weighted by 2
+ paddw m0, m0
+ paddw xm6, xm4
+ paddw m1, m0
+.h16_filter_w8_3tap:
+ paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8
+ pmullw xm3, xm7
+ pxor m0, m0
+ paddw xm3, xm6
+ psrlw xm3, 3
+ pavgw xm3, xm0 ; (x + 1) >> 1
+ mova [rsp+48], xm3
+ jmp .h16_filter_end
+.h16_filter_w4:
+ pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6
+ cmp r5d, 3
+ jne .h16_filter_w8_3tap
+ pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5
+ jmp .h16_filter_w8_5tap
+.h16_filter_w16:
+ mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ pmullw m6, m3
+ punpcklwd xm3, xm3
+ vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ mov r4d, 32
+ cmp wd, 16
+ cmovg maxbased, r4d ; maxbase = 32 when w > 16
+ movd [rsp+28], xm3
+ pmullw m4, m7
+ cmp r5d, 3
+ jne .h16_filter_w16_3tap
+ punpckhwd m0, m0
+ vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movzx r4d, word [tlq-62]
+ movzx r2d, word [tlq-60]
+ or maxbased, 1
+ paddw m3, m3 ; outer taps are weighted by 2
+ sub r2d, r4d
+ paddw m0, m0
+ lea r2d, [r2+r4*8+4]
+ paddw m4, m3
+ shr r2d, 3 ; (7*a + b + 4) >> 3 with a = [tlq-62], b = [tlq-60]
+ paddw m1, m0
+ mov [rsp+30], r2w
+.h16_filter_w16_3tap:
+ pxor m0, m0
+ paddw m4, m6
+ psrlw m4, 3
+ pavgw m4, m0 ; (x + 1) >> 1
+ mova [rsp+32], m4
+.h16_filter_end:
+ psrlw m1, 3
+ lea tlq, [rsp+94] ; from here on the edge is read from the filtered stack copy
+ pavgw m1, m0
+ mova [rsp+64], m1
+.h16_main:
+ movd xm8, dyd
+ neg maxbaseq
+ vpbroadcastw m9, [tlq+maxbaseq*2] ; m9 = edge pixel at index -maxbase, used as the fill value
+ shl maxbased, 6
+ vpbroadcastw m8, xm8 ; m8 = dy, added to the ypos vector after every column
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ movd xm7, r4d
+ sub tlq, 32
+ lea r4, [dyq+63] ; scalar ypos
+ vpbroadcastw m7, xm7
+ or maxbased, 63 ; bound that the scalar ypos in r4d is compared against
+ psubw m7, [z_base_inc]
+.h16_loop: ; each pass writes 16 rows of 4 pixels and consumes 4 from w
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu m0, [tlq+r4*2+2]
+ movu m2, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ movu m1, [tlq+r5*2+2]
+ movu m3, [tlq+r5*2]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ pand m6, m5, m7
+ psllw m6, 9 ; masked ypos bits scaled into a pmulhrsw weight
+ psubw m2, m0
+ pmulhrsw m2, m6
+ psraw m6, m7, 15 ; sign of the ypos vector = blend mask
+ paddw m7, m8
+ paddw m0, m2
+ movu m2, [tlq+r4*2+2]
+ movu m4, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vpblendvb m0, m9, m0, m6 ; interpolated value where the mask is set, m9 elsewhere
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m3, m1
+ pmulhrsw m3, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m1, m3
+ vpblendvb m1, m9, m1, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m4, m2
+ pmulhrsw m4, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m2, m4
+ movu m3, [tlq+r5*2+2]
+ movu m4, [tlq+r5*2]
+ vpblendvb m2, m9, m2, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m4, m3
+ pmulhrsw m4, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ lea r5, [dstq+strideq*4] ; r5 = rows 4..7
+ paddw m3, m4
+ vpblendvb m3, m9, m3, m6
+ punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4
+ punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0
+ punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4
+ punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0
+ vextracti128 xm6, m3, 1
+ punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2
+ punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4
+ punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6
+ vextracti128 xm2, m4, 1
+ movhps [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ vextracti128 xm6, m1, 1
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r7 ], xm2
+ vextracti128 xm2, m0, 1
+ movhps [r5 +strideq*0], xm6
+ movq [r5 +strideq*1], xm6
+ movhps [r5 +strideq*2], xm2
+ movq [r5 +r7 ], xm2
+ lea r5, [dstq+strideq*8] ; r5 = rows 8..11
+ movhps [r5 +strideq*0], xm3
+ movq [r5 +strideq*1], xm3
+ movhps [r5 +strideq*2], xm4
+ movq [r5 +r7 ], xm4
+ lea r5, [r5+strideq*4] ; r5 = rows 12..15
+ movhps [r5 +strideq*0], xm1
+ movq [r5 +strideq*1], xm1
+ movhps [r5 +strideq*2], xm0
+ movq [r5 +r7 ], xm0
+ sub wd, 4
+ jz .h16_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h16_loop
+ mov hd, 4 ; fill the remaining columns with m9 in 4 passes of 4 rows each
+.h16_end_loop0:
+ mov r6d, wd
+ mov r2, dstq ; remember the row-group start
+ test wb, 4
+ jz .h16_end_loop
+ movq [dstq+strideq*0], xm9
+ movq [dstq+strideq*1], xm9
+ movq [dstq+strideq*2], xm9
+ movq [dstq+r7 ], xm9
+ and r6d, 120 ; keep bits 3..6 of the remaining width
+ jz .h16_end_w4
+ add dstq, 8
+.h16_end_loop: ; fill 8 columns per pass
+ mova [dstq+strideq*0], xm9
+ mova [dstq+strideq*1], xm9
+ mova [dstq+strideq*2], xm9
+ mova [dstq+r7 ], xm9
+ add dstq, 16
+ sub r6d, 8
+ jg .h16_end_loop
+.h16_end_w4:
+ lea dstq, [r2+strideq*4] ; next group of 4 rows
+ dec hd
+ jg .h16_end_loop0
+.h16_end:
+ RET
+.h32: ; h32 entry: optional fixed edge filter, one 64-byte column per pass pushed below rsp, then a transpose into dst
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -160, 9
+ lea maxbased, [wq+31]
+ and maxbased, 31
+ or maxbased, 32 ; imin(w+31, 63)
+ test angled, 0x400
+ jnz .h32_main
+ vpbroadcastd m2, [pw_3]
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ punpckhwd m1, m0, m0
+ vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m2
+ paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ lea r4, [rsp+128] ; r4 = write pointer into the filtered stack copy, moving downwards
+ paddw m0, m1
+ lea r5d, [maxbaseq-31] ; loop counter, reduced by 16 per pass
+ psrlw m0, 2
+ mova [r4], m0
+.h32_filter_loop: ; filters 16 more edge pixels per pass
+ mova m0, [tlq-62]
+ paddw m1, m2, [tlq-66]
+ paddw m0, [tlq-64]
+ pavgw m1, [tlq-58] ; pavgw of (x + 3) and y = (x + y + 4) >> 1
+ paddw m0, [tlq-60]
+ sub tlq, 32
+ sub r4, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r4], m0
+ sub r5d, 16
+ jg .h32_filter_loop
+ jl .h32_filter_h8 ; counter went negative: finish with an 8-pixel tail
+ mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movzx r5d, word [tlq-62]
+ movzx r2d, word [tlq-60]
+ pavgw m2, m3
+ sub r2d, r5d
+ paddw m0, m1
+ lea r2d, [r2+r5*8+4]
+ paddw m0, m2
+ shr r2d, 3 ; (7*a + b + 4) >> 3 with a = [tlq-62], b = [tlq-60]
+ psrlw m0, 2
+ mova [r4-32], m0
+ mov [r4-36], r5w ; two extra words stored below the filtered block
+ mov [r4-34], r2w
+ lea tlq, [rsp+158] ; from here on the edge is read from the filtered stack copy
+ mov r4d, 65
+ cmp wd, 64
+ cmove maxbased, r4d ; maxbase = 65 when w == 64
+ jmp .h32_main
+.h32_filter_h8:
+ mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7
+ pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
+ paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9
+ paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8
+ vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
+ lea tlq, [rsp+158] ; from here on the edge is read from the filtered stack copy
+ pavgw xm2, xm3
+ paddw xm0, xm1
+ paddw xm0, xm2
+ psrlw xm0, 2
+ mova [r4-16], xm0
+.h32_main:
+ movd xm6, dyd
+ neg maxbaseq
+ vpbroadcastw m7, [tlq+maxbaseq*2] ; m7 = edge pixel at index -maxbase, used as the fill value
+ shl maxbased, 6
+ vpbroadcastw m6, xm6 ; m6 = dy, added to the ypos vector after every column
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ movd xm4, r4d
+ vpbroadcastd m8, [pw_m1024] ; threshold used for the blend mask of the first 32 bytes of each column
+ lea r4, [dyq+63] ; scalar ypos
+ vpbroadcastw m4, xm4
+ or maxbased, 63 ; bound that the scalar ypos in r4d is compared against
+ psubw m4, [z_base_inc]
+.h32_loop: ; one column (2 x 32 bytes) per pass, pushed below rsp
+ mov r5, r4
+ sar r5, 6 ; integer part of ypos
+ movu m1, [tlq+r5*2-64]
+ movu m0, [tlq+r5*2-62]
+ pand m3, m5, m4
+ psllw m3, 9 ; masked ypos bits scaled into a pmulhrsw weight
+ psubw m1, m0
+ pmulhrsw m1, m3
+ pcmpgtw m2, m8, m4 ; mask = m8 > ypos vector
+ paddw m0, m1
+ vpblendvb m0, m7, m0, m2 ; interpolated value where the mask is set, m7 elsewhere
+ movu m2, [tlq+r5*2-32]
+ movu m1, [tlq+r5*2-30]
+ add r4, dyq
+ sub rsp, 64 ; make room for this column
+ psubw m2, m1
+ pmulhrsw m2, m3
+ psraw m3, m4, 15 ; sign of the ypos vector = mask for the second 32 bytes
+ paddw m4, m6
+ mova [rsp+32*0], m0
+ paddw m1, m2
+ vpblendvb m1, m7, m1, m3
+ mova [rsp+32*1], m1
+ dec wd
+ jz .h32_transpose
+ cmp r4d, maxbased
+ jg .h32_loop
+.h32_end_loop: ; the remaining columns are pushed as pure fill value
+ sub rsp, 64
+ mova [rsp+32*0], m7
+ mova [rsp+32*1], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ lea r3, [strideq*3]
+ lea r4, [strideq*5]
+ mov r8, dstq ; r8 = start of the destination block
+ lea r5, [strideq+r3*2] ; stride*7
+.h32_transpose_loop0: ; outer pass: the 8 columns currently at the top of the stack
+ lea r6, [rsp+32]
+ lea r2, [r8+org_wq*2-16] ; 16 bytes left of byte offset org_w*2 in the first row
+.h32_transpose_loop: ; inner pass: transposes 32 bytes of each of the 8 columns into 16 rows of 16 bytes
+ mova m0, [r6+64*7]
+ mova m1, [r6+64*6]
+ mova m2, [r6+64*5]
+ mova m3, [r6+64*4]
+ mova m4, [r6+64*3]
+ mova m5, [r6+64*2]
+ mova m6, [r6+64*1]
+ mova m7, [r6+64*0]
+ punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4
+ punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0
+ punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4
+ punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0
+ punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4
+ punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0
+ punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4
+ lea dstq, [r2+strideq*8] ; dstq = 8 rows below r2
+ sub r6, 32
+ punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2
+ punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0
+ punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2
+ punpckhqdq m5, m7, m1 ; 8 0
+ vextracti128 [r2 +strideq*0], m5, 1
+ punpcklqdq m7, m1 ; 9 1
+ mova [dstq+strideq*0], xm5
+ punpckhqdq m1, m8, m3 ; 10 2
+ vextracti128 [r2 +strideq*1], m7, 1
+ punpcklqdq m8, m3 ; 11 3
+ mova [dstq+strideq*1], xm7
+ punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4
+ vextracti128 [r2 +strideq*2], m1, 1
+ punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6
+ mova [dstq+strideq*2], xm1
+ punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4
+ vextracti128 [r2 +r3 ], m8, 1
+ punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6
+ mova [dstq+r3 ], xm8
+ punpckhqdq m6, m3, m2 ; 12 4
+ vextracti128 [r2 +strideq*4], m6, 1
+ punpcklqdq m3, m2 ; 13 5
+ mova [dstq+strideq*4], xm6
+ punpckhqdq m2, m0, m4 ; 14 6
+ vextracti128 [r2 +r4 ], m3, 1
+ punpcklqdq m0, m4 ; 15 7
+ mova [dstq+r4 ], xm3
+ vextracti128 [r2 +r3*2 ], m2, 1
+ mova [dstq+r3*2 ], xm2
+ vextracti128 [r2 +r5 ], m0, 1
+ mova [dstq+r5 ], xm0
+ lea r2, [dstq+strideq*8] ; continue 16 rows further down
+ cmp r6, rsp
+ jae .h32_transpose_loop
+ add rsp, 64*8 ; release the 8 columns just written
+ sub org_wd, 8
+ jg .h32_transpose_loop0
+.h32_end:
+ RET
+.h64: ; h64 entry: optional fixed edge filter, one 128-byte column per pass pushed below rsp, then a transpose into dst
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -256, 10
+ lea maxbased, [wq+63]
+ test angled, 0x400
+ jnz .h64_main
+ vpbroadcastd m2, [pw_3]
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ punpckhwd m1, m0, m0
+ vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m2
+ paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ lea r4, [rsp+224] ; r4 = write pointer into the filtered stack copy, moving downwards
+ paddw m0, m1
+ lea r5d, [wq+32] ; loop counter, reduced by 16 per pass
+ psrlw m0, 2
+ mova [r4], m0
+.h64_filter_loop: ; filters 16 more edge pixels per pass
+ mova m0, [tlq-62]
+ paddw m1, m2, [tlq-66]
+ paddw m0, [tlq-64]
+ pavgw m1, [tlq-58] ; pavgw of (x + 3) and y = (x + y + 4) >> 1
+ paddw m0, [tlq-60]
+ sub tlq, 32
+ sub r4, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r4], m0
+ sub r5d, 16
+ jg .h64_filter_loop
+ mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ lea tlq, [rsp+254] ; from here on the edge is read from the filtered stack copy
+ pavgw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ psrlw m0, 2
+ mova [r4-32], m0
+.h64_main:
+ neg maxbaseq
+ movd xm4, dyd
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; m6 = edge pixel at index -maxbase, used as the fill value
+ shl maxbased, 6
+ vpbroadcastw m4, xm4 ; m4 = dy, added to the ypos vector after every column
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ vpbroadcastd m7, [pw_m1024] ; m7/m8/m9 = 1x/2x/3x pw_m1024, thresholds for the blend masks below
+ movd xm3, r4d
+ lea r4, [dyq+63] ; scalar ypos
+ paddw m8, m7, m7
+ vpbroadcastw m3, xm3
+ or maxbased, 63 ; bound that the scalar ypos in r4d is compared against
+ paddw m9, m8, m7
+ psubw m3, [z_base_inc]
+.h64_loop: ; one column (4 x 32 bytes) per pass, pushed below rsp
+ mov r5, r4
+ sar r5, 6 ; integer part of ypos
+ movu m1, [tlq+r5*2-128]
+ movu m0, [tlq+r5*2-126]
+ pand m2, m5, m3
+ psllw m2, 9 ; masked ypos bits scaled into a pmulhrsw weight
+ psubw m1, m0
+ pmulhrsw m1, m2
+ sub rsp, 128 ; make room for this column
+ paddw m0, m1
+ pcmpgtw m1, m9, m3 ; mask = m9 > ypos vector
+ vpblendvb m0, m6, m0, m1 ; interpolated value where the mask is set, m6 elsewhere
+ mova [rsp+32*0], m0
+ movu m1, [tlq+r5*2-96]
+ movu m0, [tlq+r5*2-94]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m8, m3 ; mask = m8 > ypos vector
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*1], m0
+ movu m1, [tlq+r5*2-64]
+ movu m0, [tlq+r5*2-62]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m7, m3 ; mask = m7 > ypos vector
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*2], m0
+ movu m1, [tlq+r5*2-32]
+ movu m0, [tlq+r5*2-30]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ add r4, dyq
+ psraw m2, m3, 15 ; sign of the ypos vector = mask for the last 32 bytes
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [rsp+32*3], m0
+ dec wd
+ jz .h64_transpose
+ cmp r4d, maxbased
+ jg .h64_loop
+.h64_end_loop: ; the remaining columns are pushed as pure fill value
+ sub rsp, 128
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m6
+ mova [rsp+32*2], m6
+ mova [rsp+32*3], m6
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ mov r5, dstq ; r5 = start of the destination block
+ lea r4, [strideq+r2*2] ; stride*7
+.h64_transpose_loop0: ; outer pass: the 16 columns currently at the top of the stack
+ lea r6, [rsp+112]
+ lea dstq, [r5+org_wq*2-32] ; 32 bytes left of byte offset org_w*2 in the first row
+.h64_transpose_loop: ; inner pass: transposes 16 bytes of each of the 16 columns into 8 rows of 32 bytes
+ mova xm0, [r6+128*15]
+ vinserti128 m0, [r6+128* 7], 1
+ mova xm1, [r6+128*14]
+ vinserti128 m1, [r6+128* 6], 1
+ mova xm2, [r6+128*13]
+ vinserti128 m2, [r6+128* 5], 1
+ mova xm3, [r6+128*12]
+ vinserti128 m3, [r6+128* 4], 1
+ mova xm4, [r6+128*11]
+ vinserti128 m4, [r6+128* 3], 1
+ mova xm5, [r6+128*10]
+ vinserti128 m5, [r6+128* 2], 1
+ mova xm6, [r6+128* 9]
+ vinserti128 m6, [r6+128* 1], 1
+ mova xm7, [r6+128* 8]
+ vinserti128 m7, [r6+128* 0], 1
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ sub r6, 16 ; next 16-byte slice of every column
+ punpckhdq m7, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m3, m5
+ punpckldq m3, m5
+ punpckhqdq m5, m7, m1
+ punpcklqdq m7, m1
+ punpckhqdq m1, m8, m3
+ punpcklqdq m8, m3
+ punpckhdq m3, m0, m2
+ mova [dstq+strideq*0], m5
+ punpckldq m0, m2
+ mova [dstq+strideq*1], m7
+ punpckhdq m2, m4, m6
+ mova [dstq+strideq*2], m1
+ punpckldq m4, m6
+ mova [dstq+r2 ], m8
+ punpckhqdq m6, m3, m2
+ mova [dstq+strideq*4], m6
+ punpcklqdq m3, m2
+ mova [dstq+r3 ], m3
+ punpckhqdq m2, m0, m4
+ mova [dstq+r2*2 ], m2
+ punpcklqdq m0, m4
+ mova [dstq+r4 ], m0
+ lea dstq, [dstq+strideq*8] ; continue 8 rows further down
+ cmp r6, rsp
+ jae .h64_transpose_loop
+ add rsp, 128*16 ; release the 16 columns just written
+ sub org_wd, 16
+ jg .h64_transpose_loop0
+.h64_end:
+ RET
+
+%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax -- register numbers, except shuf which may also be a memory operand; reads m1-m5 set up by the caller; overwrites src and tmp
+%ifnum %4
+ pshufb xm%2, xm%4
+%else
+ pshufb xm%2, %4
+%endif
+ vinserti128 m%2, xm%2, 1 ; copy the shuffled low lane into the high lane
+ pshufd m%1, m%2, q0000 ; broadcast dword 0 within each lane
+ pmaddwd m%1, m2
+ pshufd m%3, m%2, q1111 ; broadcast dword 1 within each lane
+ pmaddwd m%3, m3
+ paddd m%1, m1 ; add the dword constant held in m1
+ paddd m%1, m%3
+ pshufd m%3, m%2, q2222 ; broadcast dword 2 within each lane
+ pmaddwd m%3, m4
+ paddd m%1, m%3
+ pshufd m%3, m%2, q3333 ; broadcast dword 3 within each lane
+ pmaddwd m%3, m5
+ paddd m%1, m%3
+ psrad m%1, 4
+ packusdw m%1, m%1 ; dwords -> words with unsigned saturation
+ pminsw m%1, m%5 ; clamp above at bdmax
+%endmacro
+
+%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax -- register numbers; same sum as FILTER_1BLK computed separately for the low lane (-> dst) and the high lane (-> tmp_dst) of src, then packed together
+ pshufb m%2, m%6
+ vpermq m%4, m%2, q3232 ; tmp_src = high lane of src in both lanes
+ vinserti128 m%2, xm%2, 1 ; src = low lane of src in both lanes
+ pshufd m%1, m%2, q0000 ; broadcast dword 0 within each lane
+ pshufd m%3, m%4, q0000
+ pmaddwd m%1, m2
+ pmaddwd m%3, m2
+ paddd m%1, m1 ; add the dword constant held in m1
+ paddd m%3, m1
+ pshufd m%5, m%2, q1111 ; broadcast dword 1 within each lane
+ pmaddwd m%5, m3
+ paddd m%1, m%5
+ pshufd m%5, m%4, q1111
+ pmaddwd m%5, m3
+ paddd m%3, m%5
+ pshufd m%5, m%2, q2222 ; broadcast dword 2 within each lane
+ pmaddwd m%5, m4
+ paddd m%1, m%5
+ pshufd m%5, m%4, q2222
+ pmaddwd m%5, m4
+ paddd m%3, m%5
+ pshufd m%5, m%2, q3333 ; broadcast dword 3 within each lane
+ pmaddwd m%5, m5
+ paddd m%1, m%5
+ pshufd m%5, m%4, q3333
+ pmaddwd m%5, m5
+ paddd m%3, m%5
+ psrad m%1, 4
+ psrad m%3, 4
+ packusdw m%1, m%3 ; dwords -> words with unsigned saturation, both results in dst
+ pminsw m%1, m%7 ; clamp above at bdmax
+%endmacro
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row. One redundant
+; block is calculated for w8 and w16, two for w32.
+;    w4     w8       w16             w32
+;     1     1 2     1 2 3 5     1 2 3 5 b c d f
+;     2     2 3     2 4 5 7     2 4 5 7 c e f h
+;     3     3 4     4 6 7 9     4 6 7 9 e g h j
+; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
+;           5       8           8       i
+
+cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
+%assign org_stack_offset stack_offset
+%define base r6-ipred_filter_16bpc_avx2_table
+ lea r6, [filter_intra_taps]
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ add filterq, r6
+ lea r6, [ipred_filter_16bpc_avx2_table]
+ vbroadcasti128 m0, [tlq-6]
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m1, [base+pd_8]
+ pmovsxbw m2, [filterq+16*0]
+ pmovsxbw m3, [filterq+16*1]
+ pmovsxbw m4, [filterq+16*2]
+ pmovsxbw m5, [filterq+16*3]
+ add wq, r6
+ mov hd, hm
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 10
+ mova xm8, [base+filter_shuf2]
+ vpbroadcastw m9, r8m ; bitdepth_max
+ lea r7, [6+hq*2]
+ sub tlq, r7
+ jmp .w4_loop_start
+.w4_loop:
+ pinsrq xm0, [tlq+hq*2], 0
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER_1BLK 6, 0, 7, 8, 9
+ vextracti128 xm0, m6, 1
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm0
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vbroadcasti128 m14, [base+filter_shuf3]
+ vpbroadcastw m15, r8m ; bitdepth_max
+ FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15
+ vpermq m6, m10, q1302 ; ____ ____ | ____ 4321
+ pslldq m8, m0, 4
+ psrldq m7, m6, 2
+ psrldq m0, m6, 10
+ punpcklwd m7, m0
+ vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321
+ vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321
+ vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321
+ lea r7, [16+hq*2]
+ sub tlq, r7
+ jmp .w8_loop_start
+.w8_loop:
+ vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321
+ vpermq m6, m9, q2031
+ psrldq m0, m6, 2
+ psrldq m6, 10
+ punpcklwd m6, m0
+ vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321
+ vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321
+ mova m10, m9
+.w8_loop_start:
+ vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321
+ call .main
+ vpblendd m10, m9, 0xCC
+ mova [dstq+strideq*0], xm10
+ vextracti128 [dstq+strideq*1], m10, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ %assign stack_offset stack_offset - stack_size_padded
+ ALLOC_STACK 32, 16
+ vpbroadcastw m15, r8m ; bitdepth_max
+ sub hd, 2
+ TAIL_CALL .w16_main, 0
+.w16_main:
+ mova xm10, [base+filter_shuf2]
+ FILTER_1BLK 13, 0, 6, 10, 15
+ vpermq m12, m13, q3120
+ mova xm14, [base+filter_shuf3]
+ vinserti128 m14, [base+filter_shuf1], 1
+ vpbroadcastq m0, [tlq+10]
+ vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____
+ psrldq m6, m12, 8
+ vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321
+ punpcklwd m6, m12
+ vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
+ vpblendd m13, m12, 0xCC
+ vpermq m12, m12, q2031 ; 6___ 5___
+ psrldq xm6, xm12, 2
+ psrldq xm8, xm12, 12
+ vpblendd xm6, xm8, 0x01
+ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_
+ FILTER_1BLK 11, 6, 8, 10, 15
+ vpermq m11, m11, q3120
+ pshufd m9, m11, q1032
+ movu m8, [tlq+6] ; __43 210_ | ____ ____
+ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____
+ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____
+ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+ lea r7, [20+hq*2]
+ sub tlq, r7
+ vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321
+ jmp .w16_loop_start
+.w16_loop:
+ vpermq m13, m13, q3322
+ vpermq m11, m9, q2020
+ vpermq m9, m9, q1302
+ vpermq m6, m12, q0123
+ psrldq m7, 4
+ vpblendd m13, m10, 0xCC
+ vpblendd m9, m7, 0x40
+ mova m0, [rsp+8]
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+.w16_loop_start:
+ mova m13, m12
+ vpblendd m0, [tlq+hq*2], 0x0C
+ psrldq m7, m12, 8
+ punpcklwd m7, m12
+ vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321
+ vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
+ vpermq m12, m10, q2031
+ mova [rsp+8], m0
+ psrldq m8, m11, 8
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 10
+ psrldq xm0, xm13, 2
+ punpcklwd m8, m11
+ punpcklwd xm7, xm6
+ vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321
+ vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321
+ vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321
+ call .main
+ vpermq m8, m11, q3120
+ vpblendd m6, m8, m9, 0xCC
+ mova [dstq+strideq*0+16], xm6
+ vextracti128 [dstq+strideq*1+16], m6, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ vpermq m8, m9, q3120
+ vextracti128 xm0, m8, 1 ; 4321 ____
+ pshufd xm11, xm11, q1032
+ vpblendd xm0, xm11, 0x02 ; 4321 0___
+ psrldq xm6, xm8, 2
+ psrldq xm7, xm8, 12
+ pblendw xm0, xm6, 0x4 ; 4321 05__
+ pblendw xm0, xm7, 0x2 ; 4321 056_
+ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
+ vpermq m12, m13, q1302
+ vpblendd m12, m10, 0xCC
+ vpblendd m9, m6, 0xCC
+ mova [dstq+strideq*0+ 0], xm12
+ mova [dstq+strideq*0+16], xm9
+ vextracti128 [dstq+strideq*1+ 0], m12, 1
+ vextracti128 [dstq+strideq*1+16], m9, 1
+ ret
+ALIGN function_align
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK 64, 16
+ vpbroadcastw m15, r8m ; bitdepth_max
+ sub hd, 2
+ lea r3, [dstq+32]
+ lea r5d, [hd*2+20]
+ call .w16_main
+ mov dstq, r3
+ lea tlq, [tlq+r5+32]
+ sub r5d, 20
+ shr r5d, 1
+ sub r5d, 2
+ lea r4, [dstq+strideq*2-2]
+DEFINE_ARGS dst, stride, tl, stride3, left, h
+ lea stride3q, [strideq*3]
+ movu m8, [tlq-6] ; 4321 0___
+ mova xm10, [base+filter_shuf2]
+ pinsrw xm0, xm8, [dstq+strideq*0-2], 2
+ pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_
+ pinsrw xm9, [leftq+strideq*0], 5
+ pinsrw xm9, [leftq+strideq*1], 4
+ FILTER_1BLK 13, 0, 6, 10, 15
+ vpermq m12, m13, q3120
+ mova xm14, [base+filter_shuf3]
+ vinserti128 m14, [base+filter_shuf1], 1
+ psrldq m6, m12, 8
+ punpcklwd m7, m6, m12
+ vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321
+ vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321
+ vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321
+ vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
+ vpblendd m13, m12, 0xCC
+ pinsrw xm9, [leftq+strideq*2], 3
+ pinsrw xm9, [leftq+stride3q ], 2
+ lea leftq, [leftq+strideq*4]
+ pinsrw xm9, [leftq+strideq*0], 1
+ pinsrw xm9, [leftq+strideq*1], 0
+ movq [rsp+32], xm9
+ mov r7d, 1
+ pslldq m8, m9, 4
+ vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____
+ vpermq m12, m12, q2031 ; 6___ 5___
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 12
+ vpblendd xm6, xm7, 0x01 ; ____ _56_
+ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_
+ FILTER_1BLK 11, 6, 7, 10, 15
+ vpermq m11, m11, q3120
+ pshufd m9, m11, q1032
+ vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____
+ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____
+ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____
+ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+ vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321
+ jmp .w32_loop_start
+.w32_loop_last:
+ mova m0, [rsp+0]
+ jmp .w32_loop
+.w32_loop_left:
+ mova m0, [rsp+0]
+ vpblendd m0, [rsp+32+r7*4-12], 0x0C
+ dec r7d
+ jg .w32_loop
+ cmp hd, 2
+ je .w32_loop
+ pinsrw xm6, [rsp+32], 6
+ pinsrw xm6, [leftq+strideq*2], 5
+ pinsrw xm6, [leftq+stride3q ], 4
+ lea leftq, [leftq+strideq*4]
+ pinsrw xm6, [leftq+strideq*0], 3
+ pinsrw xm6, [leftq+strideq*1], 2
+ pinsrw xm6, [leftq+strideq*2], 1
+ pinsrw xm6, [leftq+stride3q ], 0
+ lea leftq, [leftq+strideq*4]
+ movu [rsp+36], xm6
+ pinsrw xm6, [leftq+strideq*0], 1
+ pinsrw xm6, [leftq+strideq*1], 0
+ movd [rsp+32], xm6
+ mov r7d, 4
+.w32_loop:
+ vpermq m13, m13, q3322
+ vpermq m11, m9, q2020
+ vpermq m9, m9, q1302
+ vpermq m6, m12, q0123
+ psrldq m7, 4
+ vpblendd m13, m10, 0xCC
+ vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+.w32_loop_start:
+ mova m13, m12
+ psrldq m7, m12, 8
+ punpcklwd m7, m12
+ vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321
+ vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
+ vpermq m12, m10, q2031
+ mova [rsp+0], m0
+ psrldq m8, m11, 8
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 10
+ psrldq xm0, xm13, 2
+ punpcklwd m8, m11
+ punpcklwd xm7, xm6
+ vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321
+ vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321
+ vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321
+ call .main
+ vpermq m8, m11, q3120
+ vpblendd m6, m8, m9, 0xCC
+ mova [dstq+strideq*0+16], xm6
+ vextracti128 [dstq+strideq*1+16], m6, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop_left
+ jz .w32_loop_last
+ vpermq m8, m9, q3120
+ vextracti128 xm0, m8, 1 ; 4321 ____
+ pshufd xm11, xm11, q1032
+ vpblendd xm0, xm11, 0x02 ; 4321 0___
+ psrldq xm6, xm8, 2
+ psrldq xm7, xm8, 12
+ pblendw xm0, xm6, 0x4 ; 4321 05__
+ pblendw xm0, xm7, 0x2 ; 4321 056_
+ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
+ vpermq m12, m13, q1302
+ vpblendd m12, m10, 0xCC
+ vpblendd m9, m6, 0xCC
+ mova [dstq+strideq*0+ 0], xm12
+ mova [dstq+strideq*0+16], xm9
+ vextracti128 [dstq+strideq*1+ 0], m12, 1
+ vextracti128 [dstq+strideq*1+16], m9, 1
+ RET
+.main:
+ FILTER_2BLK 9, 8, 6, 7, 0, 14, 15
+ ret
+
+%if WIN64
+DECLARE_REG_TMP 5 ; WIN64: scratch register t0 takes slot 5 (presumably the same slot as the 6th cglobal name, `ac`, in the ipred_cfl_* functions below - they load acq only after their last use of t0)
+%else
+DECLARE_REG_TMP 7 ; other ABIs: scratch register t0 takes slot 7
+%endif
+
+%macro IPRED_CFL 1 ; %1 = register index: ac in, unpacked (not yet clamped) pixels out. Reads m0, m1, m2 (the .sN loops in this file set m0 = dc, m1 = alpha, m2 = |alpha| << 9); clobbers m3
+ psignw m3, m%1, m1 ; m3 = ac, negated where m1 < 0 and zeroed where m1 == 0 (carries the sign of alpha*ac)
+ pabsw m%1, m%1 ; |ac|
+ pmulhrsw m%1, m2 ; (|ac| * m2 + 0x4000) >> 15; with m2 = |alpha| << 9 this equals (|ac| * |alpha| + 32) >> 6
+ psignw m%1, m3 ; put the sign of alpha*ac back on the scaled magnitude
+ paddw m%1, m0 ; add m0 (the dc value in the callers)
+%endmacro
+
+cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha ; CfL prediction whose dc is the rounded mean of the w 16-bit pixels starting at tl+2
+ movifnidn hd, hm
+ add tlq, 2 ; skip the pixel at tl; the edge pixels that get averaged start at tl+2
+ movd xm4, wd
+ pxor m6, m6 ; m6 = 0
+ vpbroadcastw m7, r7m ; m7 = stack arg r7m in every word (presumably bitdepth_max, used as clamp bound by the store loops)
+ pavgw xm4, xm6 ; xm4 = (w + 1) >> 1: rounding bias for the mean
+ tzcnt wd, wd ; wd = log2(w)
+ movd xm5, wd ; xm5 = shift count that divides the sum by w
+ movu m0, [tlq] ; first 16 edge pixels (a full 32-byte load regardless of w)
+ lea t0, [ipred_cfl_left_16bpc_avx2_table]
+ movsxd r6, [t0+wq*4] ; signed 32-bit table entry, relative to the table base
+ add r6, t0 ; r6 = reduction entry selected by log2(w) (presumably an .hN label of ipred_cfl_left_16bpc)
+ add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table ; rebase t0 onto the splat table
+ movsxd wq, [t0+wq*4]
+ add wq, t0 ; wq = splat-table entry selected by log2(w); the reduction code ends with `jmp wq`
+ movifnidn acq, acmp ; deliberately after the last use of t0 (on WIN64 t0 is register slot 5, presumably the same as acq)
+ jmp r6
+
+cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha ; CfL prediction whose dc is the rounded mean of the h 16-bit pixels stored directly below tl in memory
+ mov hd, hm ; zero upper half
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq ; together with the first sub: tl -= 2*h bytes, i.e. the start of the h edge pixels
+ pxor m6, m6 ; m6 = 0
+ vpbroadcastw m7, r7m ; m7 = stack arg r7m in every word (presumably bitdepth_max)
+ pavgw xm4, xm6 ; xm4 = (h + 1) >> 1: rounding bias for the mean
+ tzcnt r6d, hd ; r6d = log2(h)
+ movd xm5, r6d ; xm5 = shift count that divides the sum by h
+ movu m0, [tlq] ; first 16 edge pixels (a full 32-byte load regardless of h)
+ lea t0, [ipred_cfl_left_16bpc_avx2_table]
+ movsxd r6, [t0+r6*4] ; signed 32-bit table entry, relative to the table base
+ add r6, t0 ; r6 = reduction entry selected by log2(h) (presumably one of the .hN labels below)
+ add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table ; rebase t0 onto the splat table
+ tzcnt wd, wd ; wd = log2(w)
+ movsxd wq, [t0+wq*4]
+ add wq, t0 ; wq = splat-table entry selected by log2(w)
+ movifnidn acq, acmp ; deliberately after the last use of t0 (on WIN64 t0 is register slot 5, presumably the same as acq)
+ jmp r6
+.h32: ; reduction ladder: each entry folds the vector in half and falls through to the next
+ paddw m0, [tlq+32] ; add pixels 16..31 onto pixels 0..15
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1 ; fold the upper 128-bit lane onto the lower one
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1 ; fold words 4..7 onto words 0..3
+.h4:
+ punpcklwd xm0, xm6 ; widen the four low word sums to dwords (m6 = 0)
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1 ; dword0 += dword1, dword2 += dword3
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1 ; dword0 = sum of all edge pixels
+ paddd xm0, xm4 ; add the rounding bias
+ psrld xm0, xm5 ; divide by the (power-of-two) pixel count
+ vpbroadcastw m0, xm0 ; m0 = dc in every word
+ jmp wq ; continue at the splat-table entry selected by log2(w)
+
+cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha ; CfL prediction: dc = rounded mean of the h pixels below tl and the w pixels from tl+2, then dst = clamp(dc + scaled ac)
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd ; r6d = log2(h)
+ lea t0d, [wq+hq] ; t0d = w + h = number of edge pixels averaged
+ movd xm4, t0d
+ tzcnt t0d, t0d ; trailing zero count of w+h: the power-of-two part of the divisor
+ movd xm5, t0d ; xm5 = shift count for that power-of-two part
+ lea t0, [ipred_cfl_16bpc_avx2_table]
+ tzcnt wd, wd ; wd = log2(w)
+ movsxd r6, [t0+r6*4] ; entry selected by log2(h) (presumably an .hN label)
+ movsxd wq, [t0+wq*4+4*4] ; entry selected by log2(w), four slots further into the table (presumably a .wN label)
+ psrlw xm4, 1 ; xm4 = (w + h) >> 1: rounding bias
+ pxor m6, m6 ; m6 = 0: zero source for unpacks and lower clamp bound
+ vpbroadcastw m7, r7m ; m7 = upper clamp bound (stack arg r7m, presumably bitdepth_max)
+ add r6, t0 ; table entries are relative to the table base
+ add wq, t0
+ movifnidn acq, acmp ; deliberately after the last use of t0 (on WIN64 t0 is register slot 5, presumably the same as acq)
+ jmp r6
+.h4:
+ movq xm0, [tlq-8] ; the 4 pixels (8 bytes) directly below tl; upper bits of m0 are zeroed
+ jmp wq
+.w4:
+ movq xm1, [tlq+2] ; the 4 pixels starting at tl+2
+ paddw m0, m4 ; add the rounding bias (lives in word 0 only)
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1 ; within each qword: word0 += word2, word1 += word3
+ psrld m1, m0, 16
+ paddw m0, m1 ; word 0 of each qword = sum of that qword's four words
+ cmp hd, 4
+ jg .w4_mul ; h > 4: w+h is not a power of two and more qwords hold partial sums
+ psrlw xm0, 3 ; h == 4: divide by w + h = 8
+ jmp .w4_end
+.w4_mul:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1 ; fold the upper 128-bit lane onto the lower one
+ lea r2d, [hq*2] ; shift count 2*h; shrx uses it modulo 32 for a 32-bit operand
+ mov r6d, 0xAAAB6667
+ shrx r6d, r6d, r2d ; low word: 0xAAAB for h == 8 (count 16), 0x6667 for h == 16 (count 32 -> 0)
+ punpckhwd xm1, xm0, xm6
+ punpcklwd xm0, xm6 ; widen to dwords before adding the two qword sums
+ paddd xm0, xm1 ; dword 0 = total sum
+ movd xm1, r6d
+ psrld xm0, 2 ; divide by 4, the power-of-two part of w+h (12 or 20)
+ pmulhuw xm0, xm1 ; (x * mul) >> 16
+ psrlw xm0, 1 ; combined >> 17: 0xAAAB ~= 2^17/3 and 0x6667 ~= 2^17/5, completing the /3 or /5
+.w4_end:
+ vpbroadcastw m0, xm0 ; m0 = dc in every word
+.s4: ; store path for w == 4
+ vpbroadcastw m1, alpham ; m1 = alpha
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9 ; m2 = |alpha| << 9, the pmulhrsw multiplier used by IPRED_CFL
+.s4_loop:
+ mova m4, [acq] ; 16 ac coefficients = 4 rows of 4
+ IPRED_CFL 4
+ pmaxsw m4, m6 ; clamp to >= 0
+ pminsw m4, m7 ; clamp to <= m7
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4 ; row 0 = qword 0
+ movq [dstq+strideq*2], xm5 ; row 2 = qword 2
+ movhps [dstq+strideq*1], xm4 ; row 1 = qword 1
+ movhps [dstq+r6 ], xm5 ; row 3 = qword 3
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4 ; four rows per iteration
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16] ; the 8 pixels directly below tl; upper lane of m0 is zeroed
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1 ; upper lane of the .hN load (zero for h <= 8)
+ paddw xm0, [tlq+2] ; add the 8 pixels starting at tl+2
+ paddw xm0, xm4 ; add the rounding bias
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1 ; even words += following odd words
+ pblendw xm0, xm6, 0xAA ; zero the odd words, leaving four dword partial sums
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1 ; dword 0 = total sum
+ psrld xm0, xm5 ; divide by the power-of-two part of w+h
+ cmp hd, 8
+ je .w8_end ; w == h: w+h is a power of two, nothing left to divide
+ mov r6d, 0xAAAB ; ~2^17/3 (w+h = 12 or 24)
+ mov r2d, 0x6667 ; ~2^17/5 (w+h = 40)
+ cmp hd, 32
+ cmovz r6d, r2d ; pick the /5 multiplier only for h == 32
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1 ; (x * mul) >> 17
+.w8_end:
+ vpbroadcastw m0, xm0 ; m0 = dc in every word
+.s8: ; store path for w == 8
+ vpbroadcastw m1, alpham ; m1 = alpha
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9 ; m2 = |alpha| << 9
+.s8_loop:
+ mova m4, [acq] ; ac rows 0-1
+ mova m5, [acq+32] ; ac rows 2-3
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6 ; clamp both vectors to [0, m7]
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+strideq*0], xm4
+ mova [dstq+strideq*2], xm5
+ vextracti128 [dstq+strideq*1], m4, 1
+ vextracti128 [dstq+r6 ], m5, 1
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4 ; four rows per iteration
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32] ; the 16 pixels directly below tl
+ jmp wq
+.w16:
+ paddw m0, [tlq+2] ; add the 16 pixels starting at tl+2
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4 ; add the rounding bias
+ paddw xm0, xm1 ; fold the upper lane onto the lower one
+ punpckhwd xm1, xm0, xm6
+ punpcklwd xm0, xm6 ; widen the eight word sums to dwords
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1 ; dword 0 = total sum
+ psrld xm0, xm5 ; divide by the power-of-two part of w+h
+ cmp hd, 16
+ je .w16_end ; w == h: nothing left to divide
+ mov r6d, 0xAAAB ; ~2^17/3
+ mov r2d, 0x6667 ; ~2^17/5
+ test hb, 8|32
+ cmovz r6d, r2d ; h is neither 8 nor 32 (w+h = 20 or 80): use the /5 multiplier
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1 ; (x * mul) >> 17
+.w16_end:
+ vpbroadcastw m0, xm0 ; m0 = dc in every word
+.s16: ; store path for w == 16
+ vpbroadcastw m1, alpham ; m1 = alpha
+ pabsw m2, m1
+ psllw m2, 9 ; m2 = |alpha| << 9
+.s16_loop:
+ mova m4, [acq] ; ac row 0
+ mova m5, [acq+32] ; ac row 1
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6 ; clamp both rows to [0, m7]
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+strideq*0], m4
+ mova [dstq+strideq*1], m5
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2 ; two rows per iteration
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64] ; the 32 pixels directly below tl, pre-added into 16 word sums
+ paddw m0, [tlq-32]
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2] ; add the 32 pixels starting at tl+2
+ paddw m0, [tlq+34]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4 ; add the rounding bias
+ paddw xm0, xm1 ; fold the upper lane onto the lower one
+ punpcklwd xm1, xm0, xm6
+ punpckhwd xm0, xm6 ; widen the eight word sums to dwords
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1 ; dword 0 = total sum
+ psrld xm0, xm5 ; divide by the power-of-two part of w+h
+ cmp hd, 32
+ je .w32_end ; w == h: nothing left to divide
+ lea r2d, [hq*2] ; shift count 2*h; shrx uses it modulo 32
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d ; low word: 0x6667 for h == 8 (count 16), 0xAAAB when 2*h is a multiple of 32
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1 ; (x * mul) >> 17
+.w32_end:
+ vpbroadcastw m0, xm0 ; m0 = dc in every word
+.s32: ; store path for w == 32
+ vpbroadcastw m1, alpham ; m1 = alpha
+ pabsw m2, m1
+ psllw m2, 9 ; m2 = |alpha| << 9
+.s32_loop:
+ mova m4, [acq] ; left half of the ac row
+ mova m5, [acq+32] ; right half of the ac row
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6 ; clamp both halves to [0, m7]
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+32*0], m4
+ mova [dstq+32*1], m5
+ add dstq, strideq
+ add acq, 64
+ dec hd ; one row per iteration
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha ; CfL prediction with a constant dc loaded from the pw_512 table instead of averaging edge pixels
+ mov r6d, r7m ; stack arg r7m (presumably bitdepth_max)
+ shr r6d, 11 ; r6d = r7m >> 11, used as dword index into the constants at pw_512
+ lea t0, [ipred_cfl_splat_16bpc_avx2_table]
+ tzcnt wd, wd ; wd = log2(w)
+ movifnidn hd, hm
+ movsxd wq, [t0+wq*4] ; splat-table entry selected by log2(w), relative to the table base
+ vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] ; m0 = dc = dword at pw_512 + 4*r6, addressed through t0 (NOTE(review): relies on the constant stored right after pw_512 for index 1 - confirm against the data section)
+ pxor m6, m6 ; m6 = 0
+ vpbroadcastw m7, r7m ; m7 = r7m in every word
+ add wq, t0
+ movifnidn acq, acmp ; deliberately after the last use of t0 (on WIN64 t0 is register slot 5, presumably the same as acq)
+ jmp wq
+
+cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h ; builds the ac buffer from 2x2-summed ypx pixels, replicates the last column/row into padding, then subtracts the rounded mean
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_2] ; pmaddwd multiplier (presumably words of 2, giving 2*(p0+p1) per horizontal pixel pair)
+ mov hd, hm
+ shl hpadd, 2 ; hpad *= 4: number of padded output rows
+ pxor m4, m4 ; m4 = running dword sums of everything written to ac
+ sub hd, hpadd ; hd = output rows actually computed from ypx
+ cmp dword wm, 8
+ jg .w16 ; w > 8
+ je .w8 ; w == 8; w < 8 falls through to .w4
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+.w4_loop:
+ mova xm0, [ypxq+strideq*2] ; low lanes: input rows 2 and 3
+ mova xm1, [ypxq+r3 ]
+ vinserti128 m0, [ypxq+strideq*0], 1 ; high lanes: input rows 0 and 1
+ vinserti128 m1, [ypxq+strideq*1], 1
+ lea ypxq, [ypxq+strideq*4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ paddd m0, m1 ; add the two rows of each pair: high lane = output row 0, low lane = output row 1
+ vextracti128 xm1, m0, 1
+ paddd m4, m0
+ packssdw xm1, xm0 ; xm1 = output row 0 (4 words) followed by output row 1
+ mova [acq], xm1
+ add acq, 16
+ sub hd, 2 ; two output rows per iteration
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .dc
+ vpermq m1, m1, q1111 ; replicate the last output row into all four qwords
+ pslld xm0, 2 ; last row's dword sums * 4 (the xmm-width op also clears the upper lane of m0)
+.w4_hpad_loop:
+ mova [acq], m1 ; four padded rows
+ paddd m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .dc
+.w8:
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+ test wpadd, wpadd
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1 ; eight dword 2x2 sums of one output row
+ vextracti128 xm1, m0, 1
+ paddd m4, m0
+ packssdw xm1, xm0, xm1 ; pack to eight words, in order
+ mova [acq], xm1
+ add acq, 16
+ dec hd ; one output row per iteration
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz .dc
+ vinserti128 m1, xm1, 1 ; m1 = last output row in both lanes
+ pslld m0, 2 ; last row's dword sums * 4: .hpad stores four w8 rows per iteration
+ jmp .hpad
+.w8_wpad1: ; w == 8 with right padding: only the first 16 bytes of each input row are read
+ pmaddwd xm0, xm5, [ypxq+strideq*0]
+ pmaddwd xm3, xm5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd xm0, xm3 ; four valid dword sums
+ pshufd xm3, xm0, q3333 ; replicate the last valid sum for the padded half
+ packssdw xm1, xm0, xm3 ; four valid words followed by four copies of the last one
+ paddd xm0, xm3
+ paddd xm4, xm0
+ mova [acq], xm1
+ add acq, 16
+ dec hd
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad: ; w == 16 with right padding; the branch taken depends on wpad
+ mova m0, [ypxq+strideq*0+ 0]
+ mova m1, [ypxq+strideq*1+ 0]
+ cmp wpadd, 2
+ jl .w16_wpad1
+ je .w16_wpad2
+ vpbroadcastd m2, [ypxq+strideq*0+12] ; wpad > 2: only bytes 0..15 are valid; replicate the pixel pair at byte 12
+ vpbroadcastd m3, [ypxq+strideq*1+12]
+ vpblendd m0, m2, 0xf0 ; overwrite the upper lane of the left half with the replicated pair
+ vpblendd m1, m3, 0xf0
+ jmp .w16_wpad_end
+.w16_wpad2: ; wpad == 2: bytes 0..31 are valid; the right half is the pixel pair at byte 28 replicated
+ vpbroadcastd m2, [ypxq+strideq*0+28]
+ vpbroadcastd m3, [ypxq+strideq*1+28]
+ jmp .w16_wpad_end
+.w16_wpad1: ; wpad < 2: bytes 0..47 are valid; only the upper lane of the right half is replicated (pair at byte 44)
+ vpbroadcastd m2, [ypxq+strideq*0+44]
+ vpbroadcastd m3, [ypxq+strideq*1+44]
+ vinserti128 m2, [ypxq+strideq*0+32], 0
+ vinserti128 m3, [ypxq+strideq*1+32], 0
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ REPX {pmaddwd x, m5}, m0, m1, m2, m3
+ paddd m0, m1 ; left half: sum of the two input rows
+ paddd m2, m3 ; right half
+ packssdw m1, m0, m2 ; packs per 128-bit lane, so the qwords come out interleaved
+ paddd m0, m2
+ vpermq m1, m1, q3120 ; restore left-to-right order
+ paddd m4, m0
+ mova [acq], m1
+ add acq, 32
+ dec hd ; one output row per iteration
+ jg .w16_wpad
+ jmp .w16_hpad
+.w16:
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+ 0]
+ pmaddwd m2, m5, [ypxq+strideq*0+32]
+ pmaddwd m1, m5, [ypxq+strideq*1+ 0]
+ pmaddwd m3, m5, [ypxq+strideq*1+32]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1 ; left half: sum of the two input rows
+ paddd m2, m3 ; right half
+ packssdw m1, m0, m2 ; packs per 128-bit lane, so the qwords come out interleaved
+ paddd m0, m2
+ vpermq m1, m1, q3120 ; restore left-to-right order
+ paddd m4, m0
+ mova [acq], m1
+ add acq, 32
+ dec hd ; one output row per iteration
+ jg .w16_loop
+.w16_hpad: ; also entered from ipred_cfl_ac_422_16bpc
+ add hpadd, hpadd ; double the row count: .hpad stores only two w16 rows per `sub hpadd, 4`; also sets ZF when hpad == 0
+ jz .dc
+ paddd m0, m0 ; last row's dword sums * 2
+.hpad: ; shared padding loop (also entered from the 422/444 functions): m1 = 32 bytes to replicate, m0 = dword sums added per iteration
+ mova [acq+32*0], m1
+ paddd m4, m0
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hpadd, 4
+ jg .hpad
+.dc: ; shared tail (also entered from the 422/444 functions): subtract the rounded mean from every ac value
+ vextracti128 xm1, m4, 1
+ sub r5, acq ; -w*h*2
+ tzcnt r1d, r5d ; r1d = log2 of the buffer size in bytes (r1 is used as scratch from here on)
+ paddd xm4, xm1 ; horizontal add of the eight dword sums starts here
+ sub r1d, 2 ; shift = log2(w*h) - 1; the remaining halving is done with rounding by pavgw
+ punpckhqdq xm1, xm4, xm4
+ movd xm0, r1d
+ paddd xm1, xm4
+ pshuflw xm4, xm1, q1032 ; swap dwords 0 and 1
+ paddd xm1, xm4 ; dword 0 = sum of all ac values
+ psrld xm1, xm0
+ pxor xm0, xm0
+ pavgw xm1, xm0 ; (x + 1) >> 1: rounded mean
+ vpbroadcastw m1, xm1
+.dc_loop:
+ mova m0, [acq+r5] ; r5 runs from -size up to 0, acq points at the buffer end
+ psubw m0, m1
+ mova [acq+r5], m0
+ add r5, 32
+ jl .dc_loop
+ RET
+
+cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h ; builds the ac buffer from horizontally pair-summed ypx pixels (one output row per input row); padding and mean removal reuse the tails of ipred_cfl_ac_420_16bpc
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_4] ; pmaddwd multiplier (presumably words of 4, giving 4*(p0+p1) per horizontal pixel pair)
+ mov hd, hm
+ shl hpadd, 2 ; hpad *= 4: number of padded output rows
+ pxor m4, m4 ; m4 = running dword sums of everything written to ac
+ sub hd, hpadd ; hd = rows actually computed from ypx
+ cmp dword wm, 8
+ jg .w16 ; w > 8
+ je .w8 ; w == 8; w < 8 falls through to .w4
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+.w4_loop:
+ mova xm0, [ypxq+strideq*0] ; m0 = rows 0 (low lane) and 2 (high lane)
+ mova xm1, [ypxq+strideq*1] ; m1 = rows 1 (low lane) and 3 (high lane)
+ vinserti128 m0, [ypxq+strideq*2], 1
+ vinserti128 m1, [ypxq+r3 ], 1
+ lea ypxq, [ypxq+strideq*4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ paddd m4, m0
+ packssdw m0, m1 ; per-lane pack yields rows 0,1 in the low lane and rows 2,3 in the high lane
+ paddd m4, m1
+ mova [acq], m0
+ add acq, 32
+ sub hd, 4 ; four rows per iteration
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vextracti128 xm1, m1, 1 ; dword sums of the last row (row 3)
+ vpermq m0, m0, q3333 ; replicate the last row into all four qwords
+ pslld xm1, 2 ; last row's sums * 4
+.w4_hpad_loop:
+ mova [acq], m0 ; four padded rows
+ paddd m4, m1
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w8:
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+ test wpadd, wpadd
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m1, m5, [ypxq+strideq*0] ; row 0
+ pmaddwd m0, m5, [ypxq+strideq*1] ; row 1
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m1
+ packssdw m1, m0 ; per-lane pack: qwords = row0 lo, row1 lo, row0 hi, row1 hi
+ paddd m4, m0
+ vpermq m2, m1, q3120 ; reorder to row 0 followed by row 1
+ mova [acq], m2
+ add acq, 32
+ sub hd, 2 ; two rows per iteration
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vpermq m1, m1, q3131 ; m1 = the last row (row 1 of the final pair) in both lanes
+ pslld m0, 2 ; last row's dword sums * 4 for the shared .hpad loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w8_wpad1: ; w == 8 with right padding: bytes 0..15 are valid, the rest is the pixel pair at byte 12 replicated
+ vpbroadcastd m1, [ypxq+strideq*0+12]
+ vpbroadcastd m0, [ypxq+strideq*1+12]
+ vinserti128 m1, [ypxq+strideq*0+ 0], 0 ; the low lane keeps the valid pixels
+ vinserti128 m0, [ypxq+strideq*1+ 0], 0
+ lea ypxq, [ypxq+strideq*2]
+ pmaddwd m1, m5
+ pmaddwd m0, m5
+ paddd m4, m1
+ packssdw m1, m0
+ paddd m4, m0
+ vpermq m2, m1, q3120 ; reorder to row 0 followed by row 1
+ mova [acq], m2
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16:
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmaddwd m2, m5, [ypxq+strideq*0+ 0] ; row 0, left half
+ pmaddwd m1, m5, [ypxq+strideq*0+32] ; row 0, right half
+ pmaddwd m0, m5, [ypxq+strideq*1+ 0] ; row 1, left half
+ pmaddwd m3, m5, [ypxq+strideq*1+32] ; row 1, right half
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m2
+ packssdw m2, m1 ; row 0 words (lane-interleaved)
+ paddd m4, m1
+ packssdw m1, m0, m3 ; row 1 words (lane-interleaved)
+ paddd m0, m3 ; m0 = dword sums of row 1, kept for the padding code
+ vpermq m2, m2, q3120
+ paddd m4, m0
+ vpermq m1, m1, q3120 ; m1 = row 1 in order; it is the row replicated by the padding code
+ mova [acq+32*0], m2
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hd, 2 ; two rows per iteration
+ jg .w16_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
+.w16_wpad: ; w == 16 with right padding; the branch taken depends on wpad
+ mova m2, [ypxq+strideq*0+ 0]
+ mova m0, [ypxq+strideq*1+ 0]
+ cmp wpadd, 2
+ jl .w16_wpad1
+ je .w16_wpad2
+ vpbroadcastd m1, [ypxq+strideq*0+12] ; wpad > 2: only bytes 0..15 are valid; replicate the pixel pair at byte 12
+ vpbroadcastd m3, [ypxq+strideq*1+12]
+ vpblendd m2, m1, 0xf0 ; overwrite the upper lane of the left half
+ vpblendd m0, m3, 0xf0
+ jmp .w16_wpad_end
+.w16_wpad2: ; wpad == 2: bytes 0..31 are valid; the right half is the pixel pair at byte 28 replicated
+ vpbroadcastd m1, [ypxq+strideq*0+28]
+ vpbroadcastd m3, [ypxq+strideq*1+28]
+ jmp .w16_wpad_end
+.w16_wpad1: ; wpad < 2: bytes 0..47 are valid; only the upper lane of the right half is replicated (pair at byte 44)
+ vpbroadcastd m1, [ypxq+strideq*0+44]
+ vpbroadcastd m3, [ypxq+strideq*1+44]
+ vinserti128 m1, [ypxq+strideq*0+32], 0
+ vinserti128 m3, [ypxq+strideq*1+32], 0
+.w16_wpad_end: ; from here on identical to the body of .w16_loop
+ lea ypxq, [ypxq+strideq*2]
+ REPX {pmaddwd x, m5}, m2, m0, m1, m3
+ paddd m4, m2
+ packssdw m2, m1
+ paddd m4, m1
+ packssdw m1, m0, m3
+ paddd m0, m3 ; m0 = dword sums of row 1, kept for the padding code
+ vpermq m2, m2, q3120
+ paddd m4, m0
+ vpermq m1, m1, q3120
+ mova [acq+32*0], m2
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hd, 2
+ jg .w16_wpad
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
+
+cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h ; builds the ac buffer as ypx << 3 without subsampling; padding and mean removal reuse the tails of ipred_cfl_ac_420_16bpc
+ lea r6, [ipred_cfl_ac_444_16bpc_avx2_table]
+ tzcnt wd, wm ; wd = log2(w)
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_1] ; pmaddwd multiplier (presumably words of 1, so pmaddwd just sums adjacent word pairs into dwords)
+ movsxd wq, [r6+wq*4] ; table entry selected by log2(w), relative to the table base
+ shl hpadd, 2 ; hpad *= 4: number of padded rows
+ add wq, r6
+ mov hd, hm
+ pxor m4, m4 ; m4 = running dword sums of everything written to ac
+ sub hd, hpadd ; hd = rows actually read from ypx
+ jmp wq
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+.w4_loop:
+ movq xm0, [ypxq+strideq*0] ; gather four rows of 4 pixels into one ymm: row 0
+ movhps xm0, [ypxq+strideq*1] ; row 1
+ vpbroadcastq m1, [ypxq+strideq*2]
+ vpbroadcastq m2, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ vpblendd m0, m1, 0x30 ; row 2 into dwords 4-5
+ vpblendd m0, m2, 0xc0 ; row 3 into dwords 6-7
+ psllw m0, 3 ; ac = pixel << 3
+ pmaddwd m1, m0, m5 ; dword sums for the running total
+ mova [acq], m0
+ add acq, 32
+ paddd m4, m1
+ sub hd, 4 ; four rows per iteration
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vpermq m0, m0, q3333 ; replicate the last row into all four qwords
+ paddd m1, m1
+ mova [acq+32*0], m0 ; writes exactly eight padded rows, no loop (NOTE(review): assumes hpad is always 8 rows on this path - confirm against callers)
+ vpermq m1, m1, q3333 ; last row's sums * 2 in four qwords = sum of eight rows
+ mova [acq+32*1], m0
+ add acq, 32*2
+ paddd m4, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w8:
+ lea r3, [strideq*3]
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+.w8_loop:
+ mova xm2, [ypxq+strideq*0] ; m2 = rows 0 and 1
+ vinserti128 m2, [ypxq+strideq*1], 1
+ mova xm1, [ypxq+strideq*2] ; m1 = rows 2 and 3
+ vinserti128 m1, [ypxq+r3 ], 1
+ lea ypxq, [ypxq+strideq*4]
+ psllw m2, 3 ; ac = pixel << 3
+ psllw m1, 3
+ mova [acq+32*0], m2
+ pmaddwd m2, m5
+ mova [acq+32*1], m1
+ pmaddwd m0, m1, m5 ; m0 = dword sums of rows 2 and 3, kept for the padding code
+ add acq, 32*2
+ paddd m4, m2
+ paddd m4, m0
+ sub hd, 4 ; four rows per iteration
+ jg .w8_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vperm2i128 m1, m1, 0x11 ; m1 = the last row (row 3) in both lanes
+ pslld m0, 2 ; sums * 4 for the four rows stored per .hpad iteration
+ pxor m2, m2
+ vpblendd m0, m2, 0x0f ; drop the row 2 sums from the low lane, keeping only the last row's
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w16_wpad2: ; w == 16 with right padding: the upper lane of each row is replaced by the pixel at byte 14 (8 valid pixels)
+ vpbroadcastw m3, [ypxq+strideq*0+14]
+ vpbroadcastw m0, [ypxq+strideq*1+14]
+ vpblendd m2, m3, 0xf0
+ vpblendd m1, m0, 0xf0
+ jmp .w16_wpad_end
+.w16:
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+.w16_loop:
+ mova m2, [ypxq+strideq*0] ; row 0
+ mova m1, [ypxq+strideq*1] ; row 1
+ test wpadd, wpadd
+ jnz .w16_wpad2 ; any non-zero wpad takes the single padding path above
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ psllw m2, 3 ; ac = pixel << 3
+ psllw m1, 3
+ mova [acq+32*0], m2
+ pmaddwd m2, m5
+ mova [acq+32*1], m1
+ pmaddwd m0, m1, m5 ; m0 = dword sums of the last row, kept for the padding code
+ add acq, 32*2
+ paddd m4, m2
+ paddd m4, m0
+ sub hd, 2 ; two rows per iteration
+ jg .w16_loop
+ add hpadd, hpadd ; double the row count: .hpad stores two w16 rows per `sub hpadd, 4`; also sets ZF when hpad == 0
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ paddd m0, m0 ; last row's sums * 2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w32:
+ mov r5, acq ; r5 = start of the ac buffer, needed by .dc
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ mova m0, [ypxq+ 0] ; left half of the row
+ mova m1, [ypxq+32] ; right half of the row
+ add ypxq, strideq
+ psllw m0, 3 ; ac = pixel << 3
+ psllw m1, 3
+ pmaddwd m2, m0, m5
+ mova [acq+32*0], m0
+ pmaddwd m3, m1, m5
+ mova [acq+32*1], m1
+ add acq, 32*2
+ paddd m2, m3 ; m2 = dword sums of the whole row, kept for the padding code
+ paddd m4, m2
+ dec hd ; one row per iteration
+ jg .w32_loop
+.w32_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ paddd m2, m2 ; last row's sums * 2: two rows are stored per iteration
+.w32_hpad_loop:
+ mova [acq+32*0], m0 ; m0/m1 still hold the last row written
+ mova [acq+32*1], m1
+ paddd m4, m2
+ mova [acq+32*2], m0
+ mova [acq+32*3], m1
+ add acq, 32*4
+ sub hpadd, 2
+ jg .w32_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w32_wpad: ; w == 32 with right padding; the branch taken depends on wpad
+ mova m0, [ypxq+ 0]
+ cmp wpadd, 4
+ jl .w32_wpad2
+ je .w32_wpad4
+ vpbroadcastw m1, [ypxq+14] ; wpad > 4: 8 valid pixels; everything after them is the pixel at byte 14
+ vpblendd m0, m1, 0xf0
+ jmp .w32_wpad_end
+.w32_wpad4: ; wpad == 4: 16 valid pixels; the right half is the pixel at byte 30
+ vpbroadcastw m1, [ypxq+30]
+ jmp .w32_wpad_end
+.w32_wpad2: ; wpad < 4: 24 valid pixels; only the upper lane of the right half is replicated (pixel at byte 46)
+ vpbroadcastw m1, [ypxq+46]
+ vinserti128 m1, [ypxq+32], 0
+.w32_wpad_end: ; from here on identical to the body of .w32_loop
+ add ypxq, strideq
+ psllw m0, 3
+ psllw m1, 3
+ pmaddwd m2, m0, m5
+ mova [acq+32*0], m0
+ pmaddwd m3, m1, m5
+ mova [acq+32*1], m1
+ add acq, 32*2
+ paddd m2, m3
+ paddd m4, m2
+ dec hd
+ jg .w32_wpad
+ jmp .w32_hpad
+
+cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h ; palette prediction: expands one index byte per pixel into a 16-bit palette entry via pshufb lookups
+ vbroadcasti128 m3, [palq] ; 16 bytes of palette = eight 16-bit entries, in both lanes
+ lea r2, [pal_pred_16bpc_avx2_table] ; r2 is reused as scratch once the palette has been loaded
+ tzcnt wd, wm ; wd = log2(w)
+ vbroadcasti128 m4, [pal_pred_shuf]
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4] ; table entry selected by log2(w), relative to the table base
+ pshufb m3, m4 ; reorder palette bytes with pal_pred_shuf (presumably low bytes of all entries into the low qword, high bytes into the high qword)
+ punpckhqdq m4, m3, m3 ; m4 = high qword of m3 in both qwords: lookup table for the second byte of each pixel
+ add wq, r2
+DEFINE_ARGS dst, stride, stride3, idx, w, h
+ lea stride3q, [strideq*3] ; the register slot that held pal now holds stride*3
+ jmp wq
+.w4:
+ mova xm2, [idxq] ; 16 indices = 4 rows of 4 pixels
+ add idxq, 16
+ pshufb xm1, xm3, xm2 ; first byte of each pixel
+ pshufb xm2, xm4, xm2 ; second byte of each pixel
+ punpcklbw xm0, xm1, xm2 ; interleave into words: pixels 0-7 (rows 0-1)
+ punpckhbw xm1, xm2 ; pixels 8-15 (rows 2-3)
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4 ; four rows per iteration
+ jg .w4
+ RET
+.w8:
+ movu m2, [idxq] ; only 16-byte alignment
+ add idxq, 32 ; 32 indices = 4 rows of 8 pixels
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2 ; low lane = row 0, high lane = row 2
+ punpckhbw m1, m2 ; low lane = row 1, high lane = row 3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m0, 1
+ vextracti128 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4 ; four rows per iteration
+ jg .w8
+ RET
+.w16:
+ vpermq m2, [idxq+ 0], q3120 ; swap the middle qwords so the per-lane unpacks below produce whole rows in order
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64 ; 64 indices = 4 rows of 16 pixels
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2 ; row 0
+ punpckhbw m1, m2 ; row 1
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2 ; row 2
+ punpckhbw m1, m2 ; row 3
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4 ; four rows per iteration
+ jg .w16
+ RET
+.w32:
+ vpermq m2, [idxq+ 0], q3120 ; same qword swap as in .w16
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64 ; 64 indices = 2 rows of 32 pixels
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2 ; row 0, pixels 0-15
+ punpckhbw m1, m2 ; row 0, pixels 16-31
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+32], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2 ; row 1, pixels 0-15
+ punpckhbw m1, m2 ; row 1, pixels 16-31
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+32], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2 ; two rows per iteration
+ jg .w32
+ RET
+.w64:
+ vpermq m2, [idxq+ 0], q3120 ; same qword swap as in .w16
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64 ; 64 indices = 1 row of 64 pixels
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2 ; pixels 0-15
+ punpckhbw m1, m2 ; pixels 16-31
+ mova [dstq+ 0], m0
+ mova [dstq+32], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2 ; pixels 32-47
+ punpckhbw m1, m2 ; pixels 48-63
+ mova [dstq+64], m0
+ mova [dstq+96], m1
+ add dstq, strideq
+ dec hd ; one row per iteration
+ jg .w64
+ RET
+
+%endif