path: third_party/dav1d/src/x86/ipred16_sse.asm
commit 9e3c08db40b8916968b9f30096c7be3f00ce9647 (patch)
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:44:51 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:44:51 +0000
tree: a68f146d7fa01f0134297619fbe7e33db084e0aa
parent: Initial commit. (diff)
Adding upstream version 1:115.7.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/ipred16_sse.asm')
-rw-r--r--  third_party/dav1d/src/x86/ipred16_sse.asm | 1923
1 file changed, 1923 insertions(+), 0 deletions(-)
diff --git a/third_party/dav1d/src/x86/ipred16_sse.asm b/third_party/dav1d/src/x86/ipred16_sse.asm
new file mode 100644
index 0000000000..07ea9567e1
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_sse.asm
@@ -0,0 +1,1923 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_0_1: times 4 db 0, 1
+pb_2_3: times 4 db 2, 3
+pw_1: times 4 dw 1
+pw_2: times 4 dw 2
+pw_4: times 4 dw 4
+pw_512: times 4 dw 512
+pw_2048: times 4 dw 2048
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
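+
+; Builds a jump table of 32-bit offsets, stored relative to (table - 2*4).
+; Block dimensions are at least 4, so tzcnt of a size is at least 2 and can
+; index the biased base directly: movsxd the entry, then add the table
+; pointer back on to recover an absolute jump target.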
+
+%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
+%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4)
+%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)
+
+JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
+ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
+JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
+JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64
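+
+; The three %defines above alias into the ipred_dc/ipred_cfl tables rather
+; than defining separate tables: the dc table lays out its h*, w* and two
+; groups of s* entries back to back, and the s*-10*4/s*-15*4 biases make
+; those entries valid offsets from the aliased base pointers as well.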
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_XMM ssse3
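+; dc_top: dc = (sum(top[0..w-1]) + w/2) >> log2(w). m4 holds the w/2
+; rounding bias (pavgw against zero) and m5 the shift; the summation
+; (.h* labels) and the splat stores (.s* labels) are shared with
+; ipred_dc_left and ipred_dc.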
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+ LEA r5, ipred_dc_left_16bpc_ssse3_table
+ movd m4, wm
+ tzcnt wd, wm
+ add tlq, 2
+ movifnidn hd, hm
+ pxor m3, m3
+ pavgw m4, m3
+ movd m5, wd
+ movu m0, [tlq]
+ movsxd r6, [r5+wq*4]
+ add r6, r5
+ add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
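+; dc_left: as dc_top but over the h left neighbors, which precede tl in
+; memory (hence tlq is stepped back by h*2 bytes).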
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_left_16bpc_ssse3_table
+ mov hd, hm
+ movd m4, hm
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ pxor m3, m3
+ sub tlq, hq
+ pavgw m4, m3
+ movd m5, r6d
+ movu m0, [tlq]
+ movsxd r6, [r5+r6*4]
+ add r6, r5
+ add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m2, [tlq+112]
+ movu m1, [tlq+ 96]
+ paddw m0, m2
+ movu m2, [tlq+ 80]
+ paddw m1, m2
+ movu m2, [tlq+ 64]
+ paddw m0, m2
+ paddw m0, m1
+.h32:
+ movu m1, [tlq+ 48]
+ movu m2, [tlq+ 32]
+ paddw m1, m2
+ paddw m0, m1
+.h16:
+ movu m1, [tlq+ 16]
+ paddw m0, m1
+.h8:
+ movhlps m1, m0
+ paddw m0, m1
+.h4:
+ punpcklwd m0, m3
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ lea stride3q, [strideq*3]
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
+cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd m4, r5d
+ tzcnt r5d, r5d
+ movd m5, r5d
+ LEA r5, ipred_dc_16bpc_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pxor m3, m3
+ psrlw m4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movq m0, [tlq-8]
+ jmp wq
+.w4:
+ movq m1, [tlq+2]
+ paddw m1, m0
+ punpckhwd m0, m3
+ punpcklwd m1, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3
+ jmp .w4_end
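+; w != h: dc = (sum + (w+h)/2) / (w+h). Divide by the power-of-two factor
+; with a shift, then by the remaining 3 or 5 via fixed-point reciprocals:
+; 0xAAAB = round(2^17/3) and 0x6667 = round(2^17/5), so pmulhuw (>> 16)
+; followed by psrlw 1 divides exactly for the sums that can occur here.
+; The other .w*_mul blocks below use the same constants.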
+.w4_mul:
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 16
+ cmove r2d, r3d
+ psrld m0, 2
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w4_end:
+ pshuflw m0, m0, q0000
+.s4:
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+.h8:
+ mova m0, [tlq-16]
+ jmp wq
+.w8:
+ movu m1, [tlq+2]
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 32
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+.h16:
+ mova m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w16:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ paddw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ test hd, 8|32
+ cmovz r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16c:
+ mova m1, m0
+.s16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*2+16*0], m0
+ mova [dstq+strideq*2+16*1], m1
+ mova [dstq+stride3q +16*0], m0
+ mova [dstq+stride3q +16*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-48]
+ paddw m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w32:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ movu m2, [tlq+34]
+ paddw m0, m2
+ movu m2, [tlq+50]
+ paddw m1, m2
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 8
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32c:
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s32:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ mova [dstq+strideq*0+16*2], m2
+ mova [dstq+strideq*0+16*3], m3
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s32
+ RET
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq-112]
+ paddw m0, [tlq- 96]
+ paddw m1, [tlq- 80]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 48]
+ paddw m0, [tlq- 32]
+ paddw m1, [tlq- 16]
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+ 18]
+ paddw m1, m2
+ movu m2, [tlq+ 34]
+ paddw m0, m2
+ movu m2, [tlq+ 50]
+ paddw m1, m2
+ movu m2, [tlq+ 66]
+ paddw m0, m2
+ movu m2, [tlq+ 82]
+ paddw m1, m2
+ movu m2, [tlq+ 98]
+ paddw m0, m2
+ movu m2, [tlq+114]
+ paddw m1, m2
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 64
+ je .w64_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 16
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w64_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m0
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m0
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m0
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m0
+ add dstq, strideq
+ dec hd
+ jg .s64
+ RET
+
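+; dc_128: fill with the bitdepth midpoint. bitdepth_max>>11 is 0 for 10-bit
+; (1023) and 1 for 12-bit (4095), selecting pw_512 or pw_2048, i.e.
+; (bitdepth_max+1)/2.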
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ mov r6d, r8m
+ LEA r5, ipred_dc_128_16bpc_ssse3_table
+ tzcnt wd, wm
+ shr r6d, 11
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_16bpc_ssse3_table
+ movifnidn hd, hm
+ movu m0, [tlq+ 2]
+ movu m1, [tlq+ 18]
+ movu m2, [tlq+ 34]
+ movu m3, [tlq+ 50]
+ cmp wd, 64
+ je .w64
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m4, [tlq+ 66]
+ movu m5, [tlq+ 82]
+ movu m6, [tlq+ 98]
+ movu m7, [tlq+114]
+.w64_loop:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ mova [dstq+16*4], m4
+ mova [dstq+16*5], m5
+ mova [dstq+16*6], m6
+ mova [dstq+16*7], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
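+; ipred_h: each output row is one left-neighbor pixel broadcast across the
+; row. The left neighbors precede tl in memory with the topmost one nearest
+; tl, so tlq steps downward; pb_0_1/pb_2_3 are pshufb masks that broadcast
+; word 0 or word 1 of a two-pixel load.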
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+%define base r5-ipred_h_16bpc_ssse3_table
+ tzcnt wd, wm
+ LEA r5, ipred_h_16bpc_ssse3_table
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m2, [base+pb_0_1]
+ movddup m3, [base+pb_2_3]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ sub tlq, 8
+ movq m3, [tlq]
+ pshuflw m0, m3, q3333
+ pshuflw m1, m3, q2222
+ pshuflw m2, m3, q1111
+ pshuflw m3, m3, q0000
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m1
+ movq [dstq+strideq*2], m2
+ movq [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ sub tlq, 8
+ movq m3, [tlq]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ sub tlq, 4
+ movd m1, [tlq]
+ pshufb m0, m1, m3
+ pshufb m1, m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m0
+ mova [dstq+strideq*1+16*0], m1
+ mova [dstq+strideq*1+16*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ sub tlq, 4
+ movd m1, [tlq]
+ pshufb m0, m1, m3
+ pshufb m1, m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m0
+ mova [dstq+strideq*0+16*2], m0
+ mova [dstq+strideq*0+16*3], m0
+ mova [dstq+strideq*1+16*0], m1
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*1+16*2], m1
+ mova [dstq+strideq*1+16*3], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ sub tlq, 2
+ movd m0, [tlq]
+ pshufb m0, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m0
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m0
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m0
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m0
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
+%define base r5-ipred_paeth_16bpc_ssse3_table
+ movifnidn hd, hm
+ pshuflw m4, [tlq], q0000
+ mov leftq, tlq
+ add hd, hd
+ punpcklqdq m4, m4 ; topleft
+ sub leftq, hq
+ and wd, ~7
+ jnz .w8
+ movddup m5, [tlq+2] ; top
+ psubw m6, m5, m4
+ pabsw m7, m6
+.w4_loop:
+ movd m1, [leftq+hq-4]
+ punpcklwd m1, m1
+ punpckldq m1, m1 ; left
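+; Paeth: base = left + top - topleft; the prediction is whichever of
+; left/top/topleft is nearest to base, ties broken in that order.
+; m6 = top - topleft and m7 = |base - left| are loop-invariant, so they
+; are computed once per column outside the macro.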
+%macro PAETH 0
+ paddw m0, m6, m1
+ psubw m2, m4, m0 ; tldiff
+ psubw m0, m5 ; tdiff
+ pabsw m2, m2
+ pabsw m0, m0
+ pminsw m2, m0
+ pcmpeqw m0, m2
+ pand m3, m5, m0
+ pandn m0, m4
+ por m0, m3
+ pcmpgtw m3, m7, m2
+ pand m0, m3
+ pandn m3, m1
+ por m0, m3
+%endmacro
+ PAETH
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2*2
+ jg .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ PUSH r6
+ %define r7d hm
+ %assign regs_used 7
+%elif WIN64
+ movaps r4m, m8
+ PUSH r7
+ %assign regs_used 8
+%endif
+%if ARCH_X86_64
+ movddup m8, [pb_0_1]
+%endif
+ lea tlq, [tlq+wq*2+2]
+ neg wq
+ mov r7d, hd
+.w8_loop0:
+ movu m5, [tlq+wq*2]
+ mov r6, dstq
+ add dstq, 16
+ psubw m6, m5, m4
+ pabsw m7, m6
+.w8_loop:
+ movd m1, [leftq+hq-2]
+%if ARCH_X86_64
+ pshufb m1, m8
+%else
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+%endif
+ PAETH
+ mova [r6], m0
+ add r6, strideq
+ sub hd, 1*2
+ jg .w8_loop
+ mov hd, r7d
+ add wq, 8
+ jl .w8_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
+ RET
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 4
+%endif
+
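+; smooth_v: pred = bottom + (((top - bottom) * w[y] + 128) >> 8), a
+; vertical blend between the top row and the bottom-left neighbor. The 1d
+; weight table stores the weights pre-shifted (w << 7) so a single
+; pmulhrsw performs the multiply and round.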
+cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
+ LEA weightsq, smooth_weights_1d_16bpc
+ mov hd, hm
+ lea weightsq, [weightsq+hq*4]
+ neg hq
+ movd m5, [tlq+hq*2] ; bottom
+ pshuflw m5, m5, q0000
+ punpcklqdq m5, m5
+ cmp wd, 4
+ jne .w8
+ movddup m4, [tlq+2] ; top
+ lea r3, [strideq*3]
+ psubw m4, m5 ; top - bottom
+.w4_loop:
+ movq m1, [weightsq+hq*2]
+ punpcklwd m1, m1
+ pshufd m0, m1, q1100
+ punpckhdq m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ PUSH r6
+ %assign regs_used 7
+ mov hm, hq
+ %define hq hm
+%elif WIN64
+ PUSH r7
+ %assign regs_used 8
+%endif
+.w8_loop0:
+ mov t0, hq
+ movu m4, [tlq+2]
+ add tlq, 16
+ mov r6, dstq
+ add dstq, 16
+ psubw m4, m5
+.w8_loop:
+ movq m3, [weightsq+t0*2]
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [r6+strideq*0], m0
+ mova [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+ mova [r6+strideq*0], m2
+ mova [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+ add t0, 4
+ jl .w8_loop
+ sub wd, 8
+ jg .w8_loop0
+ RET
+
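+; smooth_h: horizontal counterpart of smooth_v:
+; pred = right + (((left - right) * w[x] + 128) >> 8), blending each
+; left-edge pixel toward the top-right neighbor.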
+cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
+ LEA weightsq, smooth_weights_1d_16bpc
+ mov wd, wm
+ movifnidn hd, hm
+ movd m5, [tlq+wq*2] ; right
+ sub tlq, 8
+ add hd, hd
+ pshuflw m5, m5, q0000
+ sub tlq, hq
+ punpcklqdq m5, m5
+ cmp wd, 4
+ jne .w8
+ movddup m4, [weightsq+4*2]
+ lea r3, [strideq*3]
+.w4_loop:
+ movq m1, [tlq+hq] ; left
+ punpcklwd m1, m1
+ psubw m1, m5 ; left - right
+ pshufd m0, m1, q3322
+ punpckldq m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movhps [dstq+strideq*2], m1
+ movq [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ lea weightsq, [weightsq+wq*4]
+ neg wq
+%if ARCH_X86_32
+ PUSH r6
+ %assign regs_used 7
+ %define hd hm
+%elif WIN64
+ PUSH r7
+ %assign regs_used 8
+%endif
+.w8_loop0:
+ mov t0d, hd
+ mova m4, [weightsq+wq*2]
+ mov r6, dstq
+ add dstq, 16
+.w8_loop:
+ movq m3, [tlq+t0*(1+ARCH_X86_32)]
+ punpcklwd m3, m3
+ psubw m3, m5
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [r6+strideq*0], m0
+ mova [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+ mova [r6+strideq*0], m2
+ mova [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+ sub t0d, 4*(1+ARCH_X86_64)
+ jg .w8_loop
+ add wq, 8
+ jl .w8_loop0
+ RET
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 10
+%else
+DECLARE_REG_TMP 3
+%endif
+
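+; smooth: 2d blend; pred = (v + h + 256) >> 9 with
+; v = top*w_v[y] + bottom*(256 - w_v[y]) and
+; h = left*w_h[x] + right*(256 - w_h[x]).
+; The 2d weight table stores (w, 256-w) pairs so each blend is a single
+; pmaddwd; psrld by 8 followed by pavgw against zero is equivalent to the
+; +256 and >> 9.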
+cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
+ h_weights, v_weights, top
+ LEA h_weightsq, smooth_weights_2d_16bpc
+ mov wd, wm
+ mov hd, hm
+ movd m7, [tlq+wq*2] ; right
+ lea v_weightsq, [h_weightsq+hq*8]
+ neg hq
+ movd m6, [tlq+hq*2] ; bottom
+ pshuflw m7, m7, q0000
+ pshuflw m6, m6, q0000
+ cmp wd, 4
+ jne .w8
+ movq m4, [tlq+2] ; top
+ mova m5, [h_weightsq+4*4]
+ punpcklwd m4, m6 ; top, bottom
+ pxor m6, m6
+.w4_loop:
+ movq m1, [v_weightsq+hq*4]
+ sub tlq, 4
+ movd m3, [tlq] ; left
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ pmaddwd m0, m4
+ punpcklwd m3, m7 ; left, right
+ pmaddwd m1, m4
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m6
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ lea h_weightsq, [h_weightsq+wq*4]
+ mov t0, tlq
+ mov r1m, tlq
+ mov r2m, hq
+ %define m8 [h_weightsq+16*0]
+ %define m9 [h_weightsq+16*1]
+%else
+%if WIN64
+ movaps r4m, m8
+ movaps r6m, m9
+ PUSH r7
+ PUSH r8
+%endif
+ PUSH r9
+ PUSH r10
+ %assign regs_used 11
+ lea h_weightsq, [h_weightsq+wq*8]
+ lea topq, [tlq+wq*2]
+ neg wq
+ mov r8, tlq
+ mov r9, hq
+%endif
+ punpcklqdq m6, m6
+.w8_loop0:
+%if ARCH_X86_32
+ movu m5, [t0+2]
+ add t0, 16
+ mov r0m, t0
+%else
+ movu m5, [topq+wq*2+2]
+ mova m8, [h_weightsq+wq*4+16*0]
+ mova m9, [h_weightsq+wq*4+16*1]
+%endif
+ mov t0, dstq
+ add dstq, 16
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+.w8_loop:
+ movd m1, [v_weightsq+hq*4]
+ sub tlq, 2
+ movd m3, [tlq] ; left
+ pshufd m1, m1, q0000
+ pmaddwd m0, m4, m1
+ pshuflw m3, m3, q0000
+ pmaddwd m1, m5
+ punpcklwd m3, m7 ; left, right
+ pmaddwd m2, m8, m3
+ pmaddwd m3, m9
+ paddd m0, m2
+ paddd m1, m3
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pxor m1, m1
+ pavgw m0, m1
+ mova [t0], m0
+ add t0, strideq
+ inc hq
+ jl .w8_loop
+%if ARCH_X86_32
+ mov t0, r0m
+ mov tlq, r1m
+ add h_weightsq, 16*2
+ mov hq, r2m
+ sub dword wm, 8
+ jg .w8_loop0
+%else
+ mov tlq, r8
+ mov hq, r9
+ add wq, 8
+ jl .w8_loop0
+%endif
+%if WIN64
+ movaps m8, r4m
+ movaps m9, r6m
+%endif
+ RET
+
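+; filter intra: output is built from 4x2 sub-blocks, each predicted from
+; seven neighbors (topleft, four top, two left) using per-mode taps from
+; filter_intra_taps. A sub-block's output becomes the neighbors of the
+; sub-blocks below and to the right of it, which is why the right-column
+; loops read back from dstq. The taps are widened to tap << 8 (see the
+; punpcklbw comment below), so psrad by 11 plus pavgw implements the
+; spec's (x + 8) >> 4.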
+%if ARCH_X86_64
+cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
+%else
+cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
+%define m8 [esp+16*0]
+%define m9 [esp+16*1]
+%define m10 [esp+16*2]
+%define m11 [esp+16*3]
+%define m12 [esp+16*4]
+%define m13 [esp+16*5]
+%define m14 [esp+16*6]
+%define m15 [esp+16*7]
+%endif
+%define base r6-$$
+ movifnidn hd, hm
+ movd m6, r8m ; bitdepth_max
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ LEA r6, $$
+ shl filterd, 6
+ movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
+ mova m1, [base+filter_intra_taps+filterq+16*0]
+ mova m2, [base+filter_intra_taps+filterq+16*1]
+ mova m3, [base+filter_intra_taps+filterq+16*2]
+ mova m4, [base+filter_intra_taps+filterq+16*3]
+ pxor m5, m5
+%if ARCH_X86_64
+ punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper
+ punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid
+ punpcklbw m10, m5, m2 ; having to perform sign-extension.
+ punpckhbw m11, m5, m2
+ punpcklbw m12, m5, m3
+ punpckhbw m13, m5, m3
+ punpcklbw m14, m5, m4
+ punpckhbw m15, m5, m4
+%else
+ punpcklbw m7, m5, m1
+ mova m8, m7
+ punpckhbw m7, m5, m1
+ mova m9, m7
+ punpcklbw m7, m5, m2
+ mova m10, m7
+ punpckhbw m7, m5, m2
+ mova m11, m7
+ punpcklbw m7, m5, m3
+ mova m12, m7
+ punpckhbw m7, m5, m3
+ mova m13, m7
+ punpcklbw m7, m5, m4
+ mova m14, m7
+ punpckhbw m7, m5, m4
+ mova m15, m7
+%endif
+ mova m7, [base+filter_shuf]
+ add hd, hd
+ mov r5, dstq
+ pshuflw m6, m6, q0000
+ mov r6, tlq
+ punpcklqdq m6, m6
+ sub tlq, hq
+.left_loop:
+ pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __
+ pshufd m1, m0, q0000
+ pmaddwd m2, m8, m1
+ pmaddwd m1, m9
+ pshufd m4, m0, q1111
+ pmaddwd m3, m10, m4
+ pmaddwd m4, m11
+ paddd m2, m3
+ paddd m1, m4
+ pshufd m4, m0, q2222
+ pmaddwd m3, m12, m4
+ pmaddwd m4, m13
+ paddd m2, m3
+ paddd m1, m4
+ pshufd m3, m0, q3333
+ pmaddwd m0, m14, m3
+ pmaddwd m3, m15
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 11 ; x >> 3
+ psrad m1, 11
+ packssdw m0, m1
+ pmaxsw m0, m5
+ pavgw m0, m5 ; (x + 8) >> 4
+ pminsw m0, m6
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movlps m0, [tlq+hq-10]
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2*2
+ jg .left_loop
+ sub wd, 4
+ jz .end
+ sub tld, r6d ; -h*2
+ sub r6, r5 ; tl-dst
+.right_loop0:
+ add r5, 8
+ mov hd, tld
+ movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
+ mov dstq, r5
+.right_loop:
+ pshufd m2, m0, q0000
+ pmaddwd m1, m8, m2
+ pmaddwd m2, m9
+ pshufd m4, m0, q1111
+ pmaddwd m3, m10, m4
+ pmaddwd m4, m11
+ pinsrw m0, [dstq+strideq*0-2], 5
+ paddd m1, m3
+ paddd m2, m4
+ pshufd m0, m0, q2222
+ movddup m4, [dstq+strideq*1-8]
+ pmaddwd m3, m12, m0
+ pmaddwd m0, m13
+ paddd m1, m3
+ paddd m0, m2
+ pshuflw m2, m4, q3333
+ punpcklwd m2, m5
+ pmaddwd m3, m14, m2
+ pmaddwd m2, m15
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 11
+ psrad m0, 11
+ packssdw m0, m1
+ pmaxsw m0, m5
+ pavgw m0, m5
+ pminsw m0, m6
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ palignr m0, m4, 14
+ lea dstq, [dstq+strideq*2]
+ add hd, 2*2
+ jl .right_loop
+ sub wd, 4
+ jg .right_loop0
+.end:
+ RET
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
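+; cfl_top/cfl_left: derive the dc value from a single edge, then fall
+; through to the shared cfl splat code. m3 is all-ones words, so
+; pmaddwd m0, m3 produces negated pair sums and psubd m4, m0 adds the
+; edge sum onto the rounding bias in m4.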
+cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
+ LEA t0, ipred_cfl_left_16bpc_ssse3_table
+ movd m4, wd
+ tzcnt wd, wd
+ movifnidn hd, hm
+ add tlq, 2
+ movsxd r6, [t0+wq*4]
+ movd m5, wd
+ jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
+
+cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ LEA t0, ipred_cfl_left_16bpc_ssse3_table
+ tzcnt wd, wm
+ lea r6d, [hq*2]
+ movd m4, hd
+ sub tlq, r6
+ tzcnt r6d, hd
+ movd m5, r6d
+ movsxd r6, [t0+r6*4]
+.start:
+ movd m7, r7m
+ movu m0, [tlq]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
+ movsxd wq, [t0+wq*4]
+ pxor m6, m6
+ pshuflw m7, m7, q0000
+ pcmpeqw m3, m3
+ add wq, t0
+ movifnidn acq, acmp
+ pavgw m4, m6
+ punpcklqdq m7, m7
+ jmp r6
+.h32:
+ movu m1, [tlq+48]
+ movu m2, [tlq+32]
+ paddw m0, m1
+ paddw m0, m2
+.h16:
+ movu m1, [tlq+16]
+ paddw m0, m1
+.h8:
+ pshufd m1, m0, q1032
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshuflw m0, m4, q1032
+ paddd m0, m4
+ psrld m0, m5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
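+; cfl: pixel = clip(dc + apply_sign(alpha * ac)), with the magnitude
+; rounded as (|alpha * ac| + 32) >> 6. m1 = alpha and m2 = |alpha| << 9 so
+; pmulhrsw yields the rounded magnitude; the psignw pair reapplies the
+; combined sign of ac and alpha. m0 = dc splat, m6/m7 = the clip bounds
+; (0, bitdepth_max).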
+%macro IPRED_CFL 2 ; dst, src
+ pabsw m%1, m%2
+ pmulhrsw m%1, m2
+ psignw m%2, m1
+ psignw m%1, m%2
+ paddw m%1, m0
+ pmaxsw m%1, m6
+ pminsw m%1, m7
+%endmacro
+
+cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd m4, t0d
+ tzcnt t0d, t0d
+ movd m5, t0d
+ LEA t0, ipred_cfl_16bpc_ssse3_table
+ tzcnt wd, wd
+ movd m7, r7m
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ psrlw m4, 1
+ pxor m6, m6
+ pshuflw m7, m7, q0000
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ pcmpeqw m3, m3
+ punpcklqdq m7, m7
+ jmp r6
+.h4:
+ movq m0, [tlq-8]
+ jmp wq
+.w4:
+ movq m1, [tlq+2]
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ cmp hd, 4
+ jg .w4_mul
+ psrld m0, 3
+ jmp .w4_end
+.w4_mul:
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 16
+ cmove r6d, r2d
+ movd m1, r6d
+ psrld m0, 2
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w4_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s4:
+ movd m1, alpham
+ lea r6, [strideq*3]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ movq [dstq+strideq*0], m3
+ movhps [dstq+strideq*1], m3
+ movq [dstq+strideq*2], m4
+ movhps [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4_loop
+ RET
+.h8:
+ mova m0, [tlq-16]
+ jmp wq
+.w8:
+ movu m1, [tlq+2]
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmove r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+strideq*0], m3
+ mova [dstq+strideq*1], m4
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s8_loop
+ RET
+.h16:
+ mova m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w16:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*0], m3
+ mova [dstq+16*1], m4
+ add dstq, strideq
+ dec hd
+ jg .s16_loop
+ RET
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-48]
+ paddw m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w32:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ movu m2, [tlq+34]
+ paddw m1, m2
+ movu m2, [tlq+50]
+ paddw m1, m2
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 8
+ cmove r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*0], m3
+ mova [dstq+16*1], m4
+ mova m4, [acq+16*2]
+ mova m5, [acq+16*3]
+ add acq, 16*4
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, strideq
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
+ tzcnt wd, wm
+ LEA t0, ipred_cfl_splat_16bpc_ssse3_table
+ mov r6d, r7m
+ movifnidn hd, hm
+ shr r6d, 11
+ movd m7, r7m
+ movsxd wq, [t0+wq*4]
+ movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
+ pshuflw m7, m7, q0000
+ pxor m6, m6
+ add wq, t0
+ movifnidn acq, acmp
+ punpcklqdq m7, m7
+ jmp wq
+
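+; cfl_ac_420: each ac sample is a 2x2 luma average scaled by 8: pmaddwd
+; against pw_2 gives (y0 + y1)*2 per row, and adding two rows gives the
+; 2x2 sum * 2 = avg << 3. m4 accumulates the running total; .dc computes
+; the rounded mean (psrld by log2(w*h)-1, then pavgw) and subtracts it
+; from the buffer so the stored ac block is zero-mean. wpad/hpad replicate
+; the last valid column/row into the padded area.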
+cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+%if ARCH_X86_32 && PIC
+ pcmpeqw m5, m5
+ pabsw m5, m5
+ paddw m5, m5
+%else
+ movddup m5, [pw_2]
+%endif
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ mov r5, acq
+ jg .w16
+ je .w8
+ lea r3, [strideq*3]
+.w4_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ pmaddwd m2, m5, [ypxq+strideq*2]
+ pmaddwd m3, m5, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ paddd m0, m1
+ paddd m2, m3
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .dc
+ punpckhqdq m0, m0
+ pslld m2, 2
+.w4_hpad:
+ mova [acq+16*0], m0
+ paddd m4, m2
+ mova [acq+16*1], m0
+ add acq, 16*2
+ sub hpadd, 4
+ jg .w4_hpad
+ jmp .dc
+.w8:
+%if ARCH_X86_32
+ cmp dword wpadm, 0
+%else
+ test wpadd, wpadd
+%endif
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m2, m5, [ypxq+strideq*1+16*0]
+ pmaddwd m1, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m3, m5, [ypxq+strideq*1+16*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m2
+ paddd m1, m3
+ paddd m2, m0, m1
+ packssdw m0, m1
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ dec hd
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz .dc
+ pslld m2, 2
+ mova m1, m0
+ jmp .hpad
+.w8_wpad1:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ pshufd m1, m0, q3333
+ paddd m2, m0, m1
+ packssdw m0, m1
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ dec hd
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad3:
+ pshufd m3, m0, q3333
+ mova m1, m3
+ mova m2, m3
+ jmp .w16_wpad_end
+.w16_wpad2:
+ pshufd m1, m3, q3333
+ mova m2, m1
+ jmp .w16_wpad_end
+.w16_wpad1:
+ pshufd m2, m1, q3333
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ WIN64_SPILL_XMM 7
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*0]
+ paddd m0, m6
+ cmp wpadd, 2
+ jg .w16_wpad3
+ pmaddwd m3, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*1]
+ paddd m3, m6
+ je .w16_wpad2
+ pmaddwd m1, m5, [ypxq+strideq*0+16*2]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*2]
+ paddd m1, m6
+ jp .w16_wpad1
+ pmaddwd m2, m5, [ypxq+strideq*0+16*3]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*3]
+ paddd m2, m6
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ paddd m6, m0, m3
+ packssdw m0, m3
+ paddd m6, m1
+ mova [acq+16*0], m0
+ packssdw m1, m2
+ paddd m2, m6
+ mova [acq+16*1], m1
+ add acq, 16*2
+ paddd m4, m2
+ dec hd
+ jg .w16_loop
+ WIN64_RESTORE_XMM
+ add hpadd, hpadd
+ jz .dc
+ paddd m2, m2
+.hpad:
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ paddd m4, m2
+ mova [acq+16*2], m0
+ mova [acq+16*3], m1
+ add acq, 16*4
+ sub hpadd, 4
+ jg .hpad
+.dc:
+ sub r5, acq ; -w*h*2
+ pshufd m2, m4, q1032
+ tzcnt r1d, r5d
+ paddd m2, m4
+ sub r1d, 2
+ pshufd m4, m2, q2301
+ movd m0, r1d
+ paddd m2, m4
+ psrld m2, m0
+ pxor m0, m0
+ pavgw m2, m0
+ packssdw m2, m2
+.dc_loop:
+ mova m0, [acq+r5+16*0]
+ mova m1, [acq+r5+16*1]
+ psubw m0, m2
+ psubw m1, m2
+ mova [acq+r5+16*0], m0
+ mova [acq+r5+16*1], m1
+ add r5, 16*2
+ jl .dc_loop
+ RET
+
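+; cfl_ac_422: only horizontally subsampled; pmaddwd against pw_4 gives
+; (y0 + y1)*4 = avg << 3 per pair, matching the 420 scaling. The dc and
+; hpad tails are shared with the 420 version.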
+cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+%if ARCH_X86_32 && PIC
+ pcmpeqw m5, m5
+ pabsw m5, m5
+ psllw m5, 2
+%else
+ movddup m5, [pw_4]
+%endif
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ mov r5, acq
+ jg .w16
+ je .w8
+ lea r3, [strideq*3]
+.w4_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m3, m5, [ypxq+strideq*1]
+ pmaddwd m1, m5, [ypxq+strideq*2]
+ pmaddwd m2, m5, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ paddd m4, m0
+ packssdw m0, m3
+ paddd m3, m1
+ packssdw m1, m2
+ paddd m4, m2
+ paddd m4, m3
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ punpckhqdq m1, m1
+ pslld m2, 3
+ mova [acq+16*0], m1
+ mova [acq+16*1], m1
+ paddd m4, m2
+ mova [acq+16*2], m1
+ mova [acq+16*3], m1
+ add acq, 16*4
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w8:
+%if ARCH_X86_32
+ cmp dword wpadm, 0
+%else
+ test wpadd, wpadd
+%endif
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m2, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m1, m5, [ypxq+strideq*1+16*0]
+ pmaddwd m3, m5, [ypxq+strideq*1+16*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ mova [acq+16*0], m0
+ paddd m2, m1, m3
+ packssdw m1, m3
+ paddd m4, m2
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ pslld m2, 2
+ mova m0, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w8_wpad1:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ pshufd m2, m0, q3333
+ pshufd m3, m1, q3333
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ paddd m2, m1, m3
+ packssdw m1, m3
+ paddd m4, m2
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 2
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad3:
+ pshufd m3, m0, q3333
+ mova m1, m3
+ mova m2, m3
+ jmp .w16_wpad_end
+.w16_wpad2:
+ pshufd m1, m3, q3333
+ mova m2, m1
+ jmp .w16_wpad_end
+.w16_wpad1:
+ pshufd m2, m1, q3333
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ WIN64_SPILL_XMM 7
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+16*0]
+ cmp wpadd, 2
+ jg .w16_wpad3
+ pmaddwd m3, m5, [ypxq+16*1]
+ je .w16_wpad2
+ pmaddwd m1, m5, [ypxq+16*2]
+ jp .w16_wpad1
+ pmaddwd m2, m5, [ypxq+16*3]
+.w16_wpad_end:
+ add ypxq, strideq
+ paddd m6, m0, m3
+ packssdw m0, m3
+ mova [acq+16*0], m0
+ paddd m6, m1
+ packssdw m1, m2
+ paddd m2, m6
+ mova [acq+16*1], m1
+ add acq, 16*2
+ paddd m4, m2
+ dec hd
+ jg .w16_loop
+ WIN64_RESTORE_XMM
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ paddd m2, m2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+
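+; cfl_ac_444: no subsampling; ac samples are y << 3, and pmaddwd against
+; pw_1 produces the pair sums accumulated in m4.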
+cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
+ LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table
+ tzcnt wd, wm
+ movifnidn hpadd, hpadm
+ pxor m4, m4
+ movsxd wq, [r6+wq*4]
+ movddup m5, [base+pw_1]
+ add wq, r6
+ mov hd, hm
+ shl hpadd, 2
+ sub hd, hpadd
+ jmp wq
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ movq m0, [ypxq+strideq*0]
+ movhps m0, [ypxq+strideq*1]
+ movq m1, [ypxq+strideq*2]
+ movhps m1, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ psllw m0, 3
+ psllw m1, 3
+ mova [acq+16*0], m0
+ pmaddwd m0, m5
+ mova [acq+16*1], m1
+ pmaddwd m2, m5, m1
+ add acq, 16*2
+ paddd m4, m0
+ paddd m4, m2
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ punpckhqdq m1, m1
+ mova [acq+16*0], m1
+ pslld m2, 2
+ mova [acq+16*1], m1
+ punpckhqdq m2, m2
+ mova [acq+16*2], m1
+ paddd m4, m2
+ mova [acq+16*3], m1
+ add acq, 16*4
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w8:
+ mov r5, acq
+.w8_loop:
+ mova m0, [ypxq+strideq*0]
+ mova m1, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ psllw m0, 3
+ psllw m1, 3
+ mova [acq+16*0], m0
+ pmaddwd m0, m5
+ mova [acq+16*1], m1
+ pmaddwd m2, m5, m1
+ add acq, 16*2
+ paddd m4, m0
+ paddd m4, m2
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ pslld m2, 2
+ mova m0, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w16_wpad2:
+ pshufhw m3, m2, q3333
+ pshufhw m1, m0, q3333
+ punpckhqdq m3, m3
+ punpckhqdq m1, m1
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ mov r5, acq
+.w16_loop:
+ mova m2, [ypxq+strideq*0+16*0]
+ mova m0, [ypxq+strideq*1+16*0]
+ psllw m2, 3
+ psllw m0, 3
+ test wpadd, wpadd
+ jnz .w16_wpad2
+ mova m3, [ypxq+strideq*0+16*1]
+ mova m1, [ypxq+strideq*1+16*1]
+ psllw m3, 3
+ psllw m1, 3
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ mova [acq+16*0], m2
+ pmaddwd m2, m5
+ mova [acq+16*1], m3
+ pmaddwd m3, m5
+ paddd m4, m2
+ pmaddwd m2, m5, m0
+ mova [acq+16*2], m0
+ paddd m4, m3
+ pmaddwd m3, m5, m1
+ mova [acq+16*3], m1
+ add acq, 16*4
+ paddd m2, m3
+ paddd m4, m2
+ sub hd, 2
+ jg .w16_loop
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ paddd m2, m2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w32_wpad6:
+ pshufhw m1, m0, q3333
+ punpckhqdq m1, m1
+ mova m2, m1
+ mova m3, m1
+ jmp .w32_wpad_end
+.w32_wpad4:
+ pshufhw m2, m1, q3333
+ punpckhqdq m2, m2
+ mova m3, m2
+ jmp .w32_wpad_end
+.w32_wpad2:
+ pshufhw m3, m2, q3333
+ punpckhqdq m3, m3
+ jmp .w32_wpad_end
+.w32:
+ movifnidn wpadd, wpadm
+ mov r5, acq
+ WIN64_SPILL_XMM 8
+.w32_loop:
+ mova m0, [ypxq+16*0]
+ psllw m0, 3
+ cmp wpadd, 4
+ jg .w32_wpad6
+ mova m1, [ypxq+16*1]
+ psllw m1, 3
+ je .w32_wpad4
+ mova m2, [ypxq+16*2]
+ psllw m2, 3
+ jnp .w32_wpad2
+ mova m3, [ypxq+16*3]
+ psllw m3, 3
+.w32_wpad_end:
+ add ypxq, strideq
+ pmaddwd m6, m5, m0
+ mova [acq+16*0], m0
+ pmaddwd m7, m5, m1
+ mova [acq+16*1], m1
+ paddd m6, m7
+ pmaddwd m7, m5, m2
+ mova [acq+16*2], m2
+ paddd m6, m7
+ pmaddwd m7, m5, m3
+ mova [acq+16*3], m3
+ add acq, 16*4
+ paddd m6, m7
+ paddd m4, m6
+ dec hd
+ jg .w32_loop
+%if WIN64
+ mova m5, m6
+ WIN64_RESTORE_XMM
+ SWAP 5, 6
+%endif
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w32_hpad_loop:
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ paddd m4, m6
+ mova [acq+16*2], m2
+ mova [acq+16*3], m3
+ add acq, 16*4
+ dec hpadd
+ jg .w32_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+
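+; pal_pred: pal_pred_shuf packs the palette's low bytes into one qword of
+; m3 and the high bytes into the other; m4 is the qword-swapped copy. A
+; pshufb of each with the index bytes picks the low and high byte of every
+; pixel, and punpcklbw/punpckhbw reassemble 16-bit pixels, 16 indices per
+; load.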
+cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
+%define base r2-pal_pred_16bpc_ssse3_table
+%if ARCH_X86_32
+ %define hd r2d
+%endif
+ mova m3, [palq]
+ LEA r2, pal_pred_16bpc_ssse3_table
+ tzcnt wd, wm
+ pshufb m3, [base+pal_pred_shuf]
+ movsxd wq, [r2+wq*4]
+ pshufd m4, m3, q1032
+ add wq, r2
+ movifnidn hd, hm
+ jmp wq
+.w4:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16
+ RET
+.w32:
+ mova m0, [idxq+16*0]
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova m2, [idxq+16*1]
+ add idxq, 16*2
+ mova [dstq+16*0], m0
+ pshufb m0, m3, m2
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m2
+ punpcklbw m2, m0, m1
+ punpckhbw m0, m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m0
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+.w64:
+ mova m0, [idxq+16*0]
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova m2, [idxq+16*1]
+ mova [dstq+16*0], m0
+ pshufb m0, m3, m2
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m2
+ punpcklbw m2, m0, m1
+ punpckhbw m0, m1
+ mova m1, [idxq+16*2]
+ mova [dstq+16*2], m2
+ pshufb m2, m3, m1
+ mova [dstq+16*3], m0
+ pshufb m0, m4, m1
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ mova m0, [idxq+16*3]
+ add idxq, 16*4
+ mova [dstq+16*4], m1
+ pshufb m1, m3, m0
+ mova [dstq+16*5], m2
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET