summary refs log tree commit diff stats
path: root/third_party/dav1d/src/x86/ipred_sse.asm
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/dav1d/src/x86/ipred_sse.asm')
-rw-r--r--  third_party/dav1d/src/x86/ipred_sse.asm  5409
1 files changed, 5409 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm
new file mode 100644
index 0000000000..67e90b79ae
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_sse.asm
@@ -0,0 +1,5409 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+; Emit one byte pair (w-128, 127-w) per argument. The offsets keep both
+; values in signed-byte range so the pair can be fed to pmaddubsw, which
+; treats its second operand as signed bytes.
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x with offsets to
+; enable efficient use of pmaddubsw (which requires signed values)
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+; pshufb control vectors and per-mode lookup data for the intra predictors
+; below. Byte values are shuffle indices unless the label prefix says
+; otherwise (pw_* = packed words, pb_* = packed bytes, pd_* = packed dwords).
+ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
+ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
+z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7
+z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
+filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
+filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
+; NOTE(review): trailing comma below lets z_filter_wh4 fall through into the
+; z_filter_wh8 bytes — presumably intentional (one contiguous table); confirm
+; against the assembler's handling before reformatting.
+z_filter_wh4: db 7, 7, 19, 7,
+z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
+pd_32768: dd 32768
+z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8
+z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64
+z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
+z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15
+ db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3
+z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 9, 9, 10, 10, 11
+z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
+z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
+z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
+z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64
+pw_m1to4: dw -1, -2, -3, -4
+z_filter_k: times 4 db 0, 16
+ times 4 db 0, 20
+ times 4 db 8, 16
+ times 4 db 32, 16
+ times 4 db 24, 20
+ times 4 db 16, 16
+ times 4 db 0, 0
+ times 4 db 0, 0
+; pw_8 is built from byte pairs (8, 0) = little-endian words of value 8
+pw_8: times 8 db 8, 0
+pb_3: times 16 db 3
+pb_16: times 16 db 16
+pw_62: times 8 dw 62
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_512: times 8 dw 512
+pw_m256: times 8 dw -256
+pb_2: times 8 db 2
+pb_4: times 8 db 4
+pb_8: times 8 db 8
+pb_128: times 8 db 128
+pb_m16: times 8 db -16
+pw_128: times 4 dw 128
+pw_255: times 4 dw 255
+pb_36_m4: times 4 db 36, -4
+pb_127_m127: times 4 db 127, -127
+
+; Build a jump table of 32-bit offsets relative to (table - 2*4) for the
+; listed labels of function %1 (suffix %2). The -2*4 bias leaves room for
+; two extra slots addressed below the table base (see the *_splat defines).
+; Args: %1 = function stem, %2 = isa suffix, %3.. = label names.
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+; Splat tables alias into the dc/cfl tables past their h*/w* entries, so the
+; store-loop slots (s4..s64) can be indexed directly.
+%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
+%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)
+
+JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32
+
+; Shared tables defined elsewhere in the library.
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; Broadcast one left-column pixel (selected from m0 via pshuflw imm %3)
+; across a full row of %1 pixels and store it at [dstq + %2].
+; Args: %1 = row width, %2 = byte offset (row stride), %3 = pshuflw imm8.
+%macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8
+ pshuflw m1, m0, %3 ; extend 8 byte for 2 pos
+ punpcklqdq m1, m1
+ mova [dstq + %2], m1
+%if %1 > 16
+ mova [dstq + 16 + %2], m1
+%endif
+%if %1 > 32
+ mova [dstq + 32 + %2], m1
+ mova [dstq + 48 + %2], m1
+%endif
+%endmacro
+
+; Emit the body of one width variant (.w%1) of the horizontal predictor:
+; loads 4 left-column pixels per iteration and replicates each across its
+; row. Left pixels sit below-left of tlq, hence the pre-decrement.
+; Expects dstq/tlq/strideq/stride3q/hd live; falls out via RET.
+%macro IPRED_H 1 ; width
+ sub tlq, 4
+ movd m0, [tlq] ; get 4 bytes of topleft data
+ punpcklbw m0, m0 ; extend 2 byte
+%if %1 == 4
+ pshuflw m1, m0, q2233
+ movd [dstq+strideq*0], m1
+ psrlq m1, 32
+ movd [dstq+strideq*1], m1
+ pshuflw m0, m0, q0011
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+stride3q ], m0
+
+%elif %1 == 8
+ punpcklwd m0, m0
+ punpckhdq m1, m0, m0
+ punpckldq m0, m0
+ movq [dstq+strideq*1], m1
+ movhps [dstq+strideq*0], m1
+ movq [dstq+stride3q ], m0
+ movhps [dstq+strideq*2], m0
+%else
+ IPRED_SET %1, 0, q3333
+ IPRED_SET %1, strideq, q2222
+ IPRED_SET %1, strideq*2, q1111
+ IPRED_SET %1, stride3q, q0000
+%endif
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+%endmacro
+
+INIT_XMM ssse3
+; Horizontal intra prediction: each output row is the left-neighbour pixel
+; replicated across the row. Dispatches on log2(width) via the jump table;
+; the per-width bodies are generated by IPRED_H above.
+cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_h_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4
+.w8:
+ IPRED_H 8
+.w16:
+ IPRED_H 16
+.w32:
+ IPRED_H 32
+.w64:
+ IPRED_H 64
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; Vertical intra prediction: every row is a copy of the top-neighbour row.
+; Loads up to 64 top pixels into m0..m3, then tail-jumps into the shared
+; dc splat store loops (s4..s64), which store m0..m3 per row.
+cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table
+ tzcnt wd, wm
+ movu m0, [tlq+ 1]
+ movu m1, [tlq+17]
+ movu m2, [tlq+33]
+ movu m3, [tlq+49]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; DC intra prediction: dst is filled with the average of the left column and
+; top row neighbours. Two-phase dispatch: jump to .h<height> to accumulate
+; the left column into m0, which then jumps to .w<width> (table offset +20)
+; to add the top row, finish the average, and splat-store via .s<width>.
+; Neighbour bytes are summed with pmaddubsw against m3 = all -1 (negated
+; sums), and rectangular blocks divide by w+h = 3x or 5x min(w,h) using
+; pmulhuw fixed-point multipliers (0x5556/2^16 ~ 1/3, 0x3334/2^16 ~ 1/5).
+; m4 = (w+h)>>1 rounding bias, m5 = ctz(w+h) shift count.
+cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd m4, r5d
+ tzcnt r5d, r5d
+ movd m5, r5d
+ LEA r5, ipred_dc_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+20]
+ pcmpeqd m3, m3
+ psrlw m4, 1 ; dc = (width + height) >> 1;
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd m0, [tlq-4]
+ pmaddubsw m0, m3
+ jmp wq
+.w4:
+ movd m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m0, m4
+ paddw m0, m1
+ pmaddwd m0, m3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
+.w4_mul:
+ ; rectangular 4xh: horizontal-sum, then multiply by ~1/3 (h==8) or
+ ; ~1/5 (h==16) in 16.16 fixed point via pmulhuw
+ punpckhqdq m1, m0, m0
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8
+ cmovz r6d, r2d
+ movd m5, r6d
+ pmulhuw m0, m5
+.w4_end:
+ pxor m1, m1
+ pshufb m0, m1 ; broadcast dc byte to all lanes
+.s4:
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ movq m0, [tlq-8]
+ pmaddubsw m0, m3
+ jmp wq
+.w8:
+ movq m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w8_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s8:
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-16]
+ pmaddubsw m0, m3
+ jmp wq
+.w16:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w16_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ mova m2, [tlq-16]
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 32
+ je .w32_end
+ ; (removed dead "lea r2d, [hq*2]": r2d was unconditionally
+ ; overwritten by the mov below before any use)
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 64|16
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w32_end:
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+.s32:
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q], m0
+ mova [dstq+stride3q+16], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-64]
+ mova m1, [tlq-48]
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-32]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-16]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 1]
+ movu m2, [tlq+17]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+33]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+49]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w64_end:
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64:
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq+32], m2
+ mova [dstq+strideq+48], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; DC-left prediction: dst is filled with the average of the left column only.
+; Sums the h left pixels (pmaddubsw vs m2 = all -1 gives negated sums),
+; averages with pmulhrsw against m3 = 32768 >> log2(h), then tail-jumps into
+; the shared dc splat store loops. dc_top reuses the .h* entry points with
+; top-row data, hence the "unaligned" notes on the wide loads.
+cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_left_ssse3_table
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+ movd m2, r6d
+ psrld m3, m2
+ movsxd r6, [r5+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m1, [tlq+48] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h16:
+ pshufd m1, m0, q3232 ; psrlq m1, m0, 16
+ paddw m0, m1
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m2
+ pmulhrsw m0, m3
+ lea stride3q, [strideq*3]
+ pxor m1, m1
+ pshufb m0, m1 ; broadcast dc byte; splat loops store m0..m3
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; DC-128 prediction: no neighbours available, fill dst with the constant 128.
+; Broadcasts pb_128 into m0..m3 and tail-jumps into the shared splat stores.
+cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; DC-top prediction: dst is filled with the average of the top row only.
+; Mirrors dc_left but sums [tlq+1..] (top row) and shifts by log2(w); reuses
+; dc_left's .h* accumulation code by indexing its table with the width.
+cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
+ LEA r5, ipred_dc_left_ssse3_table
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+ movd m2, wd
+ psrld m3, m2
+ movsxd r6, [r5+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; Weighted blend of two pixel rows: out = (w*a + (256-w)*b + 128) >> 8,
+; split into two signed pmaddubsw products (see identity below) because
+; pmaddubsw needs the weight operand in signed-byte range.
+; Args: %1/%2 = interleaved weight regs, %3/%4 = interleaved pixel regs,
+; %5/%6 = precomputed bias terms. Result packed into m6; clobbers m0, m6.
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+ ; w * a = (w - 128) * a + 128 * a
+ ; (256 - w) * b = (127 - w) * b + 129 * b
+ ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
+ pmaddubsw m6, m%3, m%1
+ pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b
+ paddw m6, m%5
+ paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
+ psrlw m6, 8
+ psrlw m0, 8
+ packuswb m6, m0
+%endmacro
+
+; Smooth-vertical prediction: each output pixel is a per-row weighted blend
+; of the top-row pixel above it and the bottom-left pixel (m5), using
+; smooth_weights[height] as vertical weights. Top/bottom pairs are
+; interleaved once before the loop; the loop only varies the weights.
+cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_ssse3_table
+ LEA r6, ipred_smooth_v_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ movd m5, [tlq+hq]
+ pxor m2, m2
+ pshufb m5, m2 ; broadcast bottom-left pixel
+ add wq, r6
+ jmp wq
+.w4:
+ movd m2, [tlq+1]
+ punpckldq m2, m2
+ punpcklbw m2, m5 ; top, bottom
+ lea r3, [strideq*3]
+ mova m4, [base+ipred_v_shuf]
+ mova m5, m4
+ punpckldq m4, m4
+ punpckhdq m5, m5
+ pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom
+ paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128
+.w4_loop:
+ movu m1, [weightsq+hq*2]
+ pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ movd [dstq+strideq*0], m6
+ pshuflw m1, m6, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m6, m6
+ movd [dstq+strideq*2], m6
+ psrlq m6, 32
+ movd [dstq+r3 ], m6
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ movq m2, [tlq+1]
+ punpcklbw m2, m5
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1 ; m3 is output for loop
+.w8_loop:
+ movq m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ movq [dstq+strideq*0], m6
+ movhps [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1 ; m4 and m5 is output for loop
+.w16_loop:
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add dstq, strideq
+ add hq, 1
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32:
+; w32/w64 recompute the blend terms per 16-pixel column chunk (r3d chunks
+; per row) because not enough registers are free to keep them all live.
+%if WIN64
+ movaps [rsp+24], xmm7
+ %define xmm_regs_used 8
+%endif
+ mova m7, m5
+.w32_loop_init:
+ mov r3d, 2
+.w32_loop:
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add tlq, 16
+ add dstq, 16
+ dec r3d
+ jg .w32_loop
+ lea dstq, [dstq-32+strideq]
+ sub tlq, 32
+ add hq, 1
+ jl .w32_loop_init
+ RET
+ALIGN function_align
+.w64:
+%if WIN64
+ movaps [rsp+24], xmm7
+ %define xmm_regs_used 8
+%endif
+ mova m7, m5
+.w64_loop_init:
+ mov r3d, 4
+.w64_loop:
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add tlq, 16
+ add dstq, 16
+ dec r3d
+ jg .w64_loop
+ lea dstq, [dstq-64+strideq]
+ sub tlq, 64
+ add hq, 1
+ jl .w64_loop_init
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; Smooth-horizontal prediction: each output pixel is a per-column weighted
+; blend of the left-column pixel of its row and the top-right pixel (m3),
+; using smooth_weights[width] as horizontal weights. m4/m5 hold the
+; pb_127_m127 / pw_128 constants for the pmaddubsw blend identity (see
+; SMOOTH macro comments above).
+cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_ssse3_table
+ LEA r6, ipred_smooth_h_ssse3_table
+ mov wd, wm
+ movd m3, [tlq+wq]
+ pxor m1, m1
+ pshufb m3, m1 ; right
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m4, [base+pb_127_m127]
+ movddup m5, [base+pw_128]
+ add wq, r6
+ jmp wq
+.w4:
+ movddup m6, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ sub tlq, hq
+ lea r3, [strideq*3]
+.w4_loop:
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r3 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova m6, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ sub tlq, hq
+ punpckldq m7, m7
+.w8_loop:
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ mova m6, [base+smooth_weights+16*2]
+ mova m7, [base+smooth_weights+16*3]
+ sub tlq, 1
+ sub tlq, hq
+.w16_loop:
+ pxor m1, m1
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m1 ; broadcast this row's left pixel
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ lea dstq, [dstq+strideq]
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+; w32/w64 walk the weight table via r3 in 16-byte steps, r5 = 16-pixel
+; column chunks per row.
+ sub tlq, 1
+ sub tlq, hq
+ pxor m6, m6
+.w32_loop_init:
+ mov r5, 2
+ lea r3, [base+smooth_weights+16*4]
+.w32_loop:
+ mova m7, [r3]
+ add r3, 16
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m6
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m7
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ mova m7, [r3]
+ add r3, 16
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 16
+ dec r5
+ jg .w32_loop
+ lea dstq, [dstq-32+strideq]
+ sub hd, 1
+ jg .w32_loop_init
+ RET
+ALIGN function_align
+.w64:
+ sub tlq, 1
+ sub tlq, hq
+ pxor m6, m6
+.w64_loop_init:
+ mov r5, 4
+ lea r3, [base+smooth_weights+16*8]
+.w64_loop:
+ mova m7, [r3]
+ add r3, 16
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m6
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m7
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ mova m7, [r3]
+ add r3, 16
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 16
+ dec r5
+ jg .w64_loop
+ lea dstq, [dstq-64+strideq]
+ sub hd, 1
+ jg .w64_loop_init
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+; Final combine for the 2-D smooth predictor: vertical blend terms (%1..%4
+; via pmaddubsw) plus horizontal blend terms (%5/%6), averaged against the
+; horizontal accumulators m2/m3 with pavgw, then >>8 and packed into m0.
+; %5/%6/%7 may each be a register number or a memory operand.
+%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3
+ pmaddubsw m6, m%3, m%1
+ mova m0, m6
+ pmaddubsw m6, m%4, m%2
+ mova m1, m6
+%ifnum %5
+ paddw m0, m%5
+%else
+ paddw m0, %5
+%endif
+%ifnum %6
+ paddw m1, m%6
+%else
+ paddw m1, %6
+%endif
+%ifnum %7
+%else
+ mova m3, %7
+%endif
+ pavgw m0, m2
+ pavgw m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
+
+; Produce one 16-pixel column chunk of one row of the 2-D smooth predictor
+; for the w32/w64 paths. Recomputes the vertical blend terms for the top
+; chunk cached at [rsp+16*%1], blends in the left/right horizontal terms
+; with weights %6/%7, stores 16 output bytes at [dstq], then restores the
+; registers it clobbered from the stack slots %9..%12 ("recovery").
+%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
+ mova m1, [rsp+16*%1] ; top
+ punpckhbw m6, m1, m0 ; top, bottom
+ punpcklbw m1, m0 ; top, bottom
+ pmaddubsw m2, m1, m5
+ mova [rsp+16*%2], m1
+ paddw m1, m3 ; 1 * top + 255 * bottom + 255
+ paddw m2, m1 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*%3], m2
+ pmaddubsw m2, m6, m5
+ mova [rsp+16*%4], m6
+ paddw m6, m3 ; 1 * top + 255 * bottom + 255
+ paddw m2, m6 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*%5], m2
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
+ punpcklbw m1, m4 ; left, right
+ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
+ paddw m2, m1 ; 128 * left + 129 * right
+ mova m3, m2
+ pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width];
+ pmaddubsw m1, %7
+ paddw m2, m3, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ mova m7, [rsp+16*%9]
+ pshufb m1, m7
+ mova [rsp+16*%8], m3
+ mova m4, [rsp+16*%2]
+ mova m5, [rsp+16*%3]
+ mova m3, [rsp+16*%4]
+ mova m7, [rsp+16*%5]
+ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8]
+ mova [dstq], m0
+ movddup m3, [base+pw_255] ; recovery
+ mova m0, [rsp+16*%10] ; recovery
+ mova m4, [rsp+16*%11] ; recovery
+ mova m5, [rsp+16*%12] ; recovery
+%endmacro
+
+; 2-D smooth prediction: average of the smooth-vertical blend (top row vs
+; bottom-left pixel, weighted by smooth_weights[height]) and the
+; smooth-horizontal blend (left column vs top-right pixel, weighted by
+; smooth_weights[width]). Loop-invariant blend terms are precomputed into
+; the 13*16-byte stack frame; register roles per width are noted inline.
+cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_ssse3_table
+ mov wd, wm
+ mov hd, hm
+ LEA r6, ipred_smooth_ssse3_table
+ movd m4, [tlq+wq] ; right
+ pxor m2, m2
+ pshufb m4, m2
+ tzcnt wd, wd
+ mov r5, tlq
+ sub r5, hq
+ movsxd wq, [r6+wq*4]
+ movddup m5, [base+pb_127_m127]
+ movd m0, [r5]
+ pshufb m0, m2 ; bottom
+ movddup m3, [base+pw_255]
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
+ jmp wq
+.w4:
+ mova m7, [base+ipred_v_shuf]
+ movd m1, [tlq+1] ; left
+ pshufd m1, m1, q0000
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m1, m0 ; top, bottom
+ pshufd m6, m7, q1100
+ pshufd m7, m7, q3322
+ pmaddubsw m2, m1, m5
+ paddw m3, m1 ; 1 * top + 255 * bottom + 255
+ paddw m2, m3 ; 128 * top + 129 * bottom + 255
+ ; stack layout for the loop: 0 = top/bottom pairs, 1 = vertical bias,
+ ; 2 = horizontal weights, 3 = right pixel, 4 = shuffle, 5 = pb_127_m127
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width];
+ punpcklqdq m1, m1
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m6
+ mova [rsp+16*5], m5
+.w4_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+ipred_h_shuf]
+ punpcklbw m0, m1, m4 ; left, right
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
+ pmaddubsw m3, m1, m5
+ paddw m2, m0 ; 128 * left + 129 * right
+ paddw m3, m1
+ mova m4, [rsp+16*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ paddw m2, m0
+ paddw m3, m1
+ movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 8
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+ mova m4, [rsp+16*3]
+ mova m6, [rsp+16*4]
+ mova m5, [rsp+16*5]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r3 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova m7, [base+ipred_v_shuf]
+ movq m1, [tlq+1] ; left
+ punpcklqdq m1, m1
+ sub tlq, 4
+ sub tlq, hq
+ punpcklbw m1, m0
+ pshufd m6, m7, q0000
+ pshufd m7, m7, q1111
+ pmaddubsw m2, m1, m5
+ paddw m3, m1
+ paddw m2, m3
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m6
+ mova [rsp+16*5], m5
+.w8_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+ipred_h_shuf]
+ pshufd m1, m1, q1100
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5
+ pmaddubsw m3, m1, m5
+ paddw m2, m0
+ paddw m3, m1
+ mova m4, [rsp+16*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ paddw m2, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 4
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+ mova m4, [rsp+16*3]
+ mova m6, [rsp+16*4]
+ mova m5, [rsp+16*5]
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ mova m7, [base+ipred_v_shuf]
+ movu m1, [tlq+1] ; left
+ sub tlq, 4
+ sub tlq, hq
+ punpckhbw m6, m1, m0 ; top, bottom
+ punpcklbw m1, m0 ; top, bottom
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ pmaddubsw m2, m6, m5
+ mova [rsp+16*5], m6
+ paddw m6, m3 ; 1 * top + 255 * bottom + 255
+ paddw m2, m6 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*6], m2
+ pmaddubsw m2, m1, m5
+ paddw m3, m1 ; 1 * top + 255 * bottom + 255
+ mova [rsp+16*0], m1
+ paddw m2, m3 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*1], m2
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m5
+.w16_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
+ punpcklbw m1, m4 ; left, right
+ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
+ paddw m2, m1 ; 128 * left + 129 * right
+ mova m0, m1
+ mova m3, m2
+ pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width];
+ pmaddubsw m1, [base+smooth_weights+16*3]
+ paddw m2, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 2
+ mova m7, [rsp+16*2]
+ pshufb m1, m7
+ mova [rsp+16*7], m3
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ mova m3, [rsp+16*5]
+ mova m7, [rsp+16*6]
+ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
+ mova m4, [rsp+16*3]
+ mova m5, [rsp+16*4]
+ mova [dstq], m0
+ lea dstq, [dstq+strideq]
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m1, [tlq+1] ; top topleft[1 + x]
+ movu m2, [tlq+17] ; top
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ sub tlq, 4
+ sub tlq, hq
+ mova m7, [base+ipred_v_shuf]
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ mova [rsp+16*3], m0
+ mova [rsp+16*4], m4
+ mova [rsp+16*5], m5
+.w32_loop:
+ ; two 16-pixel chunks per row; SMOOTH_OUTPUT_16B restores clobbered regs
+ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
+ lea dstq, [dstq-16+strideq]
+ add v_weightsq, 2
+ sub hd, 1
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m1, [tlq+1] ; top topleft[1 + x]
+ movu m2, [tlq+17] ; top
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ movu m1, [tlq+33] ; top
+ movu m2, [tlq+49] ; top
+ mova [rsp+16*11], m1
+ mova [rsp+16*12], m2
+ sub tlq, 4
+ sub tlq, hq
+ mova m7, [base+ipred_v_shuf]
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ mova [rsp+16*3], m0
+ mova [rsp+16*4], m4
+ mova [rsp+16*5], m5
+.w64_loop:
+ ; four 16-pixel chunks per row
+ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
+ lea dstq, [dstq-48+strideq]
+ add v_weightsq, 2
+ sub hd, 1
+ jg .w64_loop
+ RET
+
+; ---------------------------------------------------------------------------
+; void ipred_z1_8bpc_ssse3(pixel *dst, ptrdiff_t stride, const pixel *topleft,
+;                          int width, int height, int angle, ...)
+; Directional ("Z1") intra prediction, angles 0..90: every predicted pixel is
+; interpolated from the top edge only. Dispatches on width through
+; ipred_z1_ssse3_table; each width path optionally upsamples or smooths the
+; edge first (intra edge filter), then walks rows at a fixed per-row step dx.
+; NOTE(review): exact argument list beyond the named cglobal params (angle
+; packing in anglem) follows dav1d's ipred ABI — confirm against ipred.h.
+; ---------------------------------------------------------------------------
+%if ARCH_X86_64
+cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
+ %define base r7-$$
+ lea r7, [$$]
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+%else
+; x86-32: only 8 xmm registers, so the three constants stay as memory
+; operands via %define, and stride is spilled to the stack (stridemp).
+cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define strideq r3
+ %define stridemp dword [rsp+16*12]
+ mov stridemp, r1
+ LEA r1, $$
+%endif
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ inc tlq
+ movsxd wq, [base+ipred_z1_ssse3_table+wq*4]
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ lea wq, [base+wq+ipred_z1_ssse3_table]
+ movzx dxd, word [base+dr_intra_derivative+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ jmp wq
+; --- width 4 ---------------------------------------------------------------
+.w4:
+ lea r3d, [angleq+88]
+ test r3d, 0x480
+ jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r3d, 9
+ add r3d, hd
+ cmp r3d, 8
+ jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
+; edge upsampling: double the top-edge resolution with a 4-tap (-4,36,36,-4)
+; filter and double dx to match.
+ mova m1, [tlq-1]
+ pshufb m0, m1, [base+z_upsample1]
+ pshufb m1, [base+z_upsample2]
+ movddup m2, [base+pb_36_m4]
+ add dxd, dxd
+ pmaddubsw m0, m2
+ pshufd m7, m1, q3333
+ movd [rsp+16], m7 ; top[max_base_x]
+ pmaddubsw m1, m2
+ movd m6, dxd
+ mov r5d, dxd ; xpos
+ pshufb m6, [base+pw_256]
+ paddw m1, m0
+ movq m0, [tlq]
+ pmulhrsw m1, m10
+ paddw m7, m6, m6
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ packuswb m1, m1
+ punpcklbw m0, m1
+ movifnidn strideq, stridemp
+ mova [rsp], m0
+.w4_upsample_loop:
+; two rows per iteration: base = xpos >> 6, frac = xpos & 0x3e,
+; out = (edge[base]*(64-frac) + edge[base+1]*frac + 32) >> 6 via pmaddubsw.
+ lea r2d, [r5+dxq]
+ shr r5d, 6 ; base0
+ movq m0, [rsp+r5]
+ lea r5d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movhps m0, [rsp+r2]
+ pand m2, m8, m6 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_upsample_loop
+ RET
+.w4_no_upsample:
+; derive edge filter strength from angle/size tables; strength 0 skips
+; filtering entirely.
+ mov r3d, 7 ; max_base
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea r3d, [hq+3]
+ movd m0, r3d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r3d, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+; apply the selected 3/5-tap smoothing kernel (z_filter_k) to the edge and
+; keep the result on the stack; tlq is repointed at it.
+ mova m3, [tlq-1]
+ imul r5d, 0x55555555
+ movu m7, [base+z_filter_s+8]
+ shr r5d, 30 ; filter_strength
+ movddup m0, [base+pb_8]
+ pminub m7, m0
+ pshufb m0, m3, [base+z_filter_s]
+ movddup m4, [base+z_filter_k-8+r5*8+24*0]
+ pshufb m3, m7
+ movddup m5, [base+z_filter_k-8+r5*8+24*1]
+ shufps m2, m0, m3, q2121
+ movddup m6, [base+z_filter_k-8+r5*8+24*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m2, m4
+ pmaddubsw m2, m5
+ paddd m5, m6
+ pmaddubsw m4, m3, m5
+ pmaddubsw m3, m6
+ paddw m0, m2
+ paddw m1, m4
+ paddw m0, m3
+ pshufd m1, m1, q3333
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ mov r5d, 9
+ mov tlq, rsp
+ cmp hd, 4
+ cmovne r3d, r5d
+ packuswb m0, m1
+ mova [tlq], m0
+.w4_main:
+; main interpolation: m5/m6 track two row positions (xpos0/xpos1), m4 holds
+; max_base_x<<6 for the "ran past the edge" clamp, m7 is the replicated
+; rightmost edge pixel used beyond max_base_x.
+ add tlq, r3
+ movd m5, dxd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ movd m7, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd ; xpos
+ pshufb m7, [base+pw_m256]
+ sub r5, r3
+ pshufb m4, [base+pw_256]
+ mova m3, [base+z1_shuf_w4]
+ paddw m6, m5, m5
+ psubw m4, m0 ; max_base_x
+ punpcklqdq m5, m6 ; xpos0 xpos1
+.w4_loop:
+ lea r3, [r5+dxq]
+ sar r5, 6 ; base0
+ movq m0, [tlq+r5]
+ lea r5, [r3+dxq]
+ sar r3, 6 ; base1
+ movhps m0, [tlq+r3]
+ pand m2, m8, m5 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ movifnidn strideq, stridemp
+ pcmpgtw m1, m4, m5 ; base < max_base_x
+ pmulhrsw m0, m10
+ paddw m5, m6 ; xpos += dx
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ test r5d, r5d
+ jl .w4_loop
+; xpos reached max_base_x: all remaining rows are the clamped edge pixel.
+ packuswb m7, m7
+.w4_end_loop:
+ movd [dstq+strideq*0], m7
+ movd [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_end_loop
+.w4_end:
+ RET
+; --- width 8 (same structure as w4: upsample / filter / interpolate) -------
+.w8:
+ lea r3d, [angleq+88]
+ and r3d, ~0x7f
+ or r3d, hd
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ mova m5, [base+z_upsample1]
+ movu m3, [base+z_filter_s+6]
+ movd m4, hd
+ mova m0, [tlq-1]
+ movu m1, [tlq+7]
+ pxor m7, m7
+ pshufb m4, m7
+ movddup m7, [base+pb_36_m4]
+ pminub m4, m3
+ add dxd, dxd
+ pshufb m2, m0, m5
+ pmaddubsw m2, m7
+ pshufb m0, m3
+ pmaddubsw m0, m7
+ movd m6, dxd
+ pshufb m3, m1, m5
+ pmaddubsw m3, m7
+ pshufb m1, m4
+ pmaddubsw m1, m7
+ pshufb m6, [base+pw_256]
+ mov r5d, dxd
+ paddw m2, m0
+ paddw m7, m6, m6
+ paddw m3, m1
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ movu m1, [tlq]
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ packuswb m2, m3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movifnidn strideq, stridemp
+ mova [rsp+16*0], m0
+ mova [rsp+16*1], m1
+.w8_upsample_loop:
+ lea r2d, [r5+dxq]
+ shr r5d, 6 ; base0
+ movu m0, [rsp+r5]
+ lea r5d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu m1, [rsp+r2]
+ pand m2, m8, m6
+ psubw m3, m9, m2
+ psllw m2, 8
+ por m3, m2
+ punpcklqdq m2, m3, m3 ; frac0
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3 ; frac1
+ pmaddubsw m1, m3
+ paddw m6, m7
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ movd m0, r3d
+ and r3d, 7
+ or r3d, 8 ; imin(h+7, 15)
+ test angled, 0x400
+ jnz .w8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+; copy the edge (padded on both ends) to the stack and smooth it with the
+; shared .filter_edge subroutine.
+ movd m3, [tlq-1]
+ movu m0, [tlq+16*0]
+ imul r5d, 0x55555555
+ movu m1, [tlq+16*1]
+ shr r5d, 30 ; filter_strength
+ movd m2, [tlq+r3]
+ lea tlq, [rsp+16*4]
+ sub r5, 3
+ mova [tlq-16*1], m0
+ pxor m7, m7
+ mova [tlq+16*0], m1
+ pshufb m3, m7
+ pshufb m2, m7
+ mova [tlq-16*2], m3
+ movq [tlq+r3-15], m2
+ call .filter_edge
+ sar r5d, 1
+ add r5d, 17
+ cmp hd, 8
+ cmova r3d, r5d
+.w8_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq]
+ shl r3d, 6
+ movu m3, [base+z_filter_s+2]
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd
+ pshufb m7, [base+pw_m256]
+ sub r5, r3
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z_base_inc]
+ mova m6, m5
+.w8_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m0, [tlq+r3]
+ pand m1, m8, m5
+ psubw m2, m9, m1
+ psllw m1, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [dstq], m0
+ dec hd
+ jz .w8_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w8_loop
+ packuswb m7, m7
+.w8_end_loop:
+ movq [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w8_end_loop
+.w8_end:
+ RET
+; --- width 16 (no upsampling at this size; filter + interpolate) -----------
+.w16:
+ lea r3d, [hq+15]
+ movd m0, r3d
+ and r3d, 15
+ or r3d, 16 ; imin(h+15, 31)
+ test angled, 0x400
+ jnz .w16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m1, m0, [base+z_filter_wh16]
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ movd m4, [tlq-1]
+ movu m0, [tlq+16*0]
+ imul r5d, 0x24924924
+ movu m1, [tlq+16*1]
+ shr r5d, 30
+ movd m2, [tlq+30]
+ adc r5, -4 ; filter_strength-3
+ movd m3, [tlq+r3]
+ lea tlq, [rsp+16*4]
+ mova [tlq-16*1], m0
+ pxor m7, m7
+ mova [tlq+16*0], m1
+ pshufb m4, m7
+ movd [rsp], m2
+ pshufb m3, m7
+ mova [tlq-16*2], m4
+ movd [tlq+r3-16], m3
+ call .filter_edge
+ cmp hd, 16
+ jle .w16_main
+; h > 16: extend the filtered edge by one extra pixel (z_filter_k_tail).
+ pshuflw m0, [rsp], q0000
+ sar r5, 1
+ movd m1, [base+z_filter_k_tail+4+r5*4]
+ lea r3d, [r5+33]
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq+32], m0
+.w16_main:
+; 16-wide rows use unaligned byte pairs (edge[base], edge[base+1]) and a
+; byte-granular clamp mask (pcmpgtb against pb_0to15) instead of word masks.
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq]
+ movd m4, r3d
+ shl r3d, 6
+ pshufb m5, [base+pw_256]
+ pxor m6, m6
+ pshufb m7, m6
+ mov r5d, dxd
+ pshufb m4, m6
+ sub r5, r3
+ psubb m4, [base+pb_0to15]
+ mova m6, m5
+.w16_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m3, m5, 6
+ packsswb m3, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ paddw m5, m6
+ pcmpgtb m2, m4, m3
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq], m0
+ dec hd
+ jz .w16_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w16_loop
+.w16_end_loop:
+ mova [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w16_end_loop
+.w16_end:
+ RET
+; --- width 32: two 16-byte halves per row, two .filter_edge passes ---------
+.w32:
+ lea r3d, [hq+31]
+ and r3d, 31
+ or r3d, 32 ; imin(h+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ movd m6, [tlq-1]
+ movu m0, [tlq+16*0]
+ movu m1, [tlq+16*1]
+ movu m2, [tlq+16*2]
+ movu m3, [tlq+16*3]
+ movd m4, [tlq+62]
+ movd m5, [tlq+r3]
+ lea tlq, [rsp+16*6]
+ mova [tlq-16*3], m0
+ pxor m7, m7
+ mova [tlq-16*2], m1
+ pshufb m6, m7
+ mova [tlq-16*1], m2
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq+16*0], m3
+ movd [rsp], m4
+ pshufb m5, m7
+ mova [tlq-16*4], m6
+ movd [tlq+r3-48], m5
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ cmp hd, 32
+ jle .w32_main
+ pshuflw m0, [rsp], q0000
+ movd m1, [base+z_filter_k_tail+4]
+ add r3d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq+64], m0
+.w32_main:
+; clamp masks for both 16-byte halves are precomputed on the stack
+; (second half offset by -16 via pb_m16).
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd
+ pshufb m0, m6
+ pshufb m5, [base+pw_256]
+ sub r5, r3
+ pshufb m7, m6
+ psubb m0, [base+pb_0to15]
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ mova m6, m5
+.w32_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*1], m0
+ dec hd
+ jz .w32_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w32_loop
+.w32_end_loop:
+ mova [dstq+16*0], m7
+ mova [dstq+16*1], m7
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+; --- width 64: four 16-byte quarters per row, up to four filter passes -----
+.w64:
+ lea r3d, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ movd m4, [tlq-1]
+ movu m0, [tlq+16*0]
+ movu m1, [tlq+16*1]
+ movu m2, [tlq+16*2]
+ movu m3, [tlq+16*3]
+ mova [rsp+16*3], m0
+ pxor m7, m7
+ mova [rsp+16*4], m1
+ pshufb m4, m7
+ mova [rsp+16*5], m2
+ mova [rsp+16*6], m3
+ mova [rsp+16*2], m4
+ movu m0, [tlq+16*4]
+ movu m1, [tlq+16*5]
+ movu m2, [tlq+16*6]
+ movu m3, [tlq+16*7]
+ movd m4, [tlq+r3]
+ lea tlq, [rsp+16*10]
+ mova [tlq-16*3], m0
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*2], m1
+ pshufb m4, m7
+ mova [tlq-16*1], m2
+ mova [tlq+16*0], m3
+ movd [tlq+r3-16*7], m4
+ cmp hd, 64
+ jl .w64_filter96 ; skip one call if the last 32 bytes aren't used
+ call .filter_edge
+.w64_filter96:
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+.w64_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd
+ pshufb m0, m6
+ sub r5, r3
+ pshufb m5, [base+pw_256]
+ pshufb m7, m6
+ psubb m0, [base+pb_0to15]
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ paddb m0, m1
+ mova [rsp+16*2], m0
+ paddb m0, m1
+ mova [rsp+16*3], m0
+ mova m6, m5
+.w64_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*2+0]
+ movu m2, [tlq+r3+16*2+1]
+ mova [dstq+16*1], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*2], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*3+0]
+ movu m2, [tlq+r3+16*3+1]
+ mova [dstq+16*2], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*3], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*3], m0
+ dec hd
+ jz .w64_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w64_loop
+.w64_end_loop:
+ mova [dstq+16*0], m7
+ mova [dstq+16*1], m7
+ mova [dstq+16*2], m7
+ mova [dstq+16*3], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+ALIGN function_align
+; ---------------------------------------------------------------------------
+; .filter_edge: shared edge-smoothing subroutine.
+; In:  tlq points just past 32 edge bytes to filter (reads [tlq-18..tlq+2]),
+;      r5d = filter_strength-3 (0 selects the 5-tap path, nonzero = 3-tap).
+; Out: 32 filtered bytes written to [tlq+16*0..16*1]. Clobbers m0-m7
+;      (m4 on x86-32 is reloaded with pw_512). Also called from ipred_z2/z3.
+; ---------------------------------------------------------------------------
+.filter_edge: ; 32 pixels/iteration
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ movu m2, [tlq-18]
+ movu m1, [tlq-17]
+ movu m3, [tlq- 2]
+ movu m4, [tlq- 1]
+ punpcklbw m0, m2, m1
+ pmaddubsw m0, m7
+ punpckhbw m2, m1
+ pmaddubsw m2, m7
+ punpcklbw m1, m3, m4
+ pmaddubsw m1, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ mova m5, [tlq-16]
+ movu m6, [tlq-15]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ mova m5, [tlq+ 0]
+ movu m6, [tlq+ 1]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m1, m4
+ paddw m3, m5
+ test r5d, r5d
+ jnz .filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8]
+ movu m5, [tlq-14]
+ movu m6, [tlq+ 2]
+ punpcklbw m4, m5, m5
+ pmaddubsw m4, m7
+ punpckhbw m5, m5
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ punpcklbw m5, m6, m6
+ pmaddubsw m5, m7
+ punpckhbw m6, m6
+ pmaddubsw m6, m7
+ paddw m1, m5
+ paddw m3, m6
+.filter_end:
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m10}, m0, m2, m1, m3
+%else
+ mova m4, m10
+ REPX {pmulhrsw x, m4 }, m0, m2, m1, m3
+%endif
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [tlq+16*0], m0
+ mova [tlq+16*1], m1
+ ret
+
+; ---------------------------------------------------------------------------
+; void ipred_z2_8bpc_ssse3(pixel *dst, ptrdiff_t stride, const pixel *topleft,
+;                          int width, int height, int angle, ...)
+; Directional ("Z2") intra prediction, angles 90..180: pixels left of the
+; projected diagonal are interpolated from the LEFT edge (stepping by dy),
+; the rest from the TOP edge (stepping by dx). Both edges are copied to the
+; stack (left below rsp+16*6, top at rsp+16*8) and optionally upsampled or
+; filtered first. maxw/maxh (stack args r6m/r7m) bound how much of each edge
+; is actually valid. All width paths >= 8 funnel into .w4_main, processing
+; 4 columns per pass (r9d packs the remaining-width counter in its high bits).
+; ---------------------------------------------------------------------------
+%if ARCH_X86_64
+cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
+ %define base r7-$$
+ %define maxwm r6m
+ %define maxhm r7m
+ lea r7, [$$]
+ mov hd, hm
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ lea r9d, [wq-4]
+ mova m10, [base+pw_512]
+ shl r9d, 6
+ mova m11, [base+z1_shuf_w4]
+ or r9d, hd
+ mova m12, [base+z2_h_shuf]
+%else
+; x86-32: registers r8-r11, maxw/maxh, stride, and the constant/shuffle
+; vectors m8-m12 all live in stack slots via %define.
+cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define m11 [rsp+16*16]
+ %define m12 [rsp+16*17]
+ %define r8 [rsp+16*6+4*1]
+ %define r9b byte [rsp+16*18+4*0]
+ %define r9d dword [rsp+16*18+4*0]
+ %define r10d dword [rsp+16*18+4*1]
+ %define r11d dword [rsp+16*18+4*2]
+ %define maxwm [rsp+16*18+4*3]
+ %define maxhm [rsp+16*19+4*0]
+ %define stridemp [rsp+16*19+4*1]
+ %define strideq r3
+ %define dyd r4
+ %define dyq r4
+ mov stridemp, r1
+ mov r1d, r6m
+ mov r4d, r7m
+ mov maxwm, r1d
+ mov maxhm, r4d
+ LEA r1, $$
+ lea hd, [wq-4]
+ mova m0, [base+z1_shuf_w4]
+ shl hd, 6
+ mova m1, [base+z2_h_shuf]
+ or hd, hm
+ mova m11, m0
+ mov r9d, hd
+ mova m12, m1
+%endif
+ tzcnt wd, wd
+ movifnidn angled, anglem
+ movsxd wq, [base+ipred_z2_ssse3_table+wq*4]
+%if ARCH_X86_64
+ movzx dxd, angleb
+%else
+ movzx dxd, byte anglem
+%endif
+ xor angled, 0x400
+; copy the left edge (tlq-64..tlq) and the first 32 top pixels to the stack;
+; dx/dy are looked up from dr_intra_derivative on either side of 90 degrees.
+ mova m0, [tlq-16*4]
+ mov dyd, dxd
+ mova m1, [tlq-16*3]
+ neg dxq
+ mova m2, [tlq-16*2]
+ and dyd, ~1
+ mova m3, [tlq-16*1]
+ and dxq, ~1
+ movd m4, [tlq]
+ movu m5, [tlq+16*0+1]
+ movu m6, [tlq+16*1+1]
+ movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
+ movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
+ mova [rsp+16*2], m0
+ pxor m7, m7
+ mova [rsp+16*3], m1
+ pshufb m4, m7
+ mova [rsp+16*4], m2
+ lea wq, [base+ipred_z2_ssse3_table+wq]
+ mova [rsp+16*5], m3
+ neg dxd
+ mova [rsp+16*6], m4
+ or dyd, 4<<16
+ mova [rsp+16*7], m4
+ mova [rsp+16*8], m5
+ mova [rsp+16*9], m6
+ movq m0, [base+z_base_inc+2]
+ movsldup m1, [base+z2_dy_offset]
+ movq m2, [base+pw_256] ; 4<<6
+ movq [rsp+16*14+8*0], m0
+ movq [rsp+16*15+8*0], m1
+ movq [rsp+16*15+8*1], m2
+%if ARCH_X86_64
+ lea r10d, [dxq+(128<<6)] ; xpos
+%else
+ mov [rsp+16*7+4*1], dyd
+ lea r4d, [dxq+(128<<6)]
+ mov r10d, r4d
+ movzx hd, r9b
+%endif
+ mov r11d, (128-4)<<6
+ jmp wq
+; --- width 4: decide upsampling/filtering for each edge, then .w4_main -----
+.w4:
+ test angled, 0x400
+ jnz .w4_main
+ movd m5, [tlq+4]
+ lea r3d, [hq+2]
+ add angled, 1022
+ pshufb m5, m7
+ shl r3d, 6
+ movd [rsp+16*8+4], m5
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ call .upsample_above
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ xor angled, 0x7f ; 180 - angle
+ movd m0, r3d
+ movd m6, angled
+ shr angled, 8 ; is_sm << 1
+ pshufb m0, m7
+ pshufb m6, m7
+ pcmpeqb m0, [base+z_filter_wh4]
+ pand m6, m0
+ pcmpgtb m6, [base+z_filter_t_w48+angleq*8]
+ jmp .w8_filter_left
+; ---------------------------------------------------------------------------
+; .upsample_above: double the resolution of the top edge on the stack with a
+; (-4,36,36,-4) 4-tap filter; doubles dx and halves the valid x range.
+; Shared by the w4 and w8 paths (gprsize compensates for the return address).
+; ---------------------------------------------------------------------------
+.upsample_above: ; w4/w8
+ movq m3, [rsp+gprsize+16*8-2]
+ movq m1, [rsp+gprsize+16*8-1]
+ movq m0, [rsp+gprsize+16*8+0]
+ movq m4, [rsp+gprsize+16*8+1]
+ movddup m5, [base+pb_36_m4]
+ punpcklbw m1, m3
+ punpcklbw m2, m0, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+%if ARCH_X86_64
+ mova m11, [base+pb_0to15]
+ lea r10d, [r10+dxq+(1<<6)]
+ mov r11d, (128-7)<<6
+%else
+ mova m3, [base+pb_0to15]
+ mov r3d, [rsp+gprsize+16*18+4*1]
+ mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6
+ lea r3d, [r3+dxq+(1<<6)]
+ mov [rsp+gprsize+16*18+4*1], r3d
+ mova [rsp+gprsize+16*16], m3
+%endif
+ add dxd, dxd
+ paddw m1, m2
+ pmulhrsw m1, m10
+ movq m2, [rsp+gprsize+16*14]
+ paddw m2, m2
+ movq [rsp+gprsize+16*14], m2
+ packuswb m1, m1
+ punpcklbw m1, m0
+ mova [rsp+gprsize+16*8], m1
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ mov [rsp], angled
+ sub angled, 1112 ; angle - 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m3, [base+z_filter_wh4]
+ mova m4, [base+z_filter_t_w48+angleq*8]
+ call .w8_filter_top
+ mov angled, [rsp]
+ lea r3d, [hq+2]
+ sub angled, 139
+ shl r3d, 6
+ test r3d, angled
+ jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+; ---------------------------------------------------------------------------
+; .upsample_left: same 4-tap doubling applied to the left edge; doubles dy
+; and switches the base_y shuffle to z2_upsample.
+; ---------------------------------------------------------------------------
+.upsample_left: ; w4/w8
+ neg hq
+ movd m0, [tlq+hq]
+ pshufb m0, m7
+ movd [rsp+16*6+hq-4], m0
+ movq m3, [rsp+16*5+7]
+ movq m0, [rsp+16*5+8]
+ movq m2, [rsp+16*5+9]
+ movq m4, [rsp+16*5+10]
+ movddup m5, [base+pb_36_m4]
+ punpcklbw m1, m0, m3
+ punpcklbw m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ movshdup m3, [base+z2_dy_offset]
+%if ARCH_X86_64
+ mova m12, [base+z2_upsample]
+ add dyd, dyd
+%else
+ mova m4, [base+z2_upsample]
+ shl dword [rsp+16*7+4*1], 1
+ mova m12, m4
+%endif
+ paddw m1, m2
+ pmulhrsw m1, m10
+ movq [rsp+16*15], m3
+ packuswb m1, m1
+ punpcklbw m0, m1
+ mova [rsp+16*5], m0
+.w4_main:
+; per-4-column setup: m6/m5 = xpos pairs advancing by dx, m4 = per-row
+; base_y values advancing by dy (pw_m1to4 scales dy by row index).
+ movd m6, dxd
+%if ARCH_X86_64
+ movd m3, dyd
+%else
+ movd m3, [rsp+16*7+4*1]
+%endif
+ movddup m0, [rsp+16*14+8*0]
+ pshufb m6, [base+pw_256]
+ paddw m7, m6, m6
+ movq m5, [base+pw_m1to4]
+ pshuflw m4, m3, q0000
+ punpcklqdq m6, m7
+ pmullw m4, m5
+ pshuflw m3, m3, q1111
+ paddw m6, m0
+ pshuflw m0, m4, q3333
+ psubw m4, [rsp+16*15]
+ movq [rsp+16*6+8*1], m3
+ movq [rsp+8*1], m0 ; dy*4
+%if ARCH_X86_64
+ mov r8, dstq
+%endif
+.w4_loop0:
+%if ARCH_X86_32
+ mov r8, dstq
+%endif
+ mova [rsp+16*12], m6
+ mov r2d, r10d
+ movq [rsp+8*0], m4
+ pand m0, m4, m8
+ psraw m4, 6
+ psubw m1, m9, m0
+ psllw m0, 8
+ por m0, m1 ; 64-frac_y, frac_y
+ movq [rsp+8*3], m0
+ pabsw m4, m4
+ movq [rsp+8*2], m4
+ movzx hd, r9b
+.w4_loop:
+; 4x4 block per iteration: interpolate four rows from the top edge, then
+; overwrite lanes whose base_x fell left of topleft with left-edge samples.
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movq m0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movhps m0, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movq m1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ movhps m1, [rsp+r3]
+ pand m2, m8, m6
+ paddsw m5, m6, m7
+ psubw m3, m9, m2
+ psllw m2, 8
+ pshufb m0, m11
+ por m2, m3
+ pmaddubsw m0, m2
+ pand m2, m8, m5
+ psubw m3, m9, m2
+ psllw m2, 8
+ pshufb m1, m11
+ por m2, m3
+ pmaddubsw m1, m2
+ cmp r3d, 127 ; topleft
+ jge .w4_toponly
+ movzx r3d, byte [rsp+8*2+0] ; base_y0
+ movq m3, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+2] ; base_y1
+ movhps m3, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+4] ; base_y2
+ movq m4, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+6] ; base_y3
+ movhps m4, [rsp+r3]
+ pshufb m3, m12
+ pshufb m4, m12
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ movddup m4, [rsp+8*3]
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ psraw m6, 15 ; base_x < topleft
+ pand m2, m6
+ pandn m6, m0
+ por m0, m2, m6
+ psraw m6, m5, 15
+ pand m3, m6
+ pandn m6, m1
+ por m1, m3, m6
+.w4_toponly:
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ movifnidn strideq, stridemp
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0
+ sub hd, 4
+ jz .w4_end
+ movq m4, [rsp+8*2]
+ movq m3, [rsp+16*6+8*1]
+ paddw m6, m5, m7 ; xpos += dx
+ psubw m4, m3
+ movq [rsp+8*2], m4
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r11d
+ jge .w4_loop
+; xpos passed the valid top range: remaining rows come from the left edge only.
+ movddup m5, [rsp+8*3]
+.w4_leftonly_loop:
+ movzx r3d, byte [rsp+8*2+0] ; base_y0
+ movq m1, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+2] ; base_y1
+ movhps m1, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+4] ; base_y2
+ movq m2, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+6] ; base_y3
+ movhps m2, [rsp+r3]
+ psubw m4, m3
+ pshufb m1, m12
+ pshufb m2, m12
+ movq [rsp+8*2], m4
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ movifnidn strideq, stridemp
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+; advance to the next 4-column strip; r9d's high bits count remaining strips.
+ sub r9d, 1<<8
+ jl .w4_ret
+ movq m4, [rsp+8*1]
+%if ARCH_X86_64
+ add r8, 4
+ mov dstq, r8
+%else
+ mov dstq, r8
+ add dstq, 4
+%endif
+ paddw m4, [rsp+8*0] ; base_y += 4*dy
+ movzx r3d, word [rsp+16*15+8*1]
+ add r10d, r3d
+ movddup m6, [rsp+16*15+8*1]
+ paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above)
+ jmp .w4_loop0
+.w4_ret:
+ RET
+; --- width 8: set up edges, then reuse the w4 main loop ---------------------
+.w8:
+ test angled, 0x400
+ jnz .w4_main
+ movd m5, [tlq+8]
+ lea r3d, [angleq+126]
+ pshufb m5, m7
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ movd [rsp+16*8+8], m5
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ call .upsample_above
+ sub angled, 53
+ lea r3d, [hq+7]
+ xor angled, 0x7f ; 180 - angle
+ movu m1, [base+z_filter_wh8]
+ movd m0, r3d
+ movd m6, angled
+ shr angled, 8 ; is_sm << 1
+ psrldq m2, [base+z_filter_t_w48+angleq*8], 4
+ pshufb m0, m7
+ pshufb m6, m7
+ pcmpeqb m0, m1
+ pand m6, m0
+ pcmpgtb m6, m2
+%if ARCH_X86_64
+ movq [rsp+16*15+8*1], m10 ; 8<<6
+%else
+ movq m0, m10
+ movq [rsp+16*15+8*1], m0
+%endif
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ mov [rsp], angled
+ sub angled, 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m3, [base+z_filter_wh8]
+ psrldq m4, [base+z_filter_t_w48+angleq*8], 4
+ call .w8_filter_top
+ mov r3d, [rsp]
+ sub r3d, 141
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ cmp r3d, 8
+ jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
+.w8_filter_left:
+; m6 holds per-lane filter-strength votes for the left edge; a zero mask
+; means no filtering.
+ pmovmskb r5d, m6
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x55555555
+ mov r3, tlq
+ shr r5d, 30
+ sub r5, 3 ; filter_strength-3
+ jmp .filter_left
+; ---------------------------------------------------------------------------
+; .w8_filter_top: smooth the top edge on the stack (3- or 5-tap from
+; z_filter_k) and also compute the left-edge strength mask in m6. Restores
+; unfiltered pixels beyond maxw. Shared by the w4 and w8 paths.
+; ---------------------------------------------------------------------------
+.w8_filter_top:
+ movd m6, r3d
+ REPX {pshufb x, m7}, m0, m1, m6
+ pcmpeqb m0, m3
+ pand m1, m0
+ pand m6, m0
+ pcmpgtb m1, m4
+ pcmpgtb m6, m4
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_filter_top_end ; filter_strength == 0
+ imul r5d, 0x55555555
+ movq m0, [rsp+gprsize+16*8-2]
+ shr r5d, 30
+ movq m1, [rsp+gprsize+16*8-1]
+ sub r5, 3 ; filter_strength-3
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ punpcklbw m0, m1
+ pmaddubsw m0, m7
+ movq m1, [rsp+gprsize+16*8+0]
+ movq m2, [rsp+gprsize+16*8+1]
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ punpcklbw m1, m2
+ pmaddubsw m1, m7
+ movq m2, [rsp+gprsize+16*8+2]
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*2]
+ punpcklbw m2, m2
+ pmaddubsw m2, m7
+ paddw m0, m1
+ paddw m0, m2
+%if ARCH_X86_64
+ mov r3d, r7m ; maxw, offset due to call
+%else
+ mov r3d, [rsp+gprsize+16*18+4*3]
+%endif
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [rsp+gprsize+16*8], m0
+ cmp r3d, 8
+ jge .w8_filter_top_end
+ movq m0, [tlq+r3+1]
+ movq [rsp+gprsize+r3+16*8], m0
+.w8_filter_top_end:
+ ret
+; --- width 16: filter both edges in place, then reuse the w4 main loop -----
+.w16:
+ test angled, 0x400
+ jnz .w4_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movd m6, r3d
+ REPX {pshufb x, m7}, m0, m1, m6
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m0, [base+z_filter_wh16]
+ pand m1, m0
+ pand m6, m0
+ pcmpgtb m1, m3
+ pcmpgtb m6, m3
+ pmovmskb r5d, m1
+ mov r3, tlq
+ test r5d, r5d
+ jz .w16_filter_left ; filter_strength == 0
+ imul r5d, 0x24924924
+ pshufb m5, [base+z_filter_t_w16] ; tlq[16]
+ shr r5d, 30
+ adc r5, -4 ; filter_strength-3
+ movd [rsp+16*9], m5
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ movu m1, [rsp+16*8-2]
+ movu m2, [rsp+16*8-1]
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m7
+ punpckhbw m1, m2
+ pmaddubsw m1, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ mova m3, [rsp+16*8+0]
+ movu m4, [rsp+16*8+1]
+ punpcklbw m2, m3, m4
+ pmaddubsw m2, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ test r5d, r5d
+ jnz .w16_filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8]
+ movu m3, [rsp+16*8+2]
+ punpcklbw m2, m3, m3
+ pmaddubsw m2, m7
+ punpckhbw m3, m3
+ pmaddubsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+.w16_filter_end:
+ mov r2d, maxwm
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*8], m0
+ cmp r2d, 16
+ jge .w16_filter_left
+; restore unfiltered top pixels beyond maxw.
+ movu m0, [r3+r2+1]
+ movu [rsp+r2+16*8], m0
+.w16_filter_left:
+ pmovmskb r5d, m6
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x24924924
+ shr r5d, 30
+ adc r5, -4 ; filter_strength-3
+ jmp .filter_left
+; --- width 32: always filter (strength 3) via ipred_z1's .filter_edge ------
+.w32:
+ test angled, 0x400
+ jnz .w4_main
+ pshufb m6, [base+z_filter_t_w16] ; tlq[32]
+ mov r3, tlq
+ lea tlq, [rsp+16*9]
+ movd [tlq+16*1], m6
+ xor r5d, r5d ; filter_strength = 3
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mov r2d, maxwm
+ mova [rsp+16*8], m0
+ mova [rsp+16*9], m1
+ cmp r2d, 32
+ jge .filter_left
+ movu m0, [r3+r2+16*0+1]
+ movu m1, [r3+r2+16*1+1]
+ movu [rsp+r2+16*8], m0
+ movu [rsp+r2+16*9], m1
+ jmp .filter_left
+; --- width 64: copy 64 top pixels, filter in two passes --------------------
+.w64:
+ movu m0, [tlq+16*2+1]
+ movu m1, [tlq+16*3+1]
+ mova [rsp+16*10], m0
+ mova [rsp+16*11], m1
+ test angled, 0x400
+ jnz .w4_main
+ pshufb m1, [base+z_filter_t_w16] ; tlq[64]
+ mov r3, tlq
+ lea tlq, [rsp+16*11]
+ movd [tlq+16*1], m1
+ xor r5d, r5d ; filter_strength = 3
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mova m2, [tlq+16*2]
+ mova m3, [tlq+16*3]
+ mov r2d, maxwm
+ mova [rsp+16* 8], m0
+ mova [rsp+16* 9], m1
+ mova [rsp+16*10], m2
+ mova [rsp+16*11], m3
+ cmp r2d, 64
+ jge .filter_left
+ movu m0, [r3+r2+16*0+1]
+ movu m1, [r3+r2+16*1+1]
+ movu [rsp+r2+16* 8], m0
+ movu [rsp+r2+16* 9], m1
+ cmp r2d, 32
+ jge .filter_left
+ movu m0, [r3+r2+16*2+1]
+ movu m1, [r3+r2+16*3+1]
+ movu [rsp+r2+16*10], m0
+ movu [rsp+r2+16*11], m1
+; --- shared left-edge filtering; restores unfiltered pixels beyond maxh ----
+.filter_left:
+ neg hq
+ movd m0, [r3+hq]
+ pxor m1, m1
+ pshufb m0, m1
+ movd [rsp+16*6+hq-4], m0
+ lea tlq, [rsp+16*5]
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ cmp hd, -32
+ jge .filter_left_end
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mova [rsp+16*2], m0
+ mova [rsp+16*3], m1
+.filter_left_end:
+ mov r2d, maxhm
+ mova m0, [rsp+16*5]
+ mova m1, [rsp+16*6]
+ mova m2, [rsp+16*7]
+ neg r2
+ mova [rsp+16*4], m0
+ mova [rsp+16*5], m1
+ mova [rsp+16*6], m2
+ cmp r2d, hd
+ jle .w4_main
+ movu m0, [r3+r2-16*2]
+ movu m1, [r3+r2-16*1]
+ movu [rsp+r2+16*4], m0
+ movu [rsp+r2+16*5], m1
+ cmp r2d, -32
+ jle .w4_main
+ movu m0, [r3+r2-16*4]
+ movu m1, [r3+r2-16*3]
+ movu [rsp+r2+16*2], m0
+ movu [rsp+r2+16*3], m1
+ jmp .w4_main
+
+%if ARCH_X86_64
+cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
+ %define base r7-$$
+ lea r7, [$$]
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+ mov org_wd, wd
+%else
+cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define org_wd r5
+ %define org_wq r5
+ mov [dstq+strideq*0], strideq
+ mov [dstq+strideq*1], wd
+ LEA r1, $$
+%endif
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ dec tlq
+ movsxd hq, [base+ipred_z3_ssse3_table+hq*4]
+ sub angled, 180
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ lea hq, [base+ipred_z3_ssse3_table+hq]
+ movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
+ jmp hq
+.h4:
+ lea r4d, [angleq+88]
+ test r4d, 0x480
+ jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r4d, 9
+ add r4d, wd
+ cmp r4d, 8
+ jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
+ movu m3, [tlq-7]
+ movu m1, [base+z_upsample1-4]
+ movu m4, [base+z_filter_s+2]
+ pshufb m0, m3, m1
+ pxor m1, m1
+ pshufb m2, m3, m1
+ pshufb m1, m3, m4
+ mova [rsp+16], m2 ; top[max_base_y]
+ movddup m2, [base+pb_36_m4]
+ add dyd, dyd
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ movd m5, dyd
+ mov r5d, dyd
+ pshufb m5, [base+pw_256]
+ paddw m0, m1
+ pmulhrsw m0, m10
+ shl wd, 2
+ mov tlq, rsp
+ sub rsp, wq
+ packuswb m0, m0
+ punpcklbw m0, m3
+ paddw m6, m5, m5
+ punpcklqdq m5, m6
+ pshufb m0, [base+pb_15to0]
+ mova [tlq], m0
+.h4_upsample_loop:
+ lea r4d, [r5+dyq]
+ shr r5d, 6
+ movq m0, [tlq+r5]
+ lea r5d, [r4+dyq]
+ shr r4d, 6
+ movhps m0, [tlq+r4]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ por m1, m2
+ pmaddubsw m0, m1
+ paddw m5, m6
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jg .h4_upsample_loop
+ jmp .h4_transpose
+.h4_no_upsample:
+ mov r4d, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea r4d, [wq+3]
+ movd m0, r4d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r4d, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ movu m2, [tlq-7]
+ imul r5d, 0x55555555
+ movu m3, [base+z_filter_s-2]
+ shr r5d, 30 ; filter_strength
+ mova m4, [base+z_upsample2]
+ movddup m5, [base+z_filter_k-8+r5*8+24*0]
+ movddup m6, [base+z_filter_k-8+r5*8+24*1]
+ movddup m7, [base+z_filter_k-8+r5*8+24*2]
+ pshufb m0, m2, m3
+ shufps m3, m4, q2121
+ pmaddubsw m1, m0, m5
+ pmaddubsw m0, m6
+ pshufb m5, m2, m3
+ pmaddubsw m3, m5, m6
+ pmaddubsw m5, m7
+ pshufb m2, m4
+ pmaddubsw m2, m7
+ paddw m0, m1
+ paddw m1, m3
+ paddw m0, m5
+ paddw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ lea r2d, [r4+2]
+ cmp wd, 4
+ cmovne r4d, r2d
+ pshufd m0, m0, q0000
+ lea tlq, [rsp+15]
+ packuswb m0, m1
+ mova [rsp], m0
+.h4_main:
+ movd m5, dyd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ sub tlq, r4
+ shl r4d, 6
+ movd m7, [tlq]
+ movd m4, r4d
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, [base+pw_m256]
+ mova m3, [base+z3_shuf_h4]
+ lea r5, [dyq+r4+63] ; ypos
+ pshufb m4, [base+pw_256]
+ psubw m4, m0 ; max_base_y
+ shl wd, 2
+ paddw m6, m5, m5
+ sub rsp, wq
+ punpcklqdq m5, m6
+.h4_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movq m0, [tlq+r5-4]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movhps m0, [tlq+r4-4]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jz .h4_transpose
+ test r5d, r5d
+ jg .h4_loop
+ packuswb m7, m7
+.h4_end_loop:
+ movq [rsp+wq-8], m7
+ sub wd, 8
+ jg .h4_end_loop
+.h4_transpose:
+ mova m1, [base+z_transpose4]
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ lea r2, [strideq*3]
+ lea dstq, [dstq+org_wq-4]
+.h4_transpose_loop:
+ mova m0, [rsp]
+ add rsp, 16
+ pshufb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m2, m0, q1032
+ movd [dstq+strideq*1], m2
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r2 ], m0
+ sub dstq, 4
+ sub org_wd, 4
+ jg .h4_transpose_loop
+ RET
+.h8:
+ lea r4d, [angleq+88]
+ and r4d, ~0x7f
+ or r4d, wd
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m4, [tlq-15]
+ and r4d, 4
+ movu m3, [tlq- 9]
+ movd m1, r4d
+ movu m2, [base+z_filter_s+2]
+ pxor m0, m0
+ movu m5, [base+z_filter_s+6]
+ movddup m7, [base+pb_36_m4]
+ pshufb m1, m0 ; w & 4
+ movu m0, [base+z_upsample1-4]
+ pmaxub m1, m0 ; clip 4x8
+ add dyd, dyd
+ pshufb m0, m4, m1
+ pmaddubsw m0, m7
+ pshufb m1, m4, m2
+ pmaddubsw m1, m7
+ pshufb m2, m3, [base+z_upsample1]
+ pmaddubsw m2, m7
+ pshufb m3, m5
+ pmaddubsw m3, m7
+ movd m5, dyd
+ neg dyq
+ paddw m1, m0
+ paddw m2, m3
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ shl wd, 3
+ lea tlq, [rsp+16]
+ pshufb m5, [base+pw_256]
+ sub rsp, wq
+ packuswb m1, m2
+ lea r5, [dyq+63]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ mova [tlq-16*1], m0
+ mova [tlq-16*0], m1
+ paddw m6, m5, m5
+ punpcklqdq m5, m6
+.h8_upsample_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movu m0, [tlq+r5]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movu m1, [tlq+r4]
+ pand m3, m8, m5
+ psubw m2, m9, m3
+ psllw m2, 8
+ por m3, m2
+ pshufd m2, m3, q1010
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m1, m0
+ mova [rsp+wq-16], m1
+ sub wd, 16
+ jg .h8_upsample_loop
+ jmp .h8_transpose
+.h8_no_upsample:
+ lea r4d, [wq+7]
+ movd m0, r4d
+ and r4d, 7
+ or r4d, 8 ; imin(w+7, 15)
+ test angled, 0x400
+ jnz .h8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h8_main ; filter_strength == 0
+ mova m0, [tlq-15]
+ imul r5d, 0x55555555
+ movd m1, [tlq+1]
+ neg r4
+ movd m2, [tlq+r4]
+ shr r5d, 30
+ pxor m7, m7
+ lea tlq, [rsp+16*2]
+ sub r5, 3 ; filter_strength-3
+ mova [tlq+16*0], m0
+ pshufb m1, m7
+ mova [tlq+16*1], m1
+ pshufb m2, m7
+ movq [tlq+r4+8], m2
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sar r5d, 1
+ add tlq, 31
+ add r5d, 17
+ cmp wd, 8
+ cmova r4d, r5d
+.h8_main:
+ movd m5, dyd
+ sub tlq, r4
+ shl r4d, 6
+ movd m7, [tlq]
+ movd m4, r4d
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, [base+pw_m256]
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z3_base_inc]
+ shl wd, 3
+ mova m6, m5
+ sub rsp, wq
+.h8_loop:
+ mov r4, r5
+ sar r4, 6
+ movu m0, [tlq+r4-8]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jz .h8_transpose
+ add r5, dyq
+ jg .h8_loop
+ packuswb m7, m7
+.h8_end_loop:
+ movq [rsp+wq-8], m7
+ sub wd, 8
+ jg .h8_end_loop
+.h8_transpose:
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ or r3d, 8
+ cmp org_wd, 4
+%if ARCH_X86_64
+ jne .end_transpose_main
+%else
+ jne .end_transpose_loop
+%endif
+ mova m1, [rsp+16*1]
+ mova m0, [rsp+16*0]
+ lea r2, [strideq*3]
+ add rsp, 16*2
+ punpcklbw m2, m1, m0
+ punpckhbw m1, m0
+ punpckhbw m0, m1, m2
+ punpcklbw m1, m2
+.write_4x8_end:
+ call .write_4x8
+ RET
+.write_4x8:
+ movd [dstq+r2 ], m0
+ pshuflw m4, m0, q1032
+ movd [dstq+strideq*2], m4
+ punpckhqdq m0, m0
+ movd [dstq+strideq*1], m0
+ psrlq m0, 32
+ movd [dstq+strideq*0], m0
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+r2 ], m1
+ pshuflw m4, m1, q1032
+ movd [dstq+strideq*2], m4
+ punpckhqdq m1, m1
+ movd [dstq+strideq*1], m1
+ psrlq m1, 32
+ movd [dstq+strideq*0], m1
+ ret
+.h16:
+ lea r4d, [wq+15]
+ movd m0, r4d
+ and r4d, 15
+ or r4d, 16 ; imin(w+15, 31)
+ test angled, 0x400
+ jnz .h16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m1, m0, [base+z_filter_wh16]
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ mova m0, [tlq-16*2+1]
+ imul r5d, 0x24924924
+ mova m1, [tlq-16*1+1]
+ neg r4
+ movd m2, [tlq-16*0+1]
+ shr r5d, 30
+ movd m3, [tlq+r4]
+ adc r5, -4 ; filter_strength-3
+ pxor m7, m7
+ lea tlq, [rsp+16*2]
+ mova [tlq-16*1], m0
+ pshufb m2, m7
+ mova [tlq+16*0], m1
+ pshufb m3, m7
+ mova [tlq+16*1], m2
+ movq [tlq+r4+8], m3
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ add tlq, 31
+ cmp wd, 16
+ jle .h16_main
+ pshuflw m0, [tlq-47], q0000
+ sar r5, 1
+ movq m1, [base+z3_filter_k_tail+r5*4]
+ lea r4d, [r5+33]
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq-35], m0
+.h16_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ shl wd, 4
+ mova m6, m5
+ sub rsp, wq
+.h16_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*2]
+ por m2, m1
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ psrlw m2, m5, 6
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+wq-16], m0
+ sub wd, 16
+ jz .h16_transpose
+ add r5, dyq
+ jg .h16_loop
+.h16_end_loop:
+ mova [rsp+wq-16], m7
+ sub wd, 16
+ jg .h16_end_loop
+.h16_transpose:
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ or r3d, 16
+ cmp org_wd, 4
+%if ARCH_X86_64
+ jne .end_transpose_main
+%else
+ jne .end_transpose_loop
+%endif
+.h16_transpose_w4:
+ mova m2, [rsp+16*3]
+ mova m4, [rsp+16*2]
+ mova m3, [rsp+16*1]
+ mova m0, [rsp+16*0]
+ lea r2, [strideq*3]
+ add rsp, 16*4
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ punpckhbw m4, m3, m0
+ punpcklbw m3, m0
+ punpckhwd m0, m1, m4
+ punpcklwd m1, m4
+ call .write_4x8
+ lea dstq, [dstq+strideq*4]
+ punpckhwd m0, m2, m3
+ punpcklwd m1, m2, m3
+ jmp .write_4x8_end
+.h32:
+ lea r4d, [wq+31]
+ and r4d, 31
+ or r4d, 32 ; imin(w+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h32_main
+ mova m0, [tlq-16*4+1]
+ mova m1, [tlq-16*3+1]
+ mova m2, [tlq-16*2+1]
+ mova m3, [tlq-16*1+1]
+ movd m4, [tlq-16*0+1]
+ neg r4
+ movd m5, [tlq+r4]
+ pxor m7, m7
+ lea tlq, [rsp+16*4]
+ mova [tlq-16*3], m0
+ mova [tlq-16*2], m1
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ pshufb m4, m7
+ mova [tlq+16*0], m3
+ pshufb m5, m7
+ mova [tlq+16*1], m4
+ movq [tlq+r4+8], m5
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ add tlq, 63
+ cmp wd, 32
+ jle .h32_main
+ pshuflw m0, [tlq-79], q0000
+ movq m1, [base+z3_filter_k_tail]
+ add r4d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq-67], m0
+.h32_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ mova m6, m5
+.h32_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*4]
+ por m2, m1
+ movu m1, [tlq+r4-8*3]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ sub rsp, 32
+ packuswb m0, m1
+ mova [rsp+16*0], m0
+ movu m0, [tlq+r4-8*2]
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ psrlw m2, m5, 6
+ paddw m5, m6
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ mova [rsp+16*1], m0
+ pand m0, m1, [rsp+16*0]
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+16*0], m0
+ dec wd
+ jz .h32_transpose
+ add r5, dyq
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 32
+ mova [rsp+16*1], m7
+ mova [rsp+16*0], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ or r3d, 32
+ jmp .end_transpose_main
+.h64:
+ lea r4d, [wq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h64_main
+ mova m0, [tlq-16*8+1]
+ mova m1, [tlq-16*7+1]
+ mova m2, [tlq-16*6+1]
+ mova m3, [tlq-16*5+1]
+ mova [rsp+16*1], m0
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m2
+ mova [rsp+16*4], m3
+ mova m0, [tlq-16*4+1]
+ mova m1, [tlq-16*3+1]
+ mova m2, [tlq-16*2+1]
+ mova m3, [tlq-16*1+1]
+ movd m4, [tlq-16*0+1]
+ neg r4
+ movd m5, [tlq+r4]
+ pxor m7, m7
+ lea tlq, [rsp+16*8]
+ mova [tlq-16*3], m0
+ mova [tlq-16*2], m1
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ pshufb m4, m7
+ mova [tlq+16*0], m3
+ pshufb m5, m7
+ mova [tlq+16*1], m4
+ movq [tlq+r4+8], m5
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ cmp wd, 64
+ jl .h64_filter96 ; skip one call if the last 32 bytes aren't used
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+.h64_filter96:
+ add tlq, 127
+.h64_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ mova m6, m5
+.h64_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*8]
+ por m2, m1
+ movu m1, [tlq+r4-8*7]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ sub rsp, 64
+ packuswb m0, m1
+ mova [rsp+16*0], m0
+ movu m0, [tlq+r4-8*6]
+ movu m1, [tlq+r4-8*5]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*1], m0
+ movu m0, [tlq+r4-8*4]
+ movu m1, [tlq+r4-8*3]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*2], m0
+ movu m0, [tlq+r4-8*2]
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ psrlw m2, m5, 6
+ paddw m5, m6
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ mova [rsp+16*3], m0
+ pand m0, m1, [rsp+16*2]
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ mova [rsp+16*2], m0
+ pand m0, m1, [rsp+16*1]
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ mova [rsp+16*1], m0
+ pand m0, m1, [rsp+16*0]
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+16*0], m0
+ dec wd
+ jz .h64_transpose
+ add r5, dyq
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 64
+ mova [rsp+16*3], m7
+ mova [rsp+16*2], m7
+ mova [rsp+16*1], m7
+ mova [rsp+16*0], m7
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ or r3d, 64
+.end_transpose_main:
+%if ARCH_X86_64
+ lea r5, [r3*3]
+ lea r7, [strideq*3]
+%else
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+.end_transpose_loop:
+ lea r4, [rsp+r3-8]
+ lea r6, [dstq+org_wq-8]
+.end_transpose_loop_y:
+ movq m0, [r4+r3*1]
+ movq m4, [r4+r3*0]
+%if ARCH_X86_64
+ movq m1, [r4+r5 ]
+ movq m5, [r4+r3*2]
+ lea r2, [r4+r3*4]
+%else
+ lea r2, [r4+r3*2]
+ movq m1, [r2+r3*1]
+ movq m5, [r2+r3*0]
+ lea r2, [r2+r3*2]
+%endif
+ movq m2, [r2+r3*1]
+ movq m6, [r2+r3*0]
+%if ARCH_X86_64
+ movq m3, [r2+r5 ]
+ movq m7, [r2+r3*2]
+%else
+ lea r2, [r2+r3*2]
+ movq m3, [r2+r3*1]
+ movq m7, [r2+r3*0]
+%endif
+ sub r4, 8
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ punpcklbw m2, m6
+ punpcklbw m3, m7
+ punpckhwd m4, m1, m0
+ punpcklwd m1, m0
+ punpckhwd m0, m3, m2
+ punpcklwd m3, m2
+ punpckhdq m2, m3, m1
+ punpckldq m3, m1
+ punpckldq m1, m0, m4
+ punpckhdq m0, m4
+ movhps [r6+strideq*0], m0
+ movq [r6+strideq*1], m0
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m1
+ movq [r6+r7 ], m1
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m1
+ movq [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+%endif
+ movhps [r6+strideq*0], m2
+ movq [r6+strideq*1], m2
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m3
+ movq [r6+r7 ], m3
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m3
+ movq [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+%endif
+ cmp r4, rsp
+ jae .end_transpose_loop_y
+ lea rsp, [rsp+r3*8]
+ sub org_wd, 8
+ jg .end_transpose_loop
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
+; const uint8_t *idx, const int w, const int h);
+;---------------------------------------------------------------------------------------
+; pal_pred: fill a w x h block with palette entries selected by idx bytes.
+; The 8-entry 16-bit palette is packed to 8 bytes once in m4; every 16
+; output pixels then cost a single pshufb table lookup.
+cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ mova m4, [palq]
+ LEA r2, pal_pred_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4] ; branch offset for this log2(width)
+ packuswb m4, m4 ; pack palette words to bytes -> pshufb LUT
+ add wq, r2
+ lea r2, [strideq*3]
+ jmp wq ; dispatch on width
+.w4: ; 16 indices = 4 rows of 4 pixels per iteration
+ pshufb m0, m4, [idxq]
+ add idxq, 16
+ movd [dstq ], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq ], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r2 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8: ; 32 indices = 4 rows of 8
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ add idxq, 32
+ movq [dstq ], m0
+ movhps [dstq+strideq ], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r2 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16: ; 64 indices = 4 rows of 16
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+strideq ], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r2 ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+ALIGN function_align
+.w32: ; 64 indices = 2 rows of 32
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+ALIGN function_align
+.w64: ; 64 indices = 1 row of 64
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, strideq
+ sub hd, 1
+ jg .w64
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ ; Requires: m0 = dc (words), m1 = alpha (words), m2 = |alpha| << 9.
+ ; Computes m%1 = dc + sign(ac*alpha) * ((|ac| * |alpha| + 32) >> 6):
+ ; pmulhrsw by (|alpha|<<9) is ((|ac| * |alpha| << 9) + 0x4000) >> 15,
+ ; i.e. the rounded 6-bit-shifted product.
+ psignw m3, m%1, m1 ; m3 carries the sign of ac*alpha
+ pabsw m%1, m%1 ; |ac|
+ pmulhrsw m%1, m2 ; (|ac| * |alpha| + 32) >> 6
+ psignw m%1, m3 ; reapply the product's sign
+ paddw m%1, m0 ; add the dc prediction
+%endmacro
+
+; Scratch register for the cfl functions below; it must not alias any
+; argument register, so UNIX64 (6 register args) needs r7 while the other
+; ABIs can use r5.
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+; Whole-block DC + CfL: dc = round((sum(top) + sum(left)) / (w+h)); the
+; .s* loops then write dst = dc + alpha*ac via IPRED_CFL.
+; m3 = all-ones: pmaddubsw by it yields *negated* pairwise byte sums; the
+; later pmaddwd by -1 words flips the total back to positive.
+; Fix vs. original: removed a dead 'lea r2d, [hq*2]' in .w32 whose result
+; was immediately overwritten by 'mov r2d, 0x3334' (lea sets no flags and
+; r2 is not read in between).
+cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ movifnidn wd, wm
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd m4, t0d ; m4 = w+h (halved below -> rounding term)
+ tzcnt t0d, t0d
+ movd m5, t0d ; m5 = ctz(w+h): shift when w+h is a power of two
+ LEA t0, ipred_cfl_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+16]
+ pcmpeqd m3, m3 ; all -1
+ psrlw m4, 1 ; (w+h)/2
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6 ; sum the left edge (.h*) first, then jump to .w*
+.h4:
+ movd m0, [tlq-4]
+ pmaddubsw m0, m3 ; -(sum of left pixels), pairwise
+ jmp wq
+.w4:
+ movd m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m0, m4 ; -(left) - (w+h)/2
+ paddw m0, m1 ; ... - top
+ pmaddwd m0, m3 ; negate & widen: sum + rounding
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
+.w4_mul: ; rectangular: divide via fixed-point reciprocal
+ punpckhqdq m1, m0, m0
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mov r6d, 0x5556 ; ~1/3 in .16 fixed point
+ mov r2d, 0x3334 ; ~1/5 in .16 fixed point
+ test hd, 8
+ cmovz r6d, r2d
+ movd m5, r6d
+ pmulhuw m0, m5
+.w4_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0 ; broadcast dc to all words
+.s4:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9 ; |alpha| << 9 for IPRED_CFL's pmulhrsw
+.s4_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movd [dstq+strideq*0], m4
+ pshuflw m4, m4, q1032
+ movd [dstq+strideq*1], m4
+ punpckhqdq m4, m4
+ movd [dstq+strideq*2], m4
+ psrlq m4, 32
+ movd [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ movq m0, [tlq-8]
+ pmaddubsw m0, m3
+ jmp wq
+.w8:
+ movq m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556 ; rectangular block: multiply by ~1/3 or ~1/5
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop: ; 4 rows of 8 pixels per iteration
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movq [dstq ], m4
+ movhps [dstq+strideq ], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movq [dstq+strideq*2], m4
+ movhps [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-16]
+ pmaddubsw m0, m3
+ jmp wq
+.w16:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop: ; 2 rows of 16 pixels per iteration
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ mova m2, [tlq-16]
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 64|16
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop: ; 1 row of 32 pixels per iteration
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq+16], m4
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+; DC from the left column only: dc = round(sum(left[0..h-1]) / h), then
+; tail-jump into ipred_cfl's shared .s* splat/apply code (via the cfl
+; splat table). The .h* labels are also entered from dc_top with the sums
+; sized by w instead of h.
+cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq ; tlq -> first left pixel
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov t0d, 0x8000
+ movd m3, t0d
+ movd m2, r6d
+ psrld m3, m2 ; m3 = 0x8000 >> log2(h): pmulhrsw by it divides by h
+ LEA t0, ipred_cfl_left_ssse3_table
+ movsxd r6, [t0+r6*4]
+ pcmpeqd m2, m2 ; all -1: pmaddubsw by it = negated pairwise byte sums
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h16:
+ pshufd m1, m0, q3232 ; psrlq m1, m0, 16
+ paddw m0, m1
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m2 ; * -1 word pairs: back to a positive total
+ pmulhrsw m0, m3 ; rounded division by h
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0 ; broadcast dc to all words
+ jmp wq ; into ipred_cfl's splat code for this width
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+; DC from the top row only: dc = round(sum(top[0..w-1]) / w). Reuses the
+; cfl_left summing code (.h* labels, here indexed by log2(w)) and the
+; shared splat code for the output.
+cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ LEA t0, ipred_cfl_left_ssse3_table
+ tzcnt wd, wm
+ inc tlq ; skip the top-left pixel
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ movd m3, r6d
+ movd m2, wd
+ psrld m3, m2 ; m3 = 0x8000 >> log2(w): pmulhrsw by it divides by w
+ movsxd r6, [t0+wq*4] ; summing branch sized by w, not h
+ pcmpeqd m2, m2 ; all -1 (negated pairwise sums, see cfl_left)
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+; CfL with a fixed dc of 128 (half the 8bpc range): load dc from pw_128
+; and tail-jump straight into the shared .s* splat/apply code.
+cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ tzcnt wd, wm
+ movifnidn hd, hm
+ LEA r6, ipred_cfl_splat_ssse3_table
+ movsxd wq, [r6+wq*4]
+ movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] ; dc = 128 words
+ add wq, r6
+ movifnidn acq, acmp
+ jmp wq ; into ipred_cfl's splat code for this width
+
+%macro RELOAD_ACQ_32 1
+ ; Restore the saved ac pointer; on x86-32 ac_bakq may be %defined as a
+ ; memory operand (stack slot / argument), which this mov also handles.
+ mov acq, ac_bakq ; restore acq
+%endmacro
+
+; CfL AC for 4:2:0: each output word is a 2x2 luma sum scaled by 2
+; (pmaddubsw by pb_2 horizontally, paddw of the two rows vertically).
+; wpad/hpad replicate the last valid column/row; finally the rounded
+; average of the whole block is subtracted so ac[] is zero-mean.
+; Fix vs. original: dropped a redundant 'mova m6, m0' in .w16_pad3 --
+; x86inc's three-operand emulation of the following
+; 'punpckhqdq m6, m0, m0' already emits that copy (dst != src1).
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+DECLARE_REG_TMP 7
+ movddup m2, [pb_2]
+%else
+cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+DECLARE_REG_TMP 4
+%define ac_bakq acmp
+ mov t0d, 0x02020202
+ movd m2, t0d
+ pshufd m2, m2, q0000 ; m2 = 16x 0x02 (no rodata load on x86-32)
+%endif
+ movifnidn wd, wm
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m5, t0d ; m5 = w*h (element count for the average)
+ movifnidn hpadd, hpadm
+%if ARCH_X86_64
+ mov ac_bakq, acq
+%endif
+ shl hpadd, 2
+ sub hd, hpadd ; h = rows actually read from y
+ pxor m4, m4 ; m4 = running sum
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop: ; two output rows (2x4 words) per iteration
+ movq m0, [yq]
+ movq m1, [yq+strideq]
+ movhps m0, [yq+strideq*2]
+ movhps m1, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1 ; 2x2 sums * 2
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+ punpckhqdq m0, m0 ; replicate the last output row
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+ jmp .calc_avg_4_8
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ mova m0, [yq+strideq*2]
+ mova m1, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+ jmp .w8_hpad
+.w8_wpad: ; wpadd=1
+ movddup m0, [yq]
+ movddup m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufhw m0, m0, q3333 ; replicate the last valid column
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 16
+ sub hd, 1
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+.w8_hpad:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 1
+ jg .w8_hpad
+ jmp .calc_avg_4_8
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ mova m6, [yq+16]
+ mova m1, [yq+strideq+16]
+ pmaddubsw m6, m2
+ pmaddubsw m1, m2
+ paddw m6, m1
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3: ; only 4 valid output columns
+ movddup m0, [yq]
+ movddup m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufhw m0, m0, q3333
+ mova [acq], m0
+ paddw m4, m0
+ punpckhqdq m6, m0, m0 ; m6 = last padded word replicated
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2: ; 8 valid output columns
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ pshufhw m6, m0, q3333
+ punpckhqdq m6, m6
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1: ; 12 valid output columns
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ movddup m6, [yq+16]
+ movddup m1, [yq+strideq+16]
+ pmaddubsw m6, m2
+ pmaddubsw m1, m2
+ paddw m6, m1
+ pshufhw m6, m6, q3333
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg16
+.w16_hpad_loop: ; replicate the last row (m0:m6) downwards
+ mova [acq], m0
+ paddw m4, m0
+ mova [acq+16], m6
+ paddw m4, m6
+ add acq, 32
+ dec hpadd
+ jg .w16_hpad_loop
+ jmp .calc_avg16
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4_8:
+ psrlw m2, 9 ; 0x0202 >> 9 = 1 per word
+ pmaddwd m4, m2 ; widen the word sums to dwords
+ jmp .calc_avg
+.calc_avg16:
+ psrld m0, m4, 16 ; split even/odd words into dwords to
+ pslld m4, 16 ; avoid 16-bit overflow on large blocks
+ psrld m4, 16
+ paddd m4, m0
+.calc_avg:
+ movd szd, m5
+ psrad m5, 1 ; sz/2: rounding term for the average
+ tzcnt r1d, szd ; log2sz
+ paddd m4, m5
+ movd m1, r1d
+ pshufd m0, m4, q2301
+ paddd m0, m4
+ pshufd m4, m0, q1032
+ paddd m0, m4 ; horizontal dword sum
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0 ; broadcast the average as words
+ RELOAD_ACQ_32 acq
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0 ; ac[x] -= sum;
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+; CfL AC for 4:2:2: each output word is a horizontal luma pair scaled by 4
+; (pmaddubsw by pb_4); no vertical averaging. Two accumulators (m4/m5)
+; spread the running sum; wpad/hpad replicate the last valid column/row,
+; then the rounded block average is subtracted so ac[] is zero-mean.
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+ movddup m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+ mov t0d, 0x04040404
+ movd m2, t0d
+ pshufd m2, m2, q0000 ; m2 = 16x 0x04 (no rodata load on x86-32)
+%endif
+ movifnidn wd, wm
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m6, t0d ; m6 = w*h (element count for the average)
+ movifnidn hpadd, hpadm
+%if ARCH_X86_64
+ mov ac_bakq, acq
+%endif
+ shl hpadd, 2
+ sub hd, hpadd ; h = rows actually read from y
+ pxor m4, m4 ; two running-sum accumulators
+ pxor m5, m5
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop: ; four output rows (2x8 words) per iteration
+ movq m1, [yq]
+ movhps m1, [yq+strideq]
+ movq m0, [yq+strideq*2]
+ movhps m0, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4
+ punpckhqdq m0, m0 ; replicate the last output row
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+ jmp .calc_avg_4
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m0
+ paddw m5, m1
+ mova m1, [yq+strideq*2]
+ mova m0, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w8_hpad
+.w8_wpad:
+ movddup m1, [yq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333 ; replicate the last valid column
+ mova [acq], m1
+ paddw m5, m1
+ movddup m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w8_hpad:
+ mova [acq], m0
+ paddw m4, m0
+ mova [acq+16], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad
+ jmp .calc_avg_8_16
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m1, [yq]
+ mova m0, [yq+16]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m0
+ paddw m5, m1
+ mova m1, [yq+strideq]
+ mova m0, [yq+strideq+16]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m0
+ paddw m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3: ; only 4 valid output columns
+ movddup m1, [yq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movddup m1, [yq+strideq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhqdq m0, m1, m1
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2: ; 8 valid output columns
+ mova m1, [yq]
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ mova m1, [yq+strideq]
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ mova m0, m1
+ pshufhw m0, m0, q3333
+ punpckhqdq m0, m0
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1: ; 12 valid output columns
+ mova m1, [yq]
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ movddup m0, [yq+16]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m1, [yq+strideq]
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ movddup m0, [yq+strideq+16]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w16_hpad_loop: ; replicate the last row pair (m1:m0) downwards
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m1
+ paddw m5, m0
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m1
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ jmp .calc_avg_8_16
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4:
+ psrlw m2, 10 ; 0x0404 >> 10 = 1 per word
+ pmaddwd m5, m2 ; widen the word sums to dwords
+ pmaddwd m0, m4, m2
+ jmp .calc_avg
+.calc_avg_8_16:
+ mova m0, m5
+ psrld m5, 16 ; split even/odd words into dwords to
+ pslld m0, 16 ; avoid 16-bit overflow on large blocks
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, m4
+ psrld m0, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m0, m4
+.calc_avg:
+ paddd m5, m0 ; merge the two accumulators
+ movd szd, m6
+ psrad m6, 1 ; sz/2: rounding term for the average
+ tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
+ paddd m5, m6
+ movd m1, r1d
+ pshufd m0, m5, q2301
+ paddd m0, m5
+ pshufd m5, m0, q1032
+ paddd m0, m5 ; horizontal dword sum
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0 ; broadcast the average as words
+ RELOAD_ACQ_32 acq ; ac = ac_orig
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0 ; ac[x] -= average
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+; ipred_cfl_ac_444_8bpc(ac, y, stride, wpad, hpad, w, h)
+; Fills the CFL "ac" buffer from a 4:4:4 luma plane: each source byte is
+; duplicated (punpcklbw x,x) and pmaddubsw'd against a vector of 4s, i.e.
+; each output word is 8*luma.  Running per-lane word sums are kept in
+; m4/m5 (four stack accumulators for w32).  wpad/hpad request right/bottom
+; padding (hpad in units of 4 rows, see `shl hpadd, 2`); padded area
+; replicates the last valid column/row.  Finally the rounded average is
+; subtracted from every entry (.sub_loop).
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
+ movddup m2, [pb_4]              ; m2 = 16 x 0x04 (pmaddubsw multiplier)
+%else
+cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
+%define ac_bakq [rsp+16*4]
+ mov t0d, 0x04040404             ; build the same vector of 4s without rodata
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ movifnidn hpadd, hpadm
+ movd m0, hpadd                  ; stash hpad in a vector reg while t0 is busy
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd                    ; t0 = w*h = number of output samples
+ movd m6, t0d                    ; m6 lane 0 = w*h (sz, reloaded in .calc_avg)
+ movd hpadd, m0
+ mov ac_bakq, acq                ; remember ac start for the final subtract pass
+ shl hpadd, 2                    ; hpad is given in 4-row units
+ sub hd, hpadd                   ; h = number of real (unpadded) rows
+ pxor m5, m5                     ; m5/m4 = word-sum accumulators
+ pxor m4, m4
+ cmp wd, 16
+ jg .w32
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:                             ; w == 4: two rows per 16-byte store
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movd m1, [yq]
+ movd m3, [yq+strideq]
+ punpckldq m1, m3                ; rows 0|1 in the low 8 bytes
+ punpcklbw m1, m1                ; duplicate each byte for pmaddubsw
+ movd m0, [yq+strideq*2]
+ movd m3, [yq+stride3q]
+ punpckldq m0, m3                ; rows 2|3
+ punpcklbw m0, m0
+ pmaddubsw m1, m2                ; words = 8*luma
+ pmaddubsw m0, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4
+ punpckhqdq m0, m0               ; broadcast last real row pair for padding
+.w4_hpad_loop:                   ; bottom padding: replicate last output rows
+ mova [acq], m0
+ paddw m5, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+.calc_avg_4:
+ psrlw m2, 10                    ; 0x0404 >> 10 == 1 -> m2 = vector of word 1s
+ pmaddwd m5, m2                  ; horizontal add adjacent words into dwords
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:                        ; 4 full rows per iteration
+ movq m1, [yq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ movq m0, [yq+strideq]
+ punpcklbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ movq m1, [yq+strideq*2]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ movq m0, [yq+stride3q]
+ punpcklbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w8_hpad
+.w8_wpad:                        ; only 4 valid pixels: replicate px3 rightwards
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333           ; high qword = last valid word replicated
+ mova [acq], m1
+ paddw m5, m1
+ movd m0, [yq+strideq]
+ punpcklbw m0, m0
+ punpcklqdq m0, m0
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w8_hpad:                        ; replicate last row (m0) downwards
+ mova [acq], m0
+ paddw m5, m0
+ mova [acq+16], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad
+ jmp .calc_avg_8_16
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:                       ; 2 full rows, 2 stores each
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m0, [yq+strideq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w16_hpad_loop
+.w16_wpad:                       ; wpad in {1,2,3} -> 12/8/4 valid pixels
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:                       ; 4 valid pixels per row
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhqdq m1, m1               ; second store = replicated last word only
+ mova [acq+16], m1
+ paddw m5, m1
+ movd m1, [yq+strideq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhqdq m0, m1, m1
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:                       ; 8 valid pixels per row
+ movq m1, [yq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ pshufhw m1, m1, q3333           ; build replicated half for the padded store
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movq m1, [yq+strideq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ mova m0, m1
+ pshufhw m0, m0, q3333
+ punpckhqdq m0, m0
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:                       ; 12 valid pixels per row
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333           ; replicate pixel 11 into lanes 12..15
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m0, [yq+strideq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w16_hpad_loop:                  ; replicate last row (m1|m0) downwards
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m1
+ paddw m5, m0
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m1
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+.calc_avg_8_16:                  ; widen m5/m4 word sums to dwords and combine
+ mova m0, m5
+ psrld m5, 16                    ; odd words of m5
+ pslld m0, 16
+ psrld m0, 16                    ; even words of m5, zero-extended
+ paddd m5, m0
+ mova m0, m4
+ psrld m0, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m0, m4
+ paddd m5, m0                    ; m5 = per-lane dword totals
+ jmp .calc_avg
+
+.w32:                            ; sums kept on the stack (4 x 16-byte accumulators)
+ pxor m0, m0
+ mova [rsp ], m0
+ mova [rsp+16], m0
+ mova [rsp+32], m0
+ mova [rsp+48], m0
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:                       ; one full 32-px row per iteration
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m4, [yq+16]
+ mova m3, m4
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ punpckhbw m4, m4
+ pmaddubsw m4, m2
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_loop
+ test hpadd, hpadd
+ jz .calc_avg_32
+ jmp .w32_hpad_loop
+.w32_wpad:                       ; wpad in {1..7} -> 28..4 valid pixels
+ cmp wpadd, 2
+ jl .w32_pad1
+ je .w32_pad2
+ cmp wpadd, 4
+ jl .w32_pad3
+ je .w32_pad4
+ cmp wpadd, 6
+ jl .w32_pad5
+ je .w32_pad6
+.w32_pad7:                       ; 4 valid pixels
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ mova m0, m1
+ punpckhqdq m0, m0               ; remaining 3 stores all replicate last word
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad7
+ jmp .w32_wpad_done
+.w32_pad6:                       ; 8 valid pixels
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ pshufhw m0, m1, q3333
+ punpckhqdq m0, m0
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad6
+ jmp .w32_wpad_done
+.w32_pad5:                       ; 12 valid pixels
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova m5, [rsp]
+ paddw m5, m1
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ punpckhqdq m3, m3
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad5
+ jmp .w32_wpad_done
+.w32_pad4:                       ; 16 valid pixels
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ pshufhw m3, m3, q3333
+ punpckhqdq m3, m3
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad4
+ jmp .w32_wpad_done
+.w32_pad3:                       ; 20 valid pixels
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ movd m3, [yq+16]
+ punpcklbw m3, m3
+ punpcklqdq m3, m3
+ pshufhw m3, m3, q3333
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ punpckhqdq m4, m4
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad3
+ jmp .w32_wpad_done
+.w32_pad2:                       ; 24 valid pixels
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, [yq+16]
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ pshufhw m4, m3, q3333
+ punpckhqdq m4, m4
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad2
+ jmp .w32_wpad_done
+.w32_pad1:                       ; 28 valid pixels
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m4, [yq+16]
+ mova m3, m4
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ punpckhbw m4, m4
+ punpcklqdq m4, m4
+ pshufhw m4, m4, q3333
+ pmaddubsw m4, m2
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad1
+.w32_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_32
+.w32_hpad_loop:                  ; replicate last row (m1|m0|m3|m4) downwards
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova [acq+32], m3
+ mova [acq+48], m4
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ add acq, 64
+ sub hpadd, 1
+ jg .w32_hpad_loop
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+
+.calc_avg_32:                    ; widen the 4 stack word-sums to dword totals
+ mova m5, [rsp]
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, [rsp+16]
+ mova m3, m0
+ psrld m0, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m0, m3
+ paddd m5, m0
+ mova m0, [rsp+32]
+ mova m3, m0
+ psrld m0, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m0, m3
+ mova m1, [rsp+48]
+ mova m3, m1
+ psrld m1, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m1, m3
+ paddd m1, m0
+ paddd m5, m1
+.calc_avg:                       ; avg = (sum + sz/2) >> log2sz; subtract it
+ movd szd, m6                    ; m6 lane 0 = w*h, stored at entry
+ psrad m6, 1                     ; rounding term = sz/2
+ tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
+ paddd m5, m6
+ movd m1, r1d
+ pshufd m0, m5, q2301            ; horizontal dword reduction of m5
+ paddd m0, m5
+ pshufd m5, m0, q1032
+ paddd m0, m5
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0                 ; broadcast avg to all word lanes
+ RELOAD_ACQ_32 acq ; ac = ac_orig
+.sub_loop:                       ; ac[i] -= avg for all w*h entries
+ mova m1, [acq]
+ psubw m1, m0
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+; %1 simd register that hold the mask and will hold the result
+; %2 simd register that holds the "true" values
+; %3 location of the "false" values (simd register/memory)
+; Per-byte select: %1 = (%2 & %1) | (%3 & ~%1).  The mask must be all-ones
+; or all-zeros per lane (as produced by pcmpeqb).  NOTE: %2 is clobbered,
+; and the instruction order matters - pand must read %1 before pandn
+; overwrites it.
+%macro BLEND 3 ; mask, true, false
+ pand %2, %1                     ; keep "true" lanes
+ pandn %1, %3                    ; keep "false" lanes
+ por %1, %2                      ; combine into the result
+%endmacro
+
+; PAETH top, ldiff
+; Per-byte Paeth predictor select.  Inputs: m3 = left pixels, m5 = top-left
+; (broadcast), m%1 = top pixels, %2 = precomputed |top - topleft| ("ldiff",
+; register number or memory operand).  Computes tldiff = |left+top-2*topleft|
+; (saturated to 255) via the pavgb averaging trick, and tdiff =
+; |left - topleft|, then picks left/top/topleft, favoring left, then top.
+; Result in m1; clobbers m0 and m2.
+%macro PAETH 2 ; top, ldiff
+ pavgb m1, m%1, m3               ; (top + left + 1) >> 1
+ pxor m0, m%1, m3
+ pand m0, m4                     ; rounding bit dropped by pavgb (m4 = mask)
+ psubusb m2, m5, m1
+ psubb m1, m0
+ psubusb m1, m5
+ por m1, m2
+ paddusb m1, m1
+ por m1, m0 ; min(tldiff, 255)
+ psubusb m2, m5, m3
+ psubusb m0, m3, m5
+ por m2, m0 ; tdiff
+%ifnum %2
+ pminub m2, m%2
+ pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
+%else
+ mova m0, %2                     ; ldiff supplied from memory (w32/w64 paths)
+ pminub m2, m0
+ pcmpeqb m0, m2
+%endif
+ pminub m1, m2
+ pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff
+ mova m2, m3
+ BLEND m0, m2, m%1               ; m0 = ldiff <= tdiff ? left : top
+ BLEND m1, m0, m5                ; m1 = mindiff <= tldiff ? m0 : topleft
+%endmacro
+
+; ipred_paeth_8bpc(dst, stride, tl, w, h)
+; Paeth intra prediction: each pixel selects left, top or top-left depending
+; on which is closest to left + top - topleft.  tlq points at the top-left
+; pixel; top pixels are at tlq+1.., left pixels are read by walking tlq
+; backwards one row at a time.  m5 = broadcast top-left; m6/m7 hold the top
+; row and its |top - topleft| (cached on the stack for w32/w64).
+cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
+%define base r5-ipred_paeth_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ pxor m0, m0
+ movd m5, [tlq]
+ pshufb m5, m0                   ; broadcast top-left pixel to all lanes
+ LEA r5, ipred_paeth_ssse3_table
+ movsxd wq, [r5+wq*4]            ; jump-table dispatch on log2(width)
+ movddup m4, [base+ipred_paeth_shuf]
+ add wq, r5
+ jmp wq
+.w4:
+ movd m6, [tlq+1] ; top
+ pshufd m6, m6, q0000            ; replicate the 4 top pixels into all dwords
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:                        ; 4 rows per iteration
+ sub tlq, 4
+ movd m3, [tlq]
+ mova m1, [base+ipred_h_shuf]
+ pshufb m3, m1 ; left
+ PAETH 6, 7
+ movd [dstq ], m1
+ pshuflw m0, m1, q1032
+ movd [dstq+strideq ], m0
+ punpckhqdq m1, m1
+ movd [dstq+strideq*2], m1
+ psrlq m1, 32
+ movd [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ movddup m6, [tlq+1]             ; top row in both qwords (2 rows per PAETH)
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0                      ; ldiff = |top - topleft|
+.w8_loop:
+ sub tlq, 2
+ movd m3, [tlq]
+ pshufb m3, [base+ipred_paeth_shuf]
+ PAETH 6, 7
+ movq [dstq ], m1
+ movhps [dstq+strideq], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0                      ; ldiff
+.w16_loop:                       ; one row per iteration
+ sub tlq, 1
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1                   ; broadcast current left pixel
+ PAETH 6, 7
+ mova [dstq], m1
+ add dstq, strideq
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:                            ; cache top/ldiff for the first 16 px on the stack
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp ], m6
+ mova [rsp+16], m7
+ movu m6, [tlq+17]               ; second 16 px stay in m6/m7
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+32], m6
+.w32_loop:
+ dec tlq
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1                   ; broadcast current left pixel
+ mova m6, [rsp]
+ PAETH 6, [rsp+16]
+ mova [dstq ], m1
+ mova m6, [rsp+32]
+ PAETH 6, 7
+ mova [dstq+16], m1
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:                            ; cache all four 16-px top/ldiff pairs on the stack
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp ], m6
+ mova [rsp+16], m7
+ movu m6, [tlq+17]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+32], m6
+ mova [rsp+48], m7
+ movu m6, [tlq+33]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+64], m6
+ mova [rsp+80], m7
+ movu m6, [tlq+49]               ; last 16 px: ldiff stays live in m7
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+96], m6
+.w64_loop:
+ dec tlq
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1                   ; broadcast current left pixel
+ mova m6, [rsp]
+ PAETH 6, [rsp+16]
+ mova [dstq ], m1
+ mova m6, [rsp+32]
+ PAETH 6, [rsp+48]
+ mova [dstq+16], m1
+ mova m6, [rsp+64]
+ PAETH 6, [rsp+80]
+ mova [dstq+32], m1
+ mova m6, [rsp+96]
+ PAETH 6, 7
+ mova [dstq+48], m1
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+
+; FILTER dst, src, tmp, shuf
+; One filter-intra kernel application: shuffle the 7 neighbour pixels in
+; m%2 into p0..p6 pair order (%4: register number or memory shuffle),
+; multiply each broadcast pair against the tap registers m2..m5
+; (filter_intra_taps, loaded by the caller), add the pw_8 rounding term,
+; shift right by 4 and pack the result words to bytes in m%1.
+; Clobbers m%2 (shuffled in place) and m%3 (scratch).
+%macro FILTER 4 ;dst, src, tmp, shuf
+%ifnum %4
+ pshufb m%2, m%4
+%else
+ pshufb m%2, %4
+%endif
+ pshufd m%1, m%2, q0000 ;p0 p1
+ pmaddubsw m%1, m2
+ pshufd m%3, m%2, q1111 ;p2 p3
+ pmaddubsw m%3, m3
+ paddw m%1, [base+pw_8]          ; rounding before the >> 4 below
+ paddw m%1, m%3
+ pshufd m%3, m%2, q2222 ;p4 p5
+ pmaddubsw m%3, m4
+ paddw m%1, m%3
+ pshufd m%3, m%2, q3333 ;p6 __
+ pmaddubsw m%3, m5
+ paddw m%1, m%3
+ psraw m%1, 4
+ packuswb m%1, m%1
+%endmacro
+
+; ipred_filter_8bpc(dst, stride, tl, w, h, filter)
+; Recursive filter intra prediction: the block is generated as 4x2
+; sub-blocks, each produced by FILTER from 7 already-known neighbours;
+; each result feeds the sub-blocks to its right and below.  filterd
+; selects one of the 64-byte tap sets in filter_intra_taps (m2..m5).
+cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
+%define base r6-$$
+ LEA r6, $$
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6                  ; 64 bytes of taps per filter mode
+ lea filterq, [base+filter_intra_taps+filterq]
+ movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4
+ movsxd wq, [base+ipred_filter_ssse3_table+wq*4]
+ mova m2, [filterq+16*0]         ; tap registers used by FILTER
+ mova m3, [filterq+16*1]
+ mova m4, [filterq+16*2]
+ mova m5, [filterq+16*3]
+ lea wq, [base+ipred_filter_ssse3_table+wq]
+ mov hd, hm
+ jmp wq
+.w4:
+ mova m1, [base+filter_shuf1]
+ sub tlq, 3
+ sub tlq, hq                     ; tlq+hq walks the left edge upwards
+ jmp .w4_loop_start
+.w4_loop:
+ movd m0, [tlq+hq]               ; next left-edge pixels
+ punpckldq m0, m6                ; combine with previous output row
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER 6, 0, 7, 1
+ movd [dstq+strideq*0], m6
+ pshuflw m6, m6, q1032
+ movd [dstq+strideq*1], m6
+ sub hd, 2
+ jg .w4_loop
+ RET
+
+ALIGN function_align
+.w8:
+ movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4
+ sub tlq, 5
+ sub tlq, hq
+
+.w8_loop:                        ; left 4x2 then right 4x2, fed by the first
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER 0, 6, 1, [base+filter_shuf2]
+
+ punpckldq m6, m7, m0
+ movq [dstq+strideq*0], m6
+ punpckhqdq m6, m6
+ movq [dstq+strideq*1], m6
+
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+
+ALIGN function_align
+.w16:
+ movu m6, [tlq+1] ;top row
+ sub tlq, 5
+ sub tlq, hq
+
+.w16_loop:                       ; four 4x2 sub-blocks left to right
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4               ; shift window to the next 4 columns
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+4+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+8+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movd [dstq+12+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+strideq*1], m6       ; second output row, also next iteration's top
+
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1] ;top row
+ lea filterq, [tlq+17]           ; filterq reused as pointer to the right top half
+ sub tlq, 5
+ sub tlq, hq
+
+.w32_loop:                       ; eight 4x2 sub-blocks left to right
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+4+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+8+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movu m1, [filterq]              ; right-half top pixels for columns 16..31
+ punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
+ punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+12+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+strideq*1], m6       ; left half of second output row
+
+ mova m6, m1
+
+ FILTER 7, 0, 6, [base+filter_shuf2]
+ punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+16+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m1, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+20+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+24+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movd [dstq+28+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+16+strideq*1], m6    ; right half of second output row
+
+ mova m6, [dstq+strideq*1]       ; reload left half as next iteration's top
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+ lea filterq, [dstq+16+strideq*1] ; right half becomes next right-half top
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET