From 43a97878ce14b72f0981164f87f2e35e14151312 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 11:22:09 +0200 Subject: Adding upstream version 110.0.1. Signed-off-by: Daniel Baumann --- media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm | 2497 +++++++++++++++++++++ 1 file changed, 2497 insertions(+) create mode 100644 media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm (limited to 'media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm') diff --git a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm new file mode 100644 index 0000000000..808056a809 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm @@ -0,0 +1,2497 @@ +;****************************************************************************** +;* VP9 Intra prediction SIMD optimizations +;* +;* Copyright (c) 2015 Ronald S. Bultje +;* Copyright (c) 2015 Henrik Gramner +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pd_2: times 8 dd 2 +pd_4: times 8 dd 4 +pd_8: times 8 dd 8 + +pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15 +pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0 +pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7 + +cextern pw_1 +cextern pw_1023 +cextern pw_4095 +cextern pd_16 +cextern pd_32 +cextern pd_65535; + +; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take +; only 3 registers on x86-32, which would make it one cycle faster, but that +; would make the code quite a bit uglier... + +SECTION .text + +%macro SCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else + mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 + mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + +INIT_MMX mmx +cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + mova m1, [aq+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse +cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] + mova m1, [aq+mmsize*1] + mova m2, [aq+mmsize*2] + mova m3, [aq+mmsize*3] + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*1+32], m2 + mova [dstq+strideq*1+48], m3 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +INIT_MMX mmxext +cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a + mova m3, [lq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufw m0, m3, q3333 + pshufw m1, m3, q2222 + pshufw m2, m3, q1111 + pshufw m3, m3, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a + mova m2, [lq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + punpckhwd m3, m2, m2 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufd m0, m3, q1111 + pshufd m1, m3, q0000 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + punpcklwd m2, m2 + pshufd m0, m2, q3333 + pshufd m1, m2, q2222 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufd m0, m2, q1111 + pshufd m1, m2, q0000 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt + mov cntd, 3 + lea stride3q, [strideq*3] +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt + mov cntd, 7 + lea stride3q, [strideq*3] +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*1+32], m1 + mova [dstq+strideq*1+48], m1 + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+strideq*2+32], m2 + mova [dstq+strideq*2+48], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + mova [dstq+stride3q +32], m3 + mova [dstq+stride3q +48], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +INIT_MMX mmxext +cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufw m1, m0, q3232 + paddd m0, [pd_4] + paddd m0, m1 + psrad m0, 3 + pshufw m0, m0, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_8] + paddd m0, m1 + psrad m0, 4 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [lq+mmsize] + paddw m0, [aq] + paddw m0, [aq+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_16] + paddd m0, m1 + psrad m0, 5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq+mmsize*0] + paddw m0, [lq+mmsize*1] + paddw m0, [lq+mmsize*2] + paddw m0, [lq+mmsize*3] + paddw m0, [aq+mmsize*0] + paddw m0, [aq+mmsize*1] + paddw m0, [aq+mmsize*2] + paddw m0, [aq+mmsize*3] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 16 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_32] + paddd m0, m1 + psrad m0, 6 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*1+32], m0 + mova [dstq+strideq*1+48], m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +%macro DC_1D_FNS 2 +INIT_MMX mmxext +cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufw m1, m0, q3232 + paddd m0, [pd_2] + paddd m0, m1 + psrad m0, 2 + pshufw m0, m0, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_4] + paddd m0, m1 + psrad m0, 3 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + paddw m0, [%2+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_8] + paddd m0, m1 + psrad m0, 4 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2+mmsize*0] + paddw m0, [%2+mmsize*1] + paddw m0, [%2+mmsize*2] + paddw m0, [%2+mmsize*3] + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_16] + paddd m0, m1 + psrad m0, 5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*1+32], m0 + mova [dstq+strideq*1+48], m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +DC_1D_FNS top, aq +DC_1D_FNS left, lq + +INIT_MMX mmxext +cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a + mova m5, [pw_1023] +.body: + mova m4, [aq] + mova m3, [lq] + movd m0, [aq-4] + pshufw m0, m0, q1111 + psubw m4, m0 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufw m0, m3, q3333 + pshufw m1, m3, q2222 + pshufw m2, m3, q1111 + pshufw m3, m3, q0000 + paddw m0, m4 + paddw m1, m4 + paddw m2, m4 + paddw m3, m4 + pxor m4, m4 + pmaxsw m0, m4 + pmaxsw m1, m4 + pmaxsw m2, m4 + pmaxsw m3, m4 + pminsw m0, m5 + pminsw m1, m5 + pminsw m2, m5 + pminsw m3, m5 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + RET + +cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a + mova m4, [pw_1023] +.body: + pxor m6, m6 + mova m5, [aq] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m5, m0 + DEFINE_ARGS dst, stride, l, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 1 +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + pmaxsw m0, m6 + pmaxsw m1, m6 + pmaxsw m2, m6 + pmaxsw m3, m6 + pminsw m0, m4 + pminsw m1, m4 + pminsw m2, m4 + pminsw m3, m4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a + mova m4, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a + mova m7, [pw_1023] +.body: + pxor m6, m6 + mova m4, [aq] + mova m5, [aq+mmsize] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m4, m0 + psubw m5, m0 + DEFINE_ARGS dst, stride, l, cnt + mov cntd, 7 +.loop: + movd m3, [lq+cntq*4] + punpcklwd m3, m3 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + paddw m0, m2, m4 + paddw m2, m5 + paddw m1, m3, m4 + paddw m3, m5 + pmaxsw m0, m6 + pmaxsw m2, m6 + pmaxsw m1, m6 + pmaxsw m3, m6 + pminsw m0, m7 + pminsw m2, m7 + pminsw m1, m7 + pminsw m3, m7 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m2 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m3 + lea dstq, [dstq+strideq*2] + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a + mova m7, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a + mova m0, [pw_1023] +.body: + pxor m1, m1 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 +%define reg_min m9 +%define reg_max m8 +%else + mova [rsp+ 0], m0 + mova [rsp+16], m1 +%define reg_min [rsp+16] +%define reg_max [rsp+ 0] +%endif + + mova m4, [aq+mmsize*0] + mova m5, [aq+mmsize*1] + mova m6, [aq+mmsize*2] + mova m7, [aq+mmsize*3] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m4, m0 + psubw m5, m0 + psubw m6, m0 + psubw m7, m0 + DEFINE_ARGS dst, stride, l, cnt + mov cntd, 31 +.loop: + pinsrw m3, [lq+cntq*2], 0 + punpcklwd m3, m3 + pshufd m3, m3, q0000 + paddw m0, m3, m4 + paddw m1, m3, m5 + paddw m2, m3, m6 + paddw m3, m7 + pmaxsw m0, reg_min + pmaxsw m1, reg_min + pmaxsw m2, reg_min + pmaxsw m3, reg_min + pminsw m0, reg_max + pminsw m1, reg_max + pminsw m2, reg_max + pminsw m3, reg_max + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 + add dstq, strideq + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a + mova m0, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body + +; Directional intra predicion functions +; +; in the functions below, 'abcdefgh' refers to above data (sometimes simply +; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply +; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered +; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered +; top-left data. + +; left=(left+2*center+right+2)>>2 +%macro LOWPASS 3 ; left [dst], center, right + paddw m%1, m%3 + psraw m%1, 1 + pavgw m%1, m%2 +%endmacro + +; abcdefgh (src) -> bcdefghh (dst) +; dst/src can be the same register +%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg] +%if cpuflag(ssse3) + pshufb %1, %2, %3 ; abcdefgh -> bcdefghh +%else + psrldq %1, %2, 2 ; abcdefgh -> bcdefgh. + pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh +%endif +%endmacro + +; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2) +%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg] +%if cpuflag(ssse3) + pshufb %1, %3, %4 ; abcdefgh -> bcdefghh + pshufb %2, %1, %4 ; bcdefghh -> cdefghhh +%else + psrldq %1, %3, 2 ; abcdefgh -> bcdefgh. + psrldq %2, %3, 4 ; abcdefgh -> cdefgh.. + pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh + pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh +%endif +%endmacro + +%macro DL_FUNCS 0 +cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a + movifnidn aq, amp + movu m1, [aq] ; abcdefgh + pshufhw m0, m1, q3310 ; abcdefhh + SHIFT_RIGHT m1, m1 ; bcdefghh + psrldq m2, m1, 2 ; cdefghh. + LOWPASS 0, 1, 2 ; BCDEFGh. + pshufd m1, m0, q3321 ; DEFGh... + movh [dstq+strideq*0], m0 + movh [dstq+strideq*2], m1 + add dstq, strideq + psrldq m0, 2 ; CDEFGh.. + psrldq m1, 2 ; EFGh.... + movh [dstq+strideq*0], m0 + movh [dstq+strideq*2], m1 + RET + +cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh + LOWPASS 0, 1, 2 ; BCDEFGHh + shufps m1, m0, m2, q3332 ; FGHhhhhh + shufps m3, m0, m1, q2121 ; DEFGHhhh + DEFINE_ARGS dst, stride, stride5 + lea stride5q, [strideq*5] + + mova [dstq+strideq*0], m0 + mova [dstq+strideq*4], m1 + SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh + pshuflw m1, m1, q3321 ; GHhhhhhh + pshufd m2, m0, q3321 ; EFGHhhhh + mova [dstq+strideq*1], m0 + mova [dstq+stride5q ], m1 + lea dstq, [dstq+strideq*2] + pshuflw m1, m1, q3321 ; Hhhhhhhh + mova [dstq+strideq*0], m3 + mova [dstq+strideq*4], m1 + pshuflw m1, m1, q3321 ; hhhhhhhh + mova [dstq+strideq*1], m2 + mova [dstq+stride5q ], m1 + RET + +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh + mova m3, [aq+mmsize] ; ijklmnop + PALIGNR m1, m3, m0, 2, m4 ; bcdefghi + PALIGNR m2, m3, m0, 4, m4 ; cdefghij + LOWPASS 0, 1, 2 ; BCDEFGHI +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp + LOWPASS 1, 2, 3 ; JKLMNOPp + pshufd m2, m2, q3333 ; pppppppp + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*8+ 0], m1 + mova [dstq+strideq*8+16], m2 + add dstq, strideq +%if cpuflag(avx) + vpalignr m0, m1, m0, 2 +%else + PALIGNR m3, m1, m0, 2, m4 + mova m0, m3 +%endif + SHIFT_RIGHT m1, m1, m4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] ; abcdefgh + mova m1, [aq+mmsize*1] ; ijklmnop + mova m2, [aq+mmsize*2] ; qrstuvwx + mova m3, [aq+mmsize*3] ; yz012345 + PALIGNR m4, m1, m0, 2, m6 + PALIGNR m5, m1, m0, 4, m6 + LOWPASS 0, 4, 5 ; BCDEFGHI + PALIGNR m4, m2, m1, 2, m6 + PALIGNR m5, m2, m1, 4, m6 + LOWPASS 1, 4, 5 ; JKLMNOPQ + PALIGNR m4, m3, m2, 2, m6 + PALIGNR m5, m3, m2, 4, m6 + LOWPASS 2, 4, 5 ; RSTUVWXY +%if cpuflag(ssse3) + mova m6, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m4, m5, m3, m6 + LOWPASS 3, 4, 5 ; Z0123455 + pshufd m4, m4, q3333 ; 55555555 + DEFINE_ARGS dst, stride, stride8, stride24, cnt + mov cntd, 8 + lea stride8q, [strideq*8] + lea stride24q, [stride8q*3] + +.loop: + mova [dstq+stride8q*0+ 0], m0 + mova [dstq+stride8q*0+16], m1 + mova [dstq+stride8q*0+32], m2 + mova [dstq+stride8q*0+48], m3 + mova [dstq+stride8q*1+ 0], m1 + mova [dstq+stride8q*1+16], m2 + mova [dstq+stride8q*1+32], m3 + mova [dstq+stride8q*1+48], m4 + mova [dstq+stride8q*2+ 0], m2 + mova [dstq+stride8q*2+16], m3 + mova [dstq+stride8q*2+32], m4 + mova [dstq+stride8q*2+48], m4 + mova [dstq+stride24q + 0], m3 + mova [dstq+stride24q +16], m4 + mova [dstq+stride24q +32], m4 + mova [dstq+stride24q +48], m4 + add dstq, strideq +%if cpuflag(avx) + vpalignr m0, m1, m0, 2 + vpalignr m1, m2, m1, 2 + vpalignr m2, m3, m2, 2 +%else + PALIGNR m5, m1, m0, 2, m6 + mova m0, m5 + PALIGNR m5, m2, m1, 2, m6 + mova m1, m5 + PALIGNR m5, m3, m2, 2, m6 + mova m2, m5 +%endif + SHIFT_RIGHT m3, m3, m6 + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DL_FUNCS +INIT_XMM ssse3 +DL_FUNCS +INIT_XMM avx +DL_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp + LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp + vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp + DEFINE_ARGS dst, stride, stride3, cnt + mov cntd, 2 + lea stride3q, [strideq*3] + +.loop: + mova [dstq+strideq*0], m0 + vpalignr m3, m2, m0, 2 + vpalignr m4, m2, m0, 4 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 6 + vpalignr m4, m2, m0, 8 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m4 + vpalignr m3, m2, m0, 10 + vpalignr m4, m2, m0, 12 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 14 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova m0, m2 + vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop + mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 + vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx + vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq + vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ + vperm2i128 m5, m1, m4, q0201 ; yz01234555555555 + vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455 + vpalignr m3, m5, m1, 4 ; stuvwxyz01234555 + LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 + vperm2i128 m2, m1, m4, q0201 ; Z......555555555 + vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq*0 + 0], m0 + mova [dstq+strideq*0 +32], m1 + vpalignr m3, m5, m0, 2 + vpalignr m4, m2, m1, 2 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 4 + vpalignr m4, m2, m1, 4 + mova [dstq+strideq*2 + 0], m3 + mova [dstq+strideq*2 +32], m4 + vpalignr m3, m5, m0, 6 + vpalignr m4, m2, m1, 6 + mova [dstq+stride3q*1+ 0], m3 + mova [dstq+stride3q*1+32], m4 + lea dstq, [dstq+strideq*4] + vpalignr m3, m5, m0, 8 + vpalignr m4, m2, m1, 8 + mova [dstq+strideq*0 + 0], m3 + mova [dstq+strideq*0 +32], m4 + vpalignr m3, m5, m0, 10 + vpalignr m4, m2, m1, 10 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 12 + vpalignr m4, m2, m1, 12 + mova [dstq+strideq*2+ 0], m3 + mova [dstq+strideq*2+32], m4 + vpalignr m3, m5, m0, 14 + vpalignr m4, m2, m1, 14 + mova [dstq+stride3q+ 0], m3 + mova [dstq+stride3q+ 32], m4 + vpalignr m3, m5, m0, 16 + vpalignr m4, m2, m1, 16 + vperm2i128 m5, m3, m4, q0201 + vperm2i128 m2, m4, m4, q0101 + mova m0, m3 + mova m1, m4 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + +%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a + movh m0, [lq] ; wxyz.... + movhps m0, [aq-2] ; wxyz*abc + movd m1, [aq+6] ; d....... + PALIGNR m1, m0, 2, m2 ; xyz*abcd + psrldq m2, m1, 2 ; yz*abcd. + LOWPASS 0, 1, 2 ; XYZ#ABC. + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+stride3q ], m0 + psrldq m0, 2 ; YZ#ABC.. + movh [dstq+strideq*2], m0 + psrldq m0, 2 ; Z#ABC... + movh [dstq+strideq*1], m0 + psrldq m0, 2 ; #ABC.... + movh [dstq+strideq*0], m0 + RET + +cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a + mova m0, [lq] ; stuvwxyz + movu m1, [aq-2] ; *abcdefg + mova m2, [aq] ; abcdefgh + psrldq m3, m2, 2 ; bcdefgh. + LOWPASS 3, 2, 1 ; ABCDEFG. + PALIGNR m1, m0, 2, m4 ; tuvwxyz* + PALIGNR m2, m1, 2, m4 ; uvwxyz*a + LOWPASS 2, 1, 0 ; TUVWXYZ# + DEFINE_ARGS dst, stride, dst4, stride3 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + + movhps [dstq +stride3q +0], m2 + movh [dstq+ stride3q +8], m3 + mova [dst4q+stride3q +0], m2 + PALIGNR m1, m3, m2, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*2+0], m1 + movh [dstq+ strideq*2+8], m3 + mova [dst4q+strideq*2+0], m1 + PALIGNR m2, m3, m1, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*1+0], m2 + movh [dstq+ strideq*1+8], m3 + mova [dst4q+strideq*1+0], m2 + PALIGNR m1, m3, m2, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*0+0], m1 + movh [dstq+ strideq*0+8], m3 + mova [dst4q+strideq*0+0], m1 + RET + +cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a + mova m0, [lq] ; klmnopqr + mova m1, [lq+mmsize] ; stuvwxyz + movu m2, [aq-2] ; *abcdefg + movu m3, [aq+mmsize-2] ; hijklmno + mova m4, [aq] ; abcdefgh + mova m5, [aq+mmsize] ; ijklmnop + psrldq m6, m5, 2 ; jklmnop. + LOWPASS 6, 5, 3 ; IJKLMNO. + PALIGNR m5, m4, 2, m3 ; bcdefghi + LOWPASS 5, 4, 2 ; ABCDEFGH + PALIGNR m2, m1, 2, m3 ; tuvwxyz* + PALIGNR m4, m2, 2, m3 ; uvwxyz*a + LOWPASS 4, 2, 1 ; TUVWXYZ# + PALIGNR m1, m0, 2, m3 ; lmnopqrs + PALIGNR m2, m1, 2, m3 ; mnopqrst + LOWPASS 2, 1, 0 ; LMNOPQRS + DEFINE_ARGS dst, stride, dst8, cnt + lea dst8q, [dstq+strideq*8] + mov cntd, 8 + +.loop: + sub dst8q, strideq + mova [dst8q+strideq*0+ 0], m4 + mova [dst8q+strideq*0+16], m5 + mova [dst8q+strideq*8+ 0], m2 + mova [dst8q+strideq*8+16], m4 +%if cpuflag(avx) + vpalignr m2, m4, m2, 2 + vpalignr m4, m5, m4, 2 + vpalignr m5, m6, m5, 2 +%else + PALIGNR m0, m4, m2, 2, m1 + mova m2, m0 + PALIGNR m0, m5, m4, 2, m1 + mova m4, m0 + PALIGNR m0, m6, m5, 2, m1 + mova m5, m0 +%endif + psrldq m6, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \ + %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a + mova m0, [aq+mmsize*3] ; a[24-31] + movu m1, [aq+mmsize*3-2] ; a[23-30] + psrldq m2, m0, 2 ; a[25-31]. + LOWPASS 2, 0, 1 ; A[24-30]. + mova m1, [aq+mmsize*2] ; a[16-23] + movu m3, [aq+mmsize*2-2] ; a[15-22] + PALIGNR m0, m1, 2, m4 ; a[17-24] + LOWPASS 0, 1, 3 ; A[16-23] + mova m3, [aq+mmsize*1] ; a[8-15] + movu m4, [aq+mmsize*1-2] ; a[7-14] + PALIGNR m1, m3, 2, m5 ; a[9-16] + LOWPASS 1, 3, 4 ; A[8-15] + mova m4, [aq+mmsize*0] ; a[0-7] + movu m5, [aq+mmsize*0-2] ; *a[0-6] + PALIGNR m3, m4, 2, m6 ; a[1-8] + LOWPASS 3, 4, 5 ; A[0-7] + SCRATCH 1, 8, rsp+0*mmsize + SCRATCH 3, 9, rsp+1*mmsize +%if notcpuflag(ssse3) + SCRATCH 0, 10, rsp+2*mmsize +%endif + mova m6, [lq+mmsize*3] ; l[24-31] + PALIGNR m5, m6, 2, m0 ; l[25-31]* + PALIGNR m4, m5, 2, m0 ; l[26-31]*a + LOWPASS 4, 5, 6 ; L[25-31]# + mova m7, [lq+mmsize*2] ; l[16-23] + PALIGNR m6, m7, 2, m0 ; l[17-24] + PALIGNR m5, m6, 2, m0 ; l[18-25] + LOWPASS 5, 6, 7 ; L[17-24] + mova m1, [lq+mmsize*1] ; l[8-15] + PALIGNR m7, m1, 2, m0 ; l[9-16] + PALIGNR m6, m7, 2, m0 ; l[10-17] + LOWPASS 6, 7, 1 ; L[9-16] + mova m3, [lq+mmsize*0] ; l[0-7] + PALIGNR m1, m3, 2, m0 ; l[1-8] + PALIGNR m7, m1, 2, m0 ; l[2-9] + LOWPASS 7, 1, 3 ; L[1-8] +%if cpuflag(ssse3) +%if cpuflag(avx) + UNSCRATCH 1, 8, rsp+0*mmsize +%endif + UNSCRATCH 3, 9, rsp+1*mmsize +%else + UNSCRATCH 0, 10, rsp+2*mmsize +%endif + DEFINE_ARGS dst8, stride, stride8, stride24, cnt + lea stride8q, [strideq*8] + lea stride24q, [stride8q*3] + lea dst8q, [dst8q+strideq*8] + mov cntd, 8 + +.loop: + sub dst8q, strideq +%if notcpuflag(avx) + UNSCRATCH 1, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 3, 9, rsp+1*mmsize +%endif +%endif + mova [dst8q+stride8q*0+ 0], m4 + mova [dst8q+stride8q*0+16], m3 + mova [dst8q+stride8q*0+32], m1 + mova [dst8q+stride8q*0+48], m0 + mova [dst8q+stride8q*1+ 0], m5 + mova [dst8q+stride8q*1+16], m4 + mova [dst8q+stride8q*1+32], m3 + mova [dst8q+stride8q*1+48], m1 + mova [dst8q+stride8q*2+ 0], m6 + mova [dst8q+stride8q*2+16], m5 + mova [dst8q+stride8q*2+32], m4 + mova [dst8q+stride8q*2+48], m3 + mova [dst8q+stride24q + 0], m7 + mova [dst8q+stride24q +16], m6 + mova [dst8q+stride24q +32], m5 + mova [dst8q+stride24q +48], m4 +%if cpuflag(avx) + vpalignr m7, m6, m7, 2 + vpalignr m6, m5, m6, 2 + vpalignr m5, m4, m5, 2 + vpalignr m4, m3, m4, 2 + vpalignr m3, m1, m3, 2 + vpalignr m1, m0, m1, 2 + vpalignr m0, m2, m0, 2 +%else + SCRATCH 2, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + SCRATCH 0, 9, rsp+1*mmsize +%endif + PALIGNR m2, m6, m7, 2, m0 + mova m7, m2 + PALIGNR m2, m5, m6, 2, m0 + mova m6, m2 + PALIGNR m2, m4, m5, 2, m0 + mova m5, m2 + PALIGNR m2, m3, m4, 2, m0 + mova m4, m2 + PALIGNR m2, m1, m3, 2, m0 + mova m3, m2 +%if notcpuflag(ssse3) + UNSCRATCH 0, 9, rsp+1*mmsize + SCRATCH 3, 9, rsp+1*mmsize +%endif + PALIGNR m2, m0, m1, 2, m3 + mova m1, m2 + UNSCRATCH 2, 8, rsp+0*mmsize + SCRATCH 1, 8, rsp+0*mmsize + PALIGNR m1, m2, m0, 2, m3 + mova m0, m1 +%endif + psrldq m2, 2 + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DR_FUNCS 3 +INIT_XMM ssse3 +DR_FUNCS 2 +INIT_XMM avx +DR_FUNCS 2 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a + mova m0, [lq] ; klmnopqrstuvwxyz + movu m1, [aq-2] ; *abcdefghijklmno + mova m2, [aq] ; abcdefghijklmnop + vperm2i128 m4, m2, m2, q2001 ; ijklmnop........ + vpalignr m5, m4, m2, 2 ; bcdefghijklmnop. + vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg + LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. + vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz* + vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a + LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# + vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH + DEFINE_ARGS dst, stride, stride3, stride5, dst3 + lea dst3q, [dstq+strideq*4] + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + + vpalignr m3, m5, m0, 2 + vpalignr m4, m1, m5, 2 + mova [dst3q+stride5q*2], m3 ; 14 + mova [ dstq+stride3q*2], m4 ; 6 + vpalignr m3, m5, m0, 4 + vpalignr m4, m1, m5, 4 + sub dst3q, strideq + mova [dst3q+stride5q*2], m3 ; 13 + mova [dst3q+strideq*2 ], m4 ; 5 + mova [dst3q+stride3q*4], m0 ; 15 + vpalignr m3, m5, m0, 6 + vpalignr m4, m1, m5, 6 + mova [dstq+stride3q*4], m3 ; 12 + mova [dst3q+strideq*1], m4 ; 4 + vpalignr m3, m5, m0, 8 + vpalignr m4, m1, m5, 8 + mova [dst3q+strideq*8], m3 ; 11 + mova [dst3q+strideq*0], m4 ; 3 + vpalignr m3, m5, m0, 10 + vpalignr m4, m1, m5, 10 + mova [dstq+stride5q*2], m3 ; 10 + mova [dstq+strideq*2 ], m4 ; 2 + vpalignr m3, m5, m0, 12 + vpalignr m4, m1, m5, 12 + mova [dst3q+stride3q*2], m3 ; 9 + mova [dstq+strideq*1 ], m4 ; 1 + vpalignr m3, m5, m0, 14 + vpalignr m4, m1, m5, 14 + mova [dstq+strideq*8], m3 ; 8 + mova [dstq+strideq*0], m4 ; 0 + mova [dst3q+strideq*4], m5 ; 7 + RET + +cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vperm2i128 m4, m3, m1, q0201 ; jklmnopppppppppp + vpalignr m5, m2, m0, 4 ; cdefghijklmnoppp + vperm2i128 m6, m5, m1, q0201 ; klmnoppppppppppp + LOWPASS 5, 3, 0 ; BCDEFGHIJKLMNOPP + LOWPASS 6, 4, 2 ; JKLMNOPPPPPPPPPP + pavgw m3, m0 ; abcdefghijklmnop + pavgw m4, m2 ; ijklmnoppppppppp + DEFINE_ARGS dst, stride, stride3, stride5, dst4 + lea dst4q, [dstq+strideq*4] + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + + mova [dstq+strideq*0], m3 ; 0 abcdefghijklmnop + mova [dstq+strideq*1], m5 ; 1 BCDEFGHIJKLMNOPP + vpalignr m0, m4, m3, 2 + vpalignr m1, m6, m5, 2 + mova [dstq+strideq*2 ], m0 ; 2 bcdefghijklmnopp + mova [dstq+stride3q*1], m1 ; 3 CDEFGHIJKLMNOPPP + vpalignr m0, m4, m3, 4 + vpalignr m1, m6, m5, 4 + mova [dst4q+strideq*0], m0 ; 4 cdefghijklmnoppp + mova [dstq+stride5q*1], m1 ; 5 DEFGHIJKLMNOPPPP + vpalignr m0, m4, m3, 6 + vpalignr m1, m6, m5, 6 + mova [ dstq+stride3q*2], m0 ; 6 defghijklmnopppp + mova [dst4q+stride3q*1], m1 ; 7 EFGHIJKLMNOPPPPP + vpalignr m0, m4, m3, 8 + vpalignr m1, m6, m5, 8 + mova [ dstq+strideq*8], m0 ; 8 efghijklmnoppppp + mova [dst4q+stride5q*1], m1 ; 9 FGHIJKLMNOPPPPPP + vpalignr m0, m4, m3, 10 + mova [dstq+stride5q*2], m0 ; 10 fghijklmnopppppp + vpalignr m0, m4, m3, 12 + mova [dst4q+strideq*8], m0 ; 12 ghijklmnoppppppp + vpalignr m0, m4, m3, 14 + mova [dst4q+stride5q*2], m0 ; 14 hijklmnopppppppp + sub dst4q, strideq + vpalignr m1, m6, m5, 10 + mova [dst4q+strideq*8], m1 ; 11 GHIJKLMNOPPPPPPP + vpalignr m1, m6, m5, 12 + mova [dst4q+stride5q*2], m1 ; 13 HIJKLMNOPPPPPPPP + vpalignr m1, m6, m5, 14 + mova [dst4q+stride3q*4], m1 ; 15 IJKLMNOPPPPPPPPP + RET + +cglobal vp9_ipred_hd_16x16_16, 4, 5, 7, dst, stride, l, a + movu m0, [aq-2] ; *abcdefghijklmno + mova m1, [lq] ; klmnopqrstuvwxyz + vperm2i128 m2, m1, m0, q0201 ; stuvwxyz*abcdefg + vpalignr m3, m2, m1, 2 ; lmnopqrstuvwxyz* + vpalignr m4, m2, m1, 4 ; mnopqrstuvwxyz*a + LOWPASS 4, 3, 1 ; LMNOPQRSTUVWXYZ# + pavgw m3, m1 ; klmnopqrstuvwxyz + mova m1, [aq] ; abcdefghijklmnop + movu m2, [aq+2] ; bcdefghijklmnop. + LOWPASS 2, 1, 0 ; ABCDEFGHIJKLMNO. + vpunpcklwd m0, m3, m4 ; kLlMmNnOsTtUuVvW + vpunpckhwd m1, m3, m4 ; oPpQqRrSwXxYyZz# + vperm2i128 m3, m1, m0, q0002 ; kLlMmNnOoPpQqRrS + vperm2i128 m4, m0, m1, q0301 ; sTtUuVvWwXxYyZz# + vperm2i128 m0, m4, m2, q0201 ; wXxYyZz#ABCDEFGH + vperm2i128 m1, m3, m4, q0201 ; oPpQqRrSsTtUuVvW + DEFINE_ARGS dst, stride, stride3, stride5, dst5 + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + lea dst5q, [dstq+stride5q] + + mova [dst5q+stride5q*2], m3 ; 15 kLlMmNnOoPpQqRrS + mova [dst5q+stride3q*2], m1 ; 11 oPpQqRrSsTtUuVvW + mova [dst5q+strideq*2], m4 ; 7 sTtUuVvWwXxYyZz# + mova [dstq+stride3q*1], m0 ; 3 wXxYyZz#ABCDEFGH + vpalignr m5, m4, m1, 4 + mova [dstq+stride5q*2], m5 ; 10 pQqRrSsTtUuVvWwX + vpalignr m5, m0, m4, 4 + vpalignr m6, m2, m0, 4 + mova [dstq+stride3q*2], m5 ; 6 tUuVvWwXxYyZz#AB + mova [dstq+strideq*2], m6 ; 2 xYyZz#ABCDEFGHIJ + vpalignr m5, m4, m1, 8 + mova [dst5q+strideq*4], m5 ; 9 qRrSsTtUuVvWwXxY + vpalignr m5, m0, m4, 8 + vpalignr m6, m2, m0, 8 + mova [dstq+stride5q*1], m5 ; 5 uVvWwXxYyZz#ABCD + mova [dstq+strideq*1], m6 ; 1 yZz#ABCDEFGHIJKL + vpalignr m5, m1, m3, 12 + vpalignr m6, m4, m1, 12 + mova [dstq+stride3q*4], m5 ; 12 nOoPpQqRrSsTtUuV + mova [dst5q+stride3q], m6 ; 8 rSsTtUuVvWwXxYyZ + vpalignr m5, m0, m4, 12 + vpalignr m6, m2, m0, 12 + mova [dstq+strideq*4], m5 ; 4 nOoPpQqRrSsTtUuV + mova [dstq+strideq*0], m6 ; 0 z#ABCDEFGHIJKLMN + sub dst5q, strideq + vpalignr m5, m1, m3, 4 + mova [dst5q+stride5q*2], m5 ; 14 lMmNnOoPpQqRrSsT + sub dst5q, strideq + vpalignr m5, m1, m3, 8 + mova [dst5q+stride5q*2], m5 ; 13 mNnOoPpQqRrSsTtU + RET + +%if ARCH_X86_64 +cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a + mova m0, [lq+mmsize*0+0] ; l[0-15] + mova m1, [lq+mmsize*1+0] ; l[16-31] + movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno + mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop + mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345 + vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0 + vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01 + vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012 + LOWPASS 0, 6, 7 ; L[0-15] + vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg + vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz* + vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a + LOWPASS 1, 5, 6 ; L[16-31]# + vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx + vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq + LOWPASS 2, 3, 6 ; A[0-15] + movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234 + vperm2i128 m6, m4, m4, q2001 ; yz012345........ + vpalignr m7, m6, m4, 2 ; rstuvwxyz012345. + LOWPASS 3, 4, 7 ; A[16-31]. + vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH + vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23] + vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX + DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + lea stride7q, [strideq*4+stride3q] + lea dst24q, [dst8q+stride3q*8] + lea dst8q, [dst8q+strideq*8] + mov cntd, 2 + +.loop: + mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7 + mova [dst24q+stride7q+32], m1 + mova [dst8q+stride7q+0], m1 + mova [dst8q+stride7q+32], m2 + vpalignr m6, m4, m1, 2 + vpalignr m7, m5, m0, 2 + vpalignr m9, m8, m2, 2 + mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6 + mova [dst24q+stride3q*2+32], m6 + mova [dst8q+stride3q*2+0], m6 + mova [dst8q+stride3q*2+32], m9 + vpalignr m6, m4, m1, 4 + vpalignr m7, m5, m0, 4 + vpalignr m9, m8, m2, 4 + mova [dst24q+stride5q+0], m7 ; 29 21 13 5 + mova [dst24q+stride5q+32], m6 + mova [dst8q+stride5q+0], m6 + mova [dst8q+stride5q+32], m9 + vpalignr m6, m4, m1, 6 + vpalignr m7, m5, m0, 6 + vpalignr m9, m8, m2, 6 + mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4 + mova [dst24q+strideq*4+32], m6 + mova [dst8q+strideq*4+0], m6 + mova [dst8q+strideq*4+32], m9 + vpalignr m6, m4, m1, 8 + vpalignr m7, m5, m0, 8 + vpalignr m9, m8, m2, 8 + mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3 + mova [dst24q+stride3q+32], m6 + mova [dst8q+stride3q+0], m6 + mova [dst8q+stride3q+32], m9 + vpalignr m6, m4, m1, 10 + vpalignr m7, m5, m0, 10 + vpalignr m9, m8, m2, 10 + mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2 + mova [dst24q+strideq*2+32], m6 + mova [dst8q+strideq*2+0], m6 + mova [dst8q+strideq*2+32], m9 + vpalignr m6, m4, m1, 12 + vpalignr m7, m5, m0, 12 + vpalignr m9, m8, m2, 12 + mova [dst24q+strideq+0 ], m7 ; 25 17 9 1 + mova [dst24q+strideq+32], m6 + mova [dst8q+strideq+0], m6 + mova [dst8q+strideq+32], m9 + vpalignr m6, m4, m1, 14 + vpalignr m7, m5, m0, 14 + vpalignr m9, m8, m2, 14 + mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0 + mova [dst24q+strideq*0+32], m6 + mova [dst8q+strideq*0+0], m6 + mova [dst8q+strideq*0+32], m9 + mova m0, m5 + mova m5, m1 + mova m1, m4 + mova m4, m2 + mova m2, m8 + mova m8, m3 + sub dst24q, stride7q + sub dst24q, strideq + sub dst8q, stride7q + sub dst8q, strideq + dec cntd + jg .loop + RET +%endif +%endif + +%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a + movifnidn aq, amp + movu m0, [aq] ; abcdefgh + psrldq m1, m0, 2 ; bcdefgh. + psrldq m2, m0, 4 ; cdefgh.. + LOWPASS 2, 1, 0 ; BCDEFGH. + pavgw m1, m0 ; ABCDEFG. + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+strideq*0], m1 + movh [dstq+strideq*1], m2 + psrldq m1, 2 + psrldq m2, 2 + movh [dstq+strideq*2], m1 + movh [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh +%if cpuflag(ssse3) + mova m3, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh + LOWPASS 2, 1, 0 ; BCDEFGHh + pavgw m1, m0 ; ABCDEFGh + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + mova m1, [aq+mmsize] + PALIGNR m2, m1, m0, 2, m3 + PALIGNR m3, m1, m0, 4, m4 + LOWPASS 3, 2, 0 + pavgw m2, m0 +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m5, m0, m1, m4 + LOWPASS 0, 5, 1 + pavgw m1, m5 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m2 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m3 + mova [dstq+strideq*1+16], m0 + lea dstq, [dstq+strideq*2] +%if cpuflag(avx) + vpalignr m2, m1, m2, 2 + vpalignr m3, m0, m3, 2 +%else + PALIGNR m5, m1, m2, 2, m4 + mova m2, m5 + PALIGNR m5, m0, m3, 2, m4 + mova m3, m5 +%endif + SHIFT_RIGHT m1, m1, m4 + SHIFT_RIGHT m0, m0, m4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] + mova m1, [aq+mmsize*1] + mova m2, [aq+mmsize*2] + PALIGNR m6, m1, m0, 2, m5 + PALIGNR m7, m1, m0, 4, m5 + LOWPASS 7, 6, 0 + pavgw m6, m0 + SCRATCH 6, 8, rsp+0*mmsize + PALIGNR m4, m2, m1, 2, m0 + PALIGNR m5, m2, m1, 4, m0 + LOWPASS 5, 4, 1 + pavgw m4, m1 + mova m0, [aq+mmsize*3] + PALIGNR m1, m0, m2, 2, m6 + PALIGNR m3, m0, m2, 4, m6 + LOWPASS 3, 1, 2 + pavgw m2, m1 +%if cpuflag(ssse3) + PRELOAD 10, pb_2to15_14_15, shuf +%endif + SHIFT_RIGHTx2 m6, m1, m0, reg_shuf + LOWPASS 1, 6, 0 + pavgw m0, m6 +%if ARCH_X86_64 + pshufd m9, m6, q3333 +%endif +%if cpuflag(avx) + UNSCRATCH 6, 8, rsp+0*mmsize +%endif + DEFINE_ARGS dst, stride, cnt, stride16, stride17 + mov stride16q, strideq + mov cntd, 8 + shl stride16q, 4 + lea stride17q, [stride16q+strideq] + + ; FIXME m8 is unused for avx, so we could save one register here for win64 +.loop: +%if notcpuflag(avx) + UNSCRATCH 6, 8, rsp+0*mmsize +%endif + mova [dstq+strideq*0+ 0], m6 + mova [dstq+strideq*0+16], m4 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m7 + mova [dstq+strideq*1+16], m5 + mova [dstq+strideq*1+32], m3 + mova [dstq+strideq*1+48], m1 + mova [dstq+stride16q+ 0], m4 + mova [dstq+stride16q+16], m2 + mova [dstq+stride16q+32], m0 +%if ARCH_X86_64 + mova [dstq+stride16q+48], m9 +%endif + mova [dstq+stride17q+ 0], m5 + mova [dstq+stride17q+16], m3 + mova [dstq+stride17q+32], m1 +%if ARCH_X86_64 + mova [dstq+stride17q+48], m9 +%endif + lea dstq, [dstq+strideq*2] +%if cpuflag(avx) + vpalignr m6, m4, m6, 2 + vpalignr m4, m2, m4, 2 + vpalignr m2, m0, m2, 2 + vpalignr m7, m5, m7, 2 + vpalignr m5, m3, m5, 2 + vpalignr m3, m1, m3, 2 +%else + SCRATCH 3, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + SCRATCH 1, 10, rsp+1*mmsize +%endif + PALIGNR m3, m4, m6, 2, m1 + mova m6, m3 + PALIGNR m3, m2, m4, 2, m1 + mova m4, m3 + PALIGNR m3, m0, m2, 2, m1 + mova m2, m3 + PALIGNR m3, m5, m7, 2, m1 + mova m7, m3 + UNSCRATCH 3, 8, rsp+0*mmsize + SCRATCH 6, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 1, 10, rsp+1*mmsize + SCRATCH 7, 10, rsp+1*mmsize +%endif + PALIGNR m6, m3, m5, 2, m7 + mova m5, m6 + PALIGNR m6, m1, m3, 2, m7 + mova m3, m6 +%if notcpuflag(ssse3) + UNSCRATCH 7, 10, rsp+1*mmsize +%endif +%endif + SHIFT_RIGHT m1, m1, reg_shuf + SHIFT_RIGHT m0, m0, reg_shuf + dec cntd + jg .loop + +%if ARCH_X86_32 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] +%assign %%n 0 +%rep 4 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+48], m0 + mova [dstq+strideq*2+48], m0 + mova [dstq+stride3q +48], m0 +%if %%n < 3 + lea dstq, [dstq+strideq*4] +%endif +%assign %%n (%%n+1) +%endrep +%endif + RET +%endmacro + +INIT_XMM sse2 +VL_FUNCS 2 +INIT_XMM ssse3 +VL_FUNCS 1 +INIT_XMM avx +VL_FUNCS 1 + +%macro VR_FUNCS 0 +cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a + movu m0, [aq-2] + movhps m1, [lq] + PALIGNR m0, m1, 10, m2 ; xyz*abcd + pslldq m1, m0, 2 ; .xyz*abc + pslldq m2, m0, 4 ; ..xyz*ab + LOWPASS 2, 1, 0 ; ..YZ#ABC + pavgw m1, m0 ; ....#ABC + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m2 + shufps m0, m2, m1, q3210 +%if cpuflag(ssse3) + pshufb m2, [pb_4_5_8to13_8x0] +%else + pshuflw m2, m2, q2222 + psrldq m2, 6 +%endif + psrldq m0, 6 + movh [dstq+strideq*2], m0 + movh [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a + movu m1, [aq-2] ; *abcdefg + movu m2, [lq] ; stuvwxyz + mova m0, [aq] ; abcdefgh + PALIGNR m3, m1, m2, 14, m4 ; z*abcdef + LOWPASS 3, 1, 0 + pavgw m0, m1 + PALIGNR m1, m2, 2, m4 ; tuvwxyz* + pslldq m4, m2, 2 ; .stuvwxy + LOWPASS 4, 2, 1 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m1 + pslldq m4, 2 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m1 + pslldq m4, 2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m4 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m3 + RET + +cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a + movu m1, [aq-2] ; *abcdefg + movu m2, [aq+mmsize-2] ; hijklmno + mova m3, [aq] ; abcdefgh + mova m4, [aq+mmsize] ; ijklmnop + mova m5, [lq+mmsize] ; stuvwxyz + PALIGNR m0, m1, m5, 14, m6 ; z*abcdef + movu m6, [aq+mmsize-4] ; ghijklmn + LOWPASS 6, 2, 4 + pavgw m2, m4 + LOWPASS 0, 1, 3 + pavgw m3, m1 + PALIGNR m1, m5, 2, m7 ; tuvwxyz* + movu m7, [lq+mmsize-2] ; rstuvwxy + LOWPASS 1, 5, 7 + movu m5, [lq+2] ; lmnopqrs + pslldq m4, m5, 2 ; .lmnopqr + pslldq m7, m5, 4 ; ..lmnopq + LOWPASS 5, 4, 7 + psrld m4, m1, 16 + psrld m7, m5, 16 + pand m1, [pd_65535] + pand m5, [pd_65535] + packssdw m7, m4 + packssdw m5, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m3 + mova [dstq+strideq*0+16], m2 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m6 + lea dstq, [dstq+strideq*2] + PALIGNR m2, m3, 14, m4 + PALIGNR m3, m7, 14, m4 + pslldq m7, 2 + PALIGNR m6, m0, 14, m4 + PALIGNR m0, m5, 14, m4 + pslldq m5, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a + movu m0, [aq+mmsize*0-2] ; *a[0-6] + movu m1, [aq+mmsize*1-2] ; a[7-14] + movu m2, [aq+mmsize*2-2] ; a[15-22] + movu m3, [aq+mmsize*3-2] ; a[23-30] + mova m4, [aq+mmsize*3+0] ; a[24-31] + movu m5, [aq+mmsize*3-4] ; a[22-29] + LOWPASS 5, 3, 4 ; A[23-30] + SCRATCH 5, 8, rsp+0*mmsize + pavgw m3, m4 + mova m4, [aq+mmsize*2+0] ; a[16-23] + movu m6, [aq+mmsize*2-4] ; a[14-21] + LOWPASS 6, 2, 4 ; A[15-22] + SCRATCH 6, 9, rsp+1*mmsize + pavgw m2, m4 + mova m4, [aq+mmsize*1+0] ; a[8-15] + movu m7, [aq+mmsize*1-4] ; a[6-13] + LOWPASS 7, 1, 4 ; A[7-14] + SCRATCH 7, 10, rsp+2*mmsize + pavgw m1, m4 + mova m4, [aq+mmsize*0+0] ; a[0-7] + mova m5, [lq+mmsize*3+0] ; l[24-31] + PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5] + LOWPASS 6, 0, 4 ; #A[0-6] + SCRATCH 6, 11, rsp+3*mmsize + pavgw m4, m0 + PALIGNR m0, m5, 2, m7 ; l[25-31]* + movu m7, [lq+mmsize*3-2] ; l[23-30] + LOWPASS 0, 5, 7 ; L[24-31] + movu m5, [lq+mmsize*2-2] ; l[15-22] + mova m7, [lq+mmsize*2+0] ; l[16-23] + movu m6, [lq+mmsize*2+2] ; l[17-24] + LOWPASS 5, 7, 6 ; L[16-23] + psrld m7, m0, 16 + psrld m6, m5, 16 + pand m0, [pd_65535] + pand m5, [pd_65535] + packssdw m6, m7 + packssdw m5, m0 + SCRATCH 5, 12, rsp+4*mmsize + SCRATCH 6, 13, rsp+5*mmsize + movu m6, [lq+mmsize*1-2] ; l[7-14] + mova m0, [lq+mmsize*1+0] ; l[8-15] + movu m5, [lq+mmsize*1+2] ; l[9-16] + LOWPASS 6, 0, 5 ; L[8-15] + movu m0, [lq+mmsize*0+2] ; l[1-8] + pslldq m5, m0, 2 ; .l[1-7] + pslldq m7, m0, 4 ; ..l[1-6] + LOWPASS 0, 5, 7 + psrld m5, m6, 16 + psrld m7, m0, 16 + pand m6, [pd_65535] + pand m0, [pd_65535] + packssdw m7, m5 + packssdw m0, m6 + UNSCRATCH 6, 13, rsp+5*mmsize + DEFINE_ARGS dst, stride, stride16, cnt, stride17 + mov stride16q, strideq + mov cntd, 8 + shl stride16q, 4 +%if ARCH_X86_64 + lea stride17q, [stride16q+strideq] +%endif + +.loop: + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 +%if ARCH_X86_64 + mova [dstq+strideq*1+ 0], m11 + mova [dstq+strideq*1+16], m10 + mova [dstq+strideq*1+32], m9 + mova [dstq+strideq*1+48], m8 +%endif + mova [dstq+stride16q+ 0], m6 + mova [dstq+stride16q+16], m4 + mova [dstq+stride16q+32], m1 + mova [dstq+stride16q+48], m2 +%if ARCH_X86_64 + mova [dstq+stride17q+ 0], m12 + mova [dstq+stride17q+16], m11 + mova [dstq+stride17q+32], m10 + mova [dstq+stride17q+48], m9 +%endif + lea dstq, [dstq+strideq*2] + PALIGNR m3, m2, 14, m5 + PALIGNR m2, m1, 14, m5 + PALIGNR m1, m4, 14, m5 + PALIGNR m4, m6, 14, m5 + PALIGNR m6, m7, 14, m5 + pslldq m7, 2 +%if ARCH_X86_64 + PALIGNR m8, m9, 14, m5 + PALIGNR m9, m10, 14, m5 + PALIGNR m10, m11, 14, m5 + PALIGNR m11, m12, 14, m5 + PALIGNR m12, m0, 14, m5 + pslldq m0, 2 +%endif + dec cntd + jg .loop + +%if ARCH_X86_32 + UNSCRATCH 5, 12, rsp+4*mmsize + UNSCRATCH 4, 11, rsp+3*mmsize + UNSCRATCH 3, 10, rsp+2*mmsize + UNSCRATCH 2, 9, rsp+1*mmsize + UNSCRATCH 1, 8, rsp+0*mmsize + mov dstq, dstm + mov cntd, 8 + add dstq, strideq +.loop2: + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m3 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m1 + mova [dstq+stride16q+ 0], m5 + mova [dstq+stride16q+16], m4 + mova [dstq+stride16q+32], m3 + mova [dstq+stride16q+48], m2 + lea dstq, [dstq+strideq*2] + PALIGNR m1, m2, 14, m6 + PALIGNR m2, m3, 14, m6 + PALIGNR m3, m4, 14, m6 + PALIGNR m4, m5, 14, m6 + PALIGNR m5, m0, 14, m6 + pslldq m0, 2 + dec cntd + jg .loop2 +%endif + RET +%endmacro + +INIT_XMM sse2 +VR_FUNCS +INIT_XMM ssse3 +VR_FUNCS +INIT_XMM avx +VR_FUNCS + +%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a + movh m0, [lq] ; abcd +%if cpuflag(ssse3) + pshufb m0, [pb_0to7_67x4] ; abcddddd +%else + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 ; abcddddd +%endif + psrldq m1, m0, 2 ; bcddddd. + psrldq m2, m0, 4 ; cddddd.. + LOWPASS 2, 1, 0 ; BCDddd.. + pavgw m1, m0 ; abcddddd + SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd + PALIGNR m2, m1, 4, m0 ; bCcDdddd + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+strideq*0], m1 ; aBbC + movh [dstq+strideq*1], m2 ; bCcD + movhps [dstq+strideq*2], m1 ; cDdd + movhps [dstq+stride3q ], m2 ; dddd + RET + +cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a + mova m0, [lq] +%if cpuflag(ssse3) + mova m3, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m3 + LOWPASS 2, 1, 0 + pavgw m1, m0 + SBUTTERFLY wd, 1, 2, 0 + shufps m0, m1, m2, q1032 + pshufd m3, m2, q3332 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq *0], m1 + mova [dstq+strideq *2], m0 + mova [dstq+strideq *4], m2 + mova [dstq+stride3q*2], m3 + add dstq, strideq +%if cpuflag(avx) + vpalignr m1, m2, m1, 4 +%else + PALIGNR m0, m2, m1, 4, m3 + mova m1, m0 +%endif + pshufd m2, m2, q3321 + shufps m0, m1, m2, q1032 + pshufd m3, m2, q3332 + mova [dstq+strideq *0], m1 + mova [dstq+strideq *2], m0 + mova [dstq+strideq *4], m2 + mova [dstq+stride3q*2], m3 + RET + +cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a + mova m0, [lq] + mova m3, [lq+mmsize] + movu m1, [lq+2] + movu m2, [lq+4] + LOWPASS 2, 1, 0 + pavgw m1, m0 + SBUTTERFLY wd, 1, 2, 0 +%if cpuflag(ssse3) + mova m5, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m0, m4, m3, m5 + LOWPASS 4, 0, 3 + pavgw m3, m0 + SBUTTERFLY wd, 3, 4, 5 + pshufd m0, m0, q3333 + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq *0+ 0], m1 + mova [dstq+strideq *0+16], m2 + mova [dstq+strideq *4+ 0], m2 + mova [dstq+strideq *4+16], m3 + mova [dstq+strideq *8+ 0], m3 + mova [dstq+strideq *8+16], m4 + mova [dstq+stride3q*4+ 0], m4 + mova [dstq+stride3q*4+16], m0 + add dstq, strideq +%if cpuflag(avx) + vpalignr m1, m2, m1, 4 + vpalignr m2, m3, m2, 4 + vpalignr m3, m4, m3, 4 + vpalignr m4, m0, m4, 4 +%else + PALIGNR m5, m2, m1, 4, m6 + mova m1, m5 + PALIGNR m5, m3, m2, 4, m6 + mova m2, m5 + PALIGNR m5, m4, m3, 4, m6 + mova m3, m5 + PALIGNR m5, m0, m4, 4, m6 + mova m4, m5 +%endif + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \ + %1 * -mmsize * ARCH_X86_32, dst, stride, l, a + mova m2, [lq+mmsize*0+0] + movu m1, [lq+mmsize*0+2] + movu m0, [lq+mmsize*0+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + SBUTTERFLY wd, 1, 0, 2 + SCRATCH 1, 8, rsp+0*mmsize + mova m4, [lq+mmsize*1+0] + movu m3, [lq+mmsize*1+2] + movu m2, [lq+mmsize*1+4] + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 3, 2, 4 + mova m6, [lq+mmsize*2+0] + movu m5, [lq+mmsize*2+2] + movu m4, [lq+mmsize*2+4] + LOWPASS 4, 5, 6 + pavgw m5, m6 + SBUTTERFLY wd, 5, 4, 6 + mova m7, [lq+mmsize*3+0] + SCRATCH 0, 9, rsp+1*mmsize +%if cpuflag(ssse3) + mova m0, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m6, m7, m0 + LOWPASS 6, 1, 7 + pavgw m7, m1 + SBUTTERFLY wd, 7, 6, 0 + pshufd m1, m1, q3333 + UNSCRATCH 0, 9, rsp+1*mmsize + DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 + lea stride3q, [strideq*3] + lea stride4q, [strideq*4] + lea stride28q, [stride4q*8] + lea stride20q, [stride4q*5] + sub stride28q, stride4q + mov cntd, 4 + +.loop: +%if ARCH_X86_64 + SWAP 1, 8 +%else + mova [rsp+1*mmsize], m1 + mova m1, [rsp+0*mmsize] +%endif + mova [dstq+strideq *0+ 0], m1 + mova [dstq+strideq *0+16], m0 + mova [dstq+strideq *0+32], m3 + mova [dstq+strideq *0+48], m2 + mova [dstq+stride4q*1+ 0], m0 + mova [dstq+stride4q*1+16], m3 + mova [dstq+stride4q*1+32], m2 + mova [dstq+stride4q*1+48], m5 + mova [dstq+stride4q*2+ 0], m3 + mova [dstq+stride4q*2+16], m2 + mova [dstq+stride4q*2+32], m5 + mova [dstq+stride4q*2+48], m4 +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 +%else + SCRATCH 6, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + SCRATCH 7, 10, rsp+3*mmsize +%endif + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, m7 + mova m3, m6 + UNSCRATCH 6, 9, rsp+2*mmsize + SCRATCH 0, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 7, 10, rsp+3*mmsize + SCRATCH 3, 10, rsp+3*mmsize +%endif +%endif +%if ARCH_X86_64 + SWAP 1, 8 +%else + mova [rsp+0*mmsize], m1 + mova m1, [rsp+1*mmsize] +%endif + mova [dstq+stride3q*4+ 0], m2 + mova [dstq+stride3q*4+16], m5 + mova [dstq+stride3q*4+32], m4 + mova [dstq+stride3q*4+48], m7 + mova [dstq+stride4q*4+ 0], m5 + mova [dstq+stride4q*4+16], m4 + mova [dstq+stride4q*4+32], m7 + mova [dstq+stride4q*4+48], m6 + mova [dstq+stride20q + 0], m4 + mova [dstq+stride20q +16], m7 + mova [dstq+stride20q +32], m6 + mova [dstq+stride20q +48], m1 + mova [dstq+stride3q*8+ 0], m7 + mova [dstq+stride3q*8+16], m6 + mova [dstq+stride3q*8+32], m1 + mova [dstq+stride3q*8+48], m1 + mova [dstq+stride28q + 0], m6 + mova [dstq+stride28q +16], m1 + mova [dstq+stride28q +32], m1 + mova [dstq+stride28q +48], m1 +%if cpuflag(avx) + vpalignr m2, m5, m2, 4 + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 + vpalignr m6, m1, m6, 4 +%else + PALIGNR m0, m5, m2, 4, m3 + mova m2, m0 + PALIGNR m0, m4, m5, 4, m3 + mova m5, m0 + PALIGNR m0, m7, m4, 4, m3 + mova m4, m0 + PALIGNR m0, m6, m7, 4, m3 + mova m7, m0 + PALIGNR m0, m1, m6, 4, m3 + mova m6, m0 + UNSCRATCH 0, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 3, 10, rsp+3*mmsize +%endif +%endif + add dstq, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HU_FUNCS 4 +INIT_XMM ssse3 +HU_FUNCS 3 +INIT_XMM avx +HU_FUNCS 2 + +%macro HD_FUNCS 0 +cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a + movh m0, [lq] + movhps m0, [aq-2] + psrldq m1, m0, 2 + psrldq m2, m0, 4 + LOWPASS 2, 1, 0 + pavgw m1, m0 + punpcklwd m1, m2 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+stride3q ], m1 + movhps [dstq+strideq*1], m1 + movhlps m2, m2 + PALIGNR m2, m1, 4, m0 + movh [dstq+strideq*2], m2 + movhps [dstq+strideq*0], m2 + RET + +cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a + mova m0, [lq] + movu m1, [aq-2] + PALIGNR m2, m1, m0, 2, m3 + PALIGNR m3, m1, m0, 4, m4 + LOWPASS 3, 2, 0 + pavgw m2, m0 + SBUTTERFLY wd, 2, 3, 0 + psrldq m0, m1, 2 + psrldq m4, m1, 4 + LOWPASS 1, 0, 4 + DEFINE_ARGS dst8, mstride, cnt + lea dst8q, [dst8q+mstrideq*8] + neg mstrideq + mov cntd, 4 + +.loop: + add dst8q, mstrideq + mova [dst8q+mstrideq*0], m2 + mova [dst8q+mstrideq*4], m3 +%if cpuflag(avx) + vpalignr m2, m3, m2, 4 + vpalignr m3, m1, m3, 4 +%else + PALIGNR m0, m3, m2, 4, m4 + mova m2, m0 + PALIGNR m0, m1, m3, 4, m4 + mova m3, m0 +%endif + psrldq m1, 4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a + mova m2, [lq] + movu m1, [lq+2] + movu m0, [lq+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + mova m4, [lq+mmsize] + movu m5, [aq-2] + PALIGNR m3, m5, m4, 2, m6 + PALIGNR m2, m5, m4, 4, m6 + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 1, 0, 4 + SBUTTERFLY wd, 3, 2, 4 + mova m6, [aq] + movu m4, [aq+2] + LOWPASS 4, 6, 5 + movu m5, [aq+mmsize-2] + psrldq m6, m5, 2 + psrldq m7, m5, 4 + LOWPASS 5, 6, 7 + DEFINE_ARGS dst, mstride, mstride3, cnt + lea dstq, [dstq+mstrideq*8] + lea dstq, [dstq+mstrideq*8] + neg mstrideq + lea mstride3q, [mstrideq*3] + mov cntd, 4 + +.loop: + add dstq, mstrideq + mova [dstq+mstride3q*4+ 0], m2 + mova [dstq+mstride3q*4+16], m4 + mova [dstq+mstrideq *8+ 0], m3 + mova [dstq+mstrideq *8+16], m2 + mova [dstq+mstrideq *4+ 0], m0 + mova [dstq+mstrideq *4+16], m3 + mova [dstq+mstrideq *0+ 0], m1 + mova [dstq+mstrideq *0+16], m0 +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 + vpalignr m2, m4, m2, 4 + vpalignr m4, m5, m4, 4 +%else + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, m7 + mova m3, m6 + PALIGNR m6, m4, m2, 4, m7 + mova m2, m6 + PALIGNR m6, m5, m4, 4, m7 + mova m4, m6 +%endif + psrldq m5, 4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \ + 10 * -mmsize * ARCH_X86_32, dst, stride, l, a + mova m2, [lq+mmsize*0+0] + movu m1, [lq+mmsize*0+2] + movu m0, [lq+mmsize*0+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + SBUTTERFLY wd, 1, 0, 2 + mova m4, [lq+mmsize*1+0] + movu m3, [lq+mmsize*1+2] + movu m2, [lq+mmsize*1+4] + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 3, 2, 4 + SCRATCH 0, 8, rsp+0*mmsize + SCRATCH 1, 9, rsp+1*mmsize + SCRATCH 2, 10, rsp+2*mmsize + SCRATCH 3, 11, rsp+3*mmsize + mova m6, [lq+mmsize*2+0] + movu m5, [lq+mmsize*2+2] + movu m4, [lq+mmsize*2+4] + LOWPASS 4, 5, 6 + pavgw m5, m6 + SBUTTERFLY wd, 5, 4, 6 + mova m0, [lq+mmsize*3+0] + movu m1, [aq+mmsize*0-2] + PALIGNR m7, m1, m0, 2, m2 + PALIGNR m6, m1, m0, 4, m2 + LOWPASS 6, 7, 0 + pavgw m7, m0 + SBUTTERFLY wd, 7, 6, 0 + mova m2, [aq+mmsize*0+0] + movu m0, [aq+mmsize*0+2] + LOWPASS 0, 2, 1 + movu m1, [aq+mmsize*1-2] + mova m2, [aq+mmsize*1+0] + movu m3, [aq+mmsize*1+2] + LOWPASS 1, 2, 3 + SCRATCH 6, 12, rsp+6*mmsize + SCRATCH 7, 13, rsp+7*mmsize + movu m2, [aq+mmsize*2-2] + mova m3, [aq+mmsize*2+0] + movu m6, [aq+mmsize*2+2] + LOWPASS 2, 3, 6 + movu m3, [aq+mmsize*3-2] + psrldq m6, m3, 2 + psrldq m7, m3, 4 + LOWPASS 3, 6, 7 + UNSCRATCH 6, 12, rsp+6*mmsize + UNSCRATCH 7, 13, rsp+7*mmsize +%if ARCH_X86_32 + mova [rsp+4*mmsize], m4 + mova [rsp+5*mmsize], m5 + ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need + ; to do it again here +%endif + DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 + mov cntd, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + lea stride4q, [strideq*4] + lea stride28q, [stride4q*8] + lea stride20q, [stride4q*5] + sub stride28q, stride4q +%endif + add dstq, stride3q + + ; x86-32 doesn't have enough registers, so on that platform, we split + ; the loop in 2... Otherwise you spend most of the loop (un)scratching +.loop: +%if ARCH_X86_64 + mova [dstq+stride28q + 0], m9 + mova [dstq+stride28q +16], m8 + mova [dstq+stride28q +32], m11 + mova [dstq+stride28q +48], m10 + mova [dstq+stride3q*8+ 0], m8 + mova [dstq+stride3q*8+16], m11 + mova [dstq+stride3q*8+32], m10 + mova [dstq+stride3q*8+48], m5 + mova [dstq+stride20q + 0], m11 + mova [dstq+stride20q +16], m10 + mova [dstq+stride20q +32], m5 + mova [dstq+stride20q +48], m4 + mova [dstq+stride4q*4+ 0], m10 + mova [dstq+stride4q*4+16], m5 + mova [dstq+stride4q*4+32], m4 + mova [dstq+stride4q*4+48], m7 +%endif + mova [dstq+stride3q*4+ 0], m5 + mova [dstq+stride3q*4+16], m4 + mova [dstq+stride3q*4+32], m7 + mova [dstq+stride3q*4+48], m6 + mova [dstq+strideq* 8+ 0], m4 + mova [dstq+strideq* 8+16], m7 + mova [dstq+strideq* 8+32], m6 + mova [dstq+strideq* 8+48], m0 + mova [dstq+strideq* 4+ 0], m7 + mova [dstq+strideq* 4+16], m6 + mova [dstq+strideq* 4+32], m0 + mova [dstq+strideq* 4+48], m1 + mova [dstq+strideq* 0+ 0], m6 + mova [dstq+strideq* 0+16], m0 + mova [dstq+strideq* 0+32], m1 + mova [dstq+strideq* 0+48], m2 + sub dstq, strideq +%if cpuflag(avx) +%if ARCH_X86_64 + vpalignr m9, m8, m9, 4 + vpalignr m8, m11, m8, 4 + vpalignr m11, m10, m11, 4 + vpalignr m10, m5, m10, 4 +%endif + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 + vpalignr m6, m0, m6, 4 + vpalignr m0, m1, m0, 4 + vpalignr m1, m2, m1, 4 + vpalignr m2, m3, m2, 4 +%else +%if ARCH_X86_64 + PALIGNR m12, m8, m9, 4, m13 + mova m9, m12 + PALIGNR m12, m11, m8, 4, m13 + mova m8, m12 + PALIGNR m12, m10, m11, 4, m13 + mova m11, m12 + PALIGNR m12, m5, m10, 4, m13 + mova m10, m12 +%endif + SCRATCH 3, 12, rsp+8*mmsize, sh +%if notcpuflag(ssse3) + SCRATCH 2, 13, rsp+9*mmsize +%endif + PALIGNR m3, m4, m5, 4, m2 + mova m5, m3 + PALIGNR m3, m7, m4, 4, m2 + mova m4, m3 + PALIGNR m3, m6, m7, 4, m2 + mova m7, m3 + PALIGNR m3, m0, m6, 4, m2 + mova m6, m3 + PALIGNR m3, m1, m0, 4, m2 + mova m0, m3 +%if notcpuflag(ssse3) + UNSCRATCH 2, 13, rsp+9*mmsize + SCRATCH 0, 13, rsp+9*mmsize +%endif + PALIGNR m3, m2, m1, 4, m0 + mova m1, m3 + PALIGNR m3, reg_sh, m2, 4, m0 + mova m2, m3 +%if notcpuflag(ssse3) + UNSCRATCH 0, 13, rsp+9*mmsize +%endif + UNSCRATCH 3, 12, rsp+8*mmsize, sh +%endif + psrldq m3, 4 + dec cntd + jg .loop + +%if ARCH_X86_32 + UNSCRATCH 0, 8, rsp+0*mmsize + UNSCRATCH 1, 9, rsp+1*mmsize + UNSCRATCH 2, 10, rsp+2*mmsize + UNSCRATCH 3, 11, rsp+3*mmsize + mova m4, [rsp+4*mmsize] + mova m5, [rsp+5*mmsize] + mova m6, [rsp+6*mmsize] + mova m7, [rsp+7*mmsize] + DEFINE_ARGS dst, stride, stride5, stride3 + lea stride5q, [strideq*5] + lea dstq, [dstq+stride5q*4] + DEFINE_ARGS dst, stride, cnt, stride3 + mov cntd, 4 +.loop_2: + mova [dstq+stride3q*4+ 0], m1 + mova [dstq+stride3q*4+16], m0 + mova [dstq+stride3q*4+32], m3 + mova [dstq+stride3q*4+48], m2 + mova [dstq+strideq* 8+ 0], m0 + mova [dstq+strideq* 8+16], m3 + mova [dstq+strideq* 8+32], m2 + mova [dstq+strideq* 8+48], m5 + mova [dstq+strideq* 4+ 0], m3 + mova [dstq+strideq* 4+16], m2 + mova [dstq+strideq* 4+32], m5 + mova [dstq+strideq* 4+48], m4 + mova [dstq+strideq* 0+ 0], m2 + mova [dstq+strideq* 0+16], m5 + mova [dstq+strideq* 0+32], m4 + mova [dstq+strideq* 0+48], m7 + sub dstq, strideq +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 + vpalignr m2, m5, m2, 4 + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 +%else + SCRATCH 6, 12, rsp+8*mmsize, sh +%if notcpuflag(ssse3) + SCRATCH 7, 13, rsp+9*mmsize +%endif + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, m7 + mova m3, m6 + PALIGNR m6, m5, m2, 4, m7 + mova m2, m6 + PALIGNR m6, m4, m5, 4, m7 + mova m5, m6 +%if notcpuflag(ssse3) + UNSCRATCH 7, 13, rsp+9*mmsize + SCRATCH 5, 13, rsp+9*mmsize +%endif + PALIGNR m6, m7, m4, 4, m5 + mova m4, m6 + PALIGNR m6, reg_sh, m7, 4, m5 + mova m7, m6 +%if notcpuflag(ssse3) + UNSCRATCH 5, 13, rsp+9*mmsize +%endif + UNSCRATCH 6, 12, rsp+8*mmsize, sh +%endif + psrldq m6, 4 + dec cntd + jg .loop_2 +%endif + RET +%endmacro + +INIT_XMM sse2 +HD_FUNCS +INIT_XMM ssse3 +HD_FUNCS +INIT_XMM avx +HD_FUNCS -- cgit v1.2.3