From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- third_party/dav1d/src/x86/mc16_avx512.asm | 4858 +++++++++++++++++++++++++++++ 1 file changed, 4858 insertions(+) create mode 100644 third_party/dav1d/src/x86/mc16_avx512.asm (limited to 'third_party/dav1d/src/x86/mc16_avx512.asm') diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm new file mode 100644 index 0000000000..585ba53e08 --- /dev/null +++ b/third_party/dav1d/src/x86/mc16_avx512.asm @@ -0,0 +1,4858 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 + db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41 +spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17 + db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49 + db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25 + db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57 +spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 + db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45 +spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21 + db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53 + db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29 + db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61 +spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 + db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 + db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 + db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 +spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 + db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 + db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 + db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 +prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 + db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 +prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46 + db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110 + db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126 +prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78 + db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94 + db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110 + db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126 +spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 + db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46 +spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46 + db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 +spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 + db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78 + db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 + db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110 +spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78 + db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94 + db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110 + db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126 +spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46 + db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62 + db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110 + db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126 +spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78 + db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94 + db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110 + db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126 +spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38 + db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14 + db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46 +spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30 +spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21 + db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25 +w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 + db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 +w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 + db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 + db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94 + db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126 +w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 + db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 + db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94 + db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126 +w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 + db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 + db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110 + db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126 +warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37 + db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41 + db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45 + db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 +warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 + db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53 + db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57 + db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61 +warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 + db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 + db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 + db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 +deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7 +pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 + dd 1 +pw_2048: times 2 dw 2048 + dd 3 +pw_8192: times 2 dw 8192 +avg_shift: dw 5, 5, 3, 3 +pw_27615: times 2 dw 27615 +pw_32766: times 2 dw 32766 +warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 +warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 +warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29 +resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31 +resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13 +resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15 +resize_permE: dq 0, 2, 4, 6 +resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13 +resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 + db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 + +prep_hv_shift: dq 6, 4 +put_bilin_h_rnd: dw 8, 8, 10, 10 +prep_mul: dw 16, 16, 4, 4 +put_8tap_h_rnd: dd 34, 40 +prep_8tap_rnd: dd 128 - (8192 << 8) +warp_8x8_rnd_h: dd 512, 2048 +warp_8x8_rnd_v: dd 262144, 65536 +warp_8x8t_rnd_v: dd 16384 - (8192 << 15) +avg_round: dw -16400, -16400, -16388, -16388 +w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4) +mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6) +w_mask_round: dd 128, 64 +bidir_shift: dw 6, 6, 4, 4 + +pb_64: times 4 db 64 +pw_m512: times 2 dw -512 +pw_2: times 2 dw 2 +pw_64: times 2 dw 64 +pd_32: dd 32 +pd_63: dd 63 +pd_128: dd 128 +pd_640: dd 640 +pd_2176: dd 2176 +pd_16384: dd 16384 +pd_0_4: dd 0, 4 + +%define pw_16 prep_mul +%define pd_512 warp_8x8_rnd_h + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put) +%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep) + +BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter +cextern obmc_masks_avx2 +cextern resize_filter + +SECTION .text + +%if WIN64 +DECLARE_REG_TMP 4 +%else +DECLARE_REG_TMP 8 +%endif + +INIT_ZMM avx512icl +cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy + mov mxyd, r6m ; mx + lea r7, [put_avx512icl] + tzcnt t0d, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx t0d, word [r7+t0*2+table_offset(put,)] + add t0, r7 + jmp t0 +.put_w2: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu xmm0, [srcq+ssq*0] + movu xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], xmm0 + mova [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu ym0, [srcq+ssq*0] + movu ym1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], ym0 + mova [dstq+dsq*1], ym1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*1+64*0] + movu m3, [srcq+ssq*1+64*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+64*0], m0 + mova [dstq+dsq*0+64*1], m1 + mova [dstq+dsq*1+64*0], m2 + mova [dstq+dsq*1+64*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] + movu m2, [srcq+64*2] + movu m3, [srcq+64*3] + add srcq, ssq + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + vpbroadcastw m5, mxyd + mov mxyd, r7m ; my + vpbroadcastd m4, [pw_16] + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)] + mov r6d, r8m ; bitdepth_max + add t0, r7 + shr r6d, 11 + vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4] + jmp t0 +.h_w2: + movq xmm1, [srcq+ssq*0] + movhps xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw xmm0, xmm1, xm4 + psrlq xmm1, 16 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 4 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq xmm0, [srcq+ssq*0+0] + movhps xmm0, [srcq+ssq*1+0] + movq xmm1, [srcq+ssq*0+2] + movhps xmm1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw xmm0, xm4 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 4 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+ssq*0+0] + vinserti32x4 ym0, [srcq+ssq*1+0], 1 + movu xm1, [srcq+ssq*0+2] + vinserti32x4 ym1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw ym0, ym4 + pmullw ym1, ym5 + paddw ym0, ym6 + paddw ym0, ym1 + psrlw ym0, 4 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu ym0, [srcq+ssq*0+0] + vinserti32x8 m0, [srcq+ssq*1+0], 1 + movu ym1, [srcq+ssq*0+2] + vinserti32x8 m1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m6 + paddw m0, m1 + psrlw m0, 4 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + pmullw m0, m4, [srcq+ssq*0+0] + pmullw m2, m5, [srcq+ssq*0+2] + pmullw m1, m4, [srcq+ssq*1+0] + pmullw m3, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m0, m6 + paddw m1, m6 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + pmullw m0, m4, [srcq+64*0+0] + pmullw m2, m5, [srcq+64*0+2] + pmullw m1, m4, [srcq+64*1+0] + pmullw m3, m5, [srcq+64*1+2] + add srcq, ssq + paddw m0, m6 + paddw m1, m6 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + pmullw m0, m4, [srcq+64*0+0] + pmullw m7, m5, [srcq+64*0+2] + pmullw m1, m4, [srcq+64*1+0] + pmullw m8, m5, [srcq+64*1+2] + pmullw m2, m4, [srcq+64*2+0] + pmullw m9, m5, [srcq+64*2+2] + pmullw m3, m4, [srcq+64*3+0] + pmullw m10, m5, [srcq+64*3+2] + add srcq, ssq + REPX {paddw x, m6}, m0, m1, m2, m3 + paddw m0, m7 + paddw m1, m8 + paddw m2, m9 + paddw m3, m10 + REPX {psrlw x, 4}, m0, m1, m2, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)] + shl mxyd, 11 + vpbroadcastw m8, mxyd + add t0, r7 + jmp t0 +.v_w2: + movd xmm0, [srcq+ssq*0] +.v_w2_loop: + movd xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpckldq xmm2, xmm0, xmm1 + movd xmm0, [srcq+ssq*0] + punpckldq xmm1, xmm0 + psubw xmm1, xmm2 + pmulhrsw xmm1, xm8 + paddw xmm1, xmm2 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xmm0, [srcq+ssq*0] +.v_w4_loop: + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq xmm2, xmm0, xmm1 + movq xmm0, [srcq+ssq*0] + punpcklqdq xmm1, xmm0 + psubw xmm1, xmm2 + pmulhrsw xmm1, xm8 + paddw xmm1, xmm2 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu xmm0, [srcq+ssq*0] +.v_w8_loop: + vbroadcasti128 ymm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd ymm2, ymm0, ymm1, 0xf0 + vbroadcasti128 ymm0, [srcq+ssq*0] + vpblendd ymm1, ymm0, 0xf0 + psubw ymm1, ymm2 + pmulhrsw ymm1, ym8 + paddw ymm1, ymm2 + mova [dstq+dsq*0], xmm1 + vextracti128 [dstq+dsq*1], ymm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + vzeroupper + RET +.v_w16: + movu ym0, [srcq+ssq*0] +.v_w16_loop: + movu ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw ym1, ym3, ym0 + pmulhrsw ym1, ym8 + paddw ym1, ym0 + movu ym0, [srcq+ssq*0] + psubw ym2, ym0, ym3 + pmulhrsw ym2, ym8 + paddw ym2, ym3 + mova [dstq+dsq*0], ym1 + mova [dstq+dsq*1], ym2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: + movu m0, [srcq+ssq*0] +.v_w32_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m8 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m8 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + RET +.v_w64: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] +.v_w64_loop: + movu m2, [srcq+ssq*1+64*0] + movu m3, [srcq+ssq*1+64*1] + lea srcq, [srcq+ssq*2] + psubw m4, m2, m0 + pmulhrsw m4, m8 + paddw m4, m0 + movu m0, [srcq+ssq*0+64*0] + psubw m5, m3, m1 + pmulhrsw m5, m8 + paddw m5, m1 + movu m1, [srcq+ssq*0+64*1] + psubw m6, m0, m2 + pmulhrsw m6, m8 + psubw m7, m1, m3 + pmulhrsw m7, m8 + mova [dstq+dsq*0+64*0], m4 + mova [dstq+dsq*0+64*1], m5 + paddw m6, m2 + paddw m7, m3 + mova [dstq+dsq*1+64*0], m6 + mova [dstq+dsq*1+64*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*0+64*2] + movu m3, [srcq+ssq*0+64*3] +.v_w128_loop: + movu m4, [srcq+ssq*1+64*0] + movu m5, [srcq+ssq*1+64*1] + movu m6, [srcq+ssq*1+64*2] + movu m7, [srcq+ssq*1+64*3] + lea srcq, [srcq+ssq*2] + psubw m9, m4, m0 + pmulhrsw m9, m8 + paddw m9, m0 + movu m0, [srcq+ssq*0+64*0] + psubw m10, m5, m1 + pmulhrsw m10, m8 + paddw m10, m1 + movu m1, [srcq+ssq*0+64*1] + psubw m11, m6, m2 + pmulhrsw m11, m8 + paddw m11, m2 + movu m2, [srcq+ssq*0+64*2] + psubw m12, m7, m3 + pmulhrsw m12, m8 + paddw m12, m3 + movu m3, [srcq+ssq*0+64*3] + mova [dstq+dsq*0+64*0], m9 + psubw m9, m0, m4 + pmulhrsw m9, m8 + mova [dstq+dsq*0+64*1], m10 + psubw m10, m1, m5 + pmulhrsw m10, m8 + mova [dstq+dsq*0+64*2], m11 + psubw m11, m2, m6 + pmulhrsw m11, m8 + mova [dstq+dsq*0+64*3], m12 + psubw m12, m3, m7 + pmulhrsw m12, m8 + paddw m9, m4 + paddw m10, m5 + mova [dstq+dsq*1+64*0], m9 + mova [dstq+dsq*1+64*1], m10 + paddw m11, m6 + paddw m12, m7 + mova [dstq+dsq*1+64*2], m11 + mova [dstq+dsq*1+64*3], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w128_loop + RET +.hv: + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)] + shl mxyd, 11 + vpbroadcastd m6, [pw_2] + vpbroadcastw m7, mxyd + vpbroadcastd m8, [pw_8192] + add t0, r7 + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + vpbroadcastd m8, [pw_2048] +.hv_12bpc: + jmp t0 +.hv_w2: + vpbroadcastq xmm1, [srcq+ssq*0] + pmullw xmm0, xmm1, xm4 + psrlq xmm1, 16 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 +.hv_w2_loop: + movq xmm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm2, [srcq+ssq*0] + pmullw xmm1, xmm2, xm4 + psrlq xmm2, 16 + pmullw xmm2, xm5 + paddw xmm1, xm6 + paddw xmm1, xmm2 + psrlw xmm1, 2 ; 1 _ 2 _ + shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _ + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm7 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm8 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + pmullw xmm0, xm4, [srcq+ssq*0-8] + pmullw xmm1, xm5, [srcq+ssq*0-6] + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 +.hv_w4_loop: + movq xmm1, [srcq+ssq*1+0] + movq xmm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps xmm1, [srcq+ssq*0+0] + movhps xmm2, [srcq+ssq*0+2] + pmullw xmm1, xm4 + pmullw xmm2, xm5 + paddw xmm1, xm6 + paddw xmm1, xmm2 + psrlw xmm1, 2 ; 1 2 + shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1 + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm7 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm8 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + pmullw xmm0, xm4, [srcq+ssq*0+0] + pmullw xmm1, xm5, [srcq+ssq*0+2] + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 + vinserti32x4 ym0, xmm0, 1 +.hv_w8_loop: + movu xm1, [srcq+ssq*1+0] + movu xm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym1, [srcq+ssq*0+0], 1 + vinserti32x4 ym2, [srcq+ssq*0+2], 1 + pmullw ym1, ym4 + pmullw ym2, ym5 + paddw ym1, ym6 + paddw ym1, ym2 + psrlw ym1, 2 ; 1 2 + vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1 + mova ym0, ym1 + psubw ym1, ym2 + paddw ym1, ym1 + pmulhw ym1, ym7 + paddw ym1, ym2 + pmulhrsw ym1, ym8 + mova [dstq+dsq*0], xm1 + vextracti32x4 [dstq+dsq*1], ym1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + pmullw ym0, ym4, [srcq+ssq*0+0] + pmullw ym1, ym5, [srcq+ssq*0+2] + paddw ym0, ym6 + paddw ym0, ym1 + psrlw ym0, 2 + vinserti32x8 m0, ym0, 1 +.hv_w16_loop: + movu ym1, [srcq+ssq*1+0] + movu ym2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + vinserti32x8 m1, [srcq+ssq*0+0], 1 + vinserti32x8 m2, [srcq+ssq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m6 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + vshufi32x4 m2, m0, m1, q1032 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m7 + paddw m1, m2 + pmulhrsw m1, m8 + mova [dstq+dsq*0], ym1 + vextracti32x8 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: +.hv_w64: +.hv_w128: + movifnidn wd, wm + lea r6d, [hq+wq*8-256] + mov r4, srcq + mov r7, dstq +.hv_w32_loop0: + pmullw m0, m4, [srcq+ssq*0+0] + pmullw m1, m5, [srcq+ssq*0+2] + paddw m0, m6 + paddw m0, m1 + psrlw m0, 2 +.hv_w32_loop: + pmullw m3, m4, [srcq+ssq*1+0] + pmullw m1, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m3, m6 + paddw m3, m1 + psrlw m3, 2 + psubw m1, m3, m0 + paddw m1, m1 + pmulhw m1, m7 + paddw m1, m0 + pmullw m0, m4, [srcq+ssq*0+0] + pmullw m2, m5, [srcq+ssq*0+2] + paddw m0, m6 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m3 + paddw m2, m2 + pmulhw m2, m7 + paddw m2, m3 + pmulhrsw m1, m8 + pmulhrsw m2, m8 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w32_loop + add r4, 64 + add r7, 64 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .hv_w32_loop0 + RET + +cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep_avx512icl] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [r6-prep_avx512icl+pw_8192] + add wq, r6 + shr r5d, 11 + vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq xmm0, [srcq+strideq*0] + movhps xmm0, [srcq+strideq*1] + vpbroadcastq ymm1, [srcq+strideq*2] + vpbroadcastq ymm2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm0, ymm1, 0x30 + vpblendd ymm0, ymm2, 0xc0 + pmullw ymm0, ym4 + psubw ymm0, ym5 + mova [tmpq], ymm0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + vzeroupper + RET +.prep_w8: + movu xm0, [srcq+strideq*0] + vinserti32x4 ym0, [srcq+strideq*1], 1 + vinserti32x4 m0, [srcq+strideq*2], 2 + vinserti32x4 m0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + psubw m0, m5 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + movu ym1, [srcq+strideq*2] + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m4, [srcq+strideq*2] + pmullw m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 4 + jg .prep_w32 + RET +.prep_w64: + pmullw m0, m4, [srcq+strideq*0+64*0] + pmullw m1, m4, [srcq+strideq*0+64*1] + pmullw m2, m4, [srcq+strideq*1+64*0] + pmullw m3, m4, [srcq+strideq*1+64*1] + lea srcq, [srcq+strideq*2] + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 2 + jg .prep_w64 + RET +.prep_w128: + pmullw m0, m4, [srcq+64*0] + pmullw m1, m4, [srcq+64*1] + pmullw m2, m4, [srcq+64*2] + pmullw m3, m4, [srcq+64*3] + add srcq, strideq + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + dec hd + jg .prep_w128 + RET +.h: + vpbroadcastw m5, mxyd + mov mxyd, r6m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastd m6, [pw_32766] + psubw m4, m5 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m4, 2 + psllw m5, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + movu xm1, [srcq+strideq*0] + vinserti32x4 ym1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti32x4 ym2, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq ym0, ym1, ym2 + psrldq ym1, 2 + psrldq ym2, 2 + pmullw ym0, ym4 + punpcklqdq ym1, ym2 + pmullw ym1, ym5 + psubw ym0, ym6 + paddw ym0, ym1 + psraw ym0, 2 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+strideq*0+0] + movu xm1, [srcq+strideq*0+2] + vinserti32x4 ym0, [srcq+strideq*1+0], 1 + vinserti32x4 ym1, [srcq+strideq*1+2], 1 + vinserti32x4 m0, [srcq+strideq*2+0], 2 + vinserti32x4 m1, [srcq+strideq*2+2], 2 + vinserti32x4 m0, [srcq+stride3q +0], 3 + vinserti32x4 m1, [srcq+stride3q +2], 3 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m5 + psubw m0, m6 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8 + RET +.h_w16: + movu ym0, [srcq+strideq*0+0] + vinserti32x8 m0, [srcq+strideq*1+0], 1 + movu ym1, [srcq+strideq*0+2] + vinserti32x8 m1, [srcq+strideq*1+2], 1 + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5 + psubw m0, m6 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 2 + jg .h_w16 + RET +.h_w32: + pmullw m0, m4, [srcq+strideq*0+0] + pmullw m2, m5, [srcq+strideq*0+2] + pmullw m1, m4, [srcq+strideq*1+0] + pmullw m3, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m0, m6 + psubw m1, m6 + paddw m0, m2 + paddw m1, m3 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + sub hd, 2 + jg .h_w32 + RET +.h_w64: + pmullw m0, m4, [srcq+ 0] + pmullw m2, m5, [srcq+ 2] + pmullw m1, m4, [srcq+64] + pmullw m3, m5, [srcq+66] + add srcq, strideq + psubw m0, m6 + psubw m1, m6 + paddw m0, m2 + paddw m1, m3 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + dec hd + jg .h_w64 + RET +.h_w128: + pmullw m0, m4, [srcq+ 0] + pmullw m7, m5, [srcq+ 2] + pmullw m1, m4, [srcq+ 64] + pmullw m8, m5, [srcq+ 66] + pmullw m2, m4, [srcq+128] + pmullw m9, m5, [srcq+130] + pmullw m3, m4, [srcq+192] + pmullw m10, m5, [srcq+194] + add srcq, strideq + REPX {psubw x, m6}, m0, m1, m2, m3 + paddw m0, m7 + paddw m1, m8 + paddw m2, m9 + paddw m3, m10 + REPX {psraw x, 2}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + vpbroadcastw m9, mxyd + vpbroadcastd m8, [pw_16] + vpbroadcastd m10, [pw_32766] + add wq, r6 + lea stride3q, [strideq*3] + psubw m8, m9 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m8, 2 + psllw m9, 2 +.v_12bpc: + jmp wq +.v_w4: + movq xmm0, [srcq+strideq*0] +.v_w4_loop: + vpbroadcastq xmm2, [srcq+strideq*1] + vpbroadcastq ymm1, [srcq+strideq*2] + vpbroadcastq ymm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm2, ymm1, 0x30 + vpblendd ymm2, ymm3, 0xc0 + vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 + movq xmm0, [srcq+strideq*0] + valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 + pmullw ymm1, ym8 + pmullw ymm2, ym9 + psubw ymm1, ym10 + paddw ymm1, ymm2 + psraw ymm1, 2 + mova [tmpq], ymm1 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + movu xm0, [srcq+strideq*0] +.v_w8_loop: + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 + vinserti32x4 m1, [srcq+strideq*2], 2 + vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 + lea srcq, [srcq+strideq*4] + movu xm0, [srcq+strideq*0] + valignq m2, m0, m1, 2 ; 1 2 3 4 + pmullw m1, m8 + pmullw m2, m9 + psubw m1, m10 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + movu ym0, [srcq+strideq*0] +.v_w16_loop: + vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 + movu ym3, [srcq+strideq*2] + vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 + lea srcq, [srcq+strideq*4] + movu ym0, [srcq+strideq*0] + vshufi32x4 m3, m1, m3, q1032 ; 1 2 + vshufi32x4 m4, m2, m0, q1032 ; 3 4 + pmullw m1, m8 + pmullw m2, m8 + pmullw m3, m9 + pmullw m4, m9 + psubw m1, m10 + psubw m2, m10 + paddw m1, m3 + paddw m2, m4 + psraw m1, 2 + psraw m2, 2 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + movu m0, [srcq+strideq*0] +.v_w32_loop: + movu m3, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m1, m8, m0 + movu m0, [srcq+strideq*0] + pmullw m2, m8, m3 + pmullw m3, m9 + pmullw m4, m9, m0 + psubw m1, m10 + psubw m2, m10 + paddw m1, m3 + paddw m2, m4 + psraw m1, 2 + psraw m2, 2 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + RET +.v_w64: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] +.v_w64_loop: + add srcq, strideq + pmullw m2, m8, m0 + movu m0, [srcq+64*0] + pmullw m3, m8, m1 + movu m1, [srcq+64*1] + pmullw m4, m9, m0 + pmullw m5, m9, m1 + psubw m2, m10 + psubw m3, m10 + paddw m2, m4 + paddw m3, m5 + psraw m2, 2 + psraw m3, 2 + mova [tmpq+64*0], m2 + mova [tmpq+64*1], m3 + add tmpq, 64*2 + dec hd + jg .v_w64_loop + RET +.v_w128: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] + movu m2, [srcq+64*2] + movu m3, [srcq+64*3] +.v_w128_loop: + add srcq, strideq + pmullw m4, m8, m0 + movu m0, [srcq+64*0] + pmullw m5, m8, m1 + movu m1, [srcq+64*1] + pmullw m6, m8, m2 + movu m2, [srcq+64*2] + pmullw m7, m8, m3 + movu m3, [srcq+64*3] + pmullw m11, m9, m0 + pmullw m12, m9, m1 + pmullw m13, m9, m2 + pmullw m14, m9, m3 + REPX {psubw x, m10}, m4, m5, m6, m7 + paddw m4, m11 + paddw m5, m12 + paddw m6, m13 + paddw m7, m14 + REPX {psraw x, 2}, m4, m5, m6, m7 + mova [tmpq+64*0], m4 + mova [tmpq+64*1], m5 + mova [tmpq+64*2], m6 + mova [tmpq+64*3], m7 + add tmpq, 64*4 + dec hd + jg .v_w128_loop + RET +.hv: + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + vpbroadcastw m7, mxyd + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.hv_w4: + movq xmm0, [srcq+strideq*0+0] + movq xmm1, [srcq+strideq*0+2] + pmullw xmm0, xm4 + pmullw xmm1, xm5 + psubw xmm0, xm6 + paddw xmm0, xmm1 + psraw xmm0, 2 + vpbroadcastq ym0, xmm0 +.hv_w4_loop: + movu xm1, [srcq+strideq*1] + vinserti128 ym1, [srcq+stride3q ], 1 + movu xm2, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vinserti128 ym2, [srcq+strideq*0], 1 + punpcklqdq ym3, ym1, ym2 + psrldq ym1, 2 + psrldq ym2, 2 + pmullw ym3, ym4 + punpcklqdq ym1, ym2 + pmullw ym1, ym5 + psubw ym3, ym6 + paddw ym1, ym3 + psraw ym1, 2 ; 1 2 3 4 + valignq ym2, ym1, ym0, 3 ; 0 1 2 3 + mova ym0, ym1 + psubw ym1, ym2 + pmulhrsw ym1, ym7 + paddw ym1, ym2 + mova [tmpq], ym1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + pmullw xm0, xm4, [srcq+strideq*0+0] + pmullw xm1, xm5, [srcq+strideq*0+2] + psubw xm0, xm6 + paddw xm0, xm1 + psraw xm0, 2 + vinserti32x4 m0, xm0, 3 +.hv_w8_loop: + movu xm1, [srcq+strideq*1+0] + movu xm2, [srcq+strideq*1+2] + vinserti32x4 ym1, [srcq+strideq*2+0], 1 + vinserti32x4 ym2, [srcq+strideq*2+2], 1 + vinserti32x4 m1, [srcq+stride3q +0], 2 + vinserti32x4 m2, [srcq+stride3q +2], 2 + lea srcq, [srcq+strideq*4] + vinserti32x4 m1, [srcq+strideq*0+0], 3 + vinserti32x4 m2, [srcq+strideq*0+2], 3 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m6 + paddw m1, m2 + psraw m1, 2 ; 1 2 3 4 + valignq m2, m1, m0, 6 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m7 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + pmullw ym0, ym4, [srcq+strideq*0+0] + pmullw ym1, ym5, [srcq+strideq*0+2] + psubw ym0, ym6 + paddw ym0, ym1 + psraw ym0, 2 + vinserti32x8 m0, ym0, 1 +.hv_w16_loop: + movu ym1, [srcq+strideq*1+0] + movu ym2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + vinserti32x8 m1, [srcq+strideq*0+0], 1 + vinserti32x8 m2, [srcq+strideq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m6 + paddw m1, m2 + psraw m1, 2 ; 1 2 + vshufi32x4 m2, m0, m1, q1032 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m7 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: + pmullw m0, m4, [srcq+strideq*0+0] + pmullw m1, m5, [srcq+strideq*0+2] + psubw m0, m6 + paddw m0, m1 + psraw m0, 2 +.hv_w32_loop: + pmullw m3, m4, [srcq+strideq*1+0] + pmullw m1, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m3, m6 + paddw m3, m1 + psraw m3, 2 + psubw m1, m3, m0 + pmulhrsw m1, m7 + paddw m1, m0 + pmullw m0, m4, [srcq+strideq*0+0] + pmullw m2, m5, [srcq+strideq*0+2] + psubw m0, m6 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m3 + pmulhrsw m2, m7 + paddw m2, m3 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 2 + jg .hv_w32_loop + RET +.hv_w64: + pmullw m0, m4, [srcq+ 0] + pmullw m2, m5, [srcq+ 2] + pmullw m1, m4, [srcq+64] + pmullw m3, m5, [srcq+66] + psubw m0, m6 + psubw m1, m6 + paddw m0, m2 + paddw m1, m3 + psraw m0, 2 + psraw m1, 2 +.hv_w64_loop: + add srcq, strideq + pmullw m2, m4, [srcq+ 0] + pmullw m8, m5, [srcq+ 2] + pmullw m3, m4, [srcq+64] + pmullw m9, m5, [srcq+66] + psubw m2, m6 + psubw m3, m6 + paddw m2, m8 + paddw m3, m9 + psraw m2, 2 + psraw m3, 2 + psubw m8, m2, m0 + psubw m9, m3, m1 + pmulhrsw m8, m7 + pmulhrsw m9, m7 + paddw m8, m0 + mova m0, m2 + paddw m9, m1 + mova m1, m3 + mova [tmpq+64*0], m8 + mova [tmpq+64*1], m9 + add tmpq, 64*2 + dec hd + jg .hv_w64_loop + RET +.hv_w128: + pmullw m0, m4, [srcq+ 0] + pmullw m8, m5, [srcq+ 2] + pmullw m1, m4, [srcq+ 64] + pmullw m9, m5, [srcq+ 66] + pmullw m2, m4, [srcq+128] + pmullw m10, m5, [srcq+130] + pmullw m3, m4, [srcq+192] + pmullw m11, m5, [srcq+194] + REPX {psubw x, m6}, m0, m1, m2, m3 + paddw m0, m8 + paddw m1, m9 + paddw m2, m10 + paddw m3, m11 + REPX {psraw x, 2}, m0, m1, m2, m3 +.hv_w128_loop: + add srcq, strideq + pmullw m8, m4, [srcq+ 0] + pmullw m12, m5, [srcq+ 2] + pmullw m9, m4, [srcq+ 64] + pmullw m13, m5, [srcq+ 66] + pmullw m10, m4, [srcq+128] + pmullw m14, m5, [srcq+130] + pmullw m11, m4, [srcq+192] + pmullw m15, m5, [srcq+194] + REPX {psubw x, m6}, m8, m9, m10, m11 + paddw m8, m12 + paddw m9, m13 + paddw m10, m14 + paddw m11, m15 + REPX {psraw x, 2}, m8, m9, m10, m11 + psubw m12, m8, m0 + psubw m13, m9, m1 + psubw m14, m10, m2 + psubw m15, m11, m3 + REPX {pmulhrsw x, m7}, m12, m13, m14, m15 + paddw m12, m0 + mova m0, m8 + paddw m13, m1 + mova m1, m9 + mova [tmpq+64*0], m12 + mova [tmpq+64*1], m13 + paddw m14, m2 + mova m2, m10 + paddw m15, m3 + mova m3, m11 + mova [tmpq+64*2], m14 + mova [tmpq+64*3], m15 + add tmpq, 64*4 + dec hd + jg .hv_w128_loop + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v +cglobal %1_8tap_%2_16bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) +%endif +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%define buf rsp+stack_offset+8 ; shadow space +%else +DECLARE_REG_TMP 7, 8 +%define buf rsp-40 ; red zone +%endif + +MC_8TAP_FN put, sharp, SHARP, SHARP +MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN put, smooth, SMOOTH, SMOOTH +MC_8TAP_FN put, sharp_regular, SHARP, REGULAR +MC_8TAP_FN put, regular_sharp, REGULAR, SHARP +MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN put, regular, REGULAR, REGULAR + +cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my +%define base r8-put_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx512icl] + movifnidn wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 +%if WIN64 + pop r8 +%endif + jmp wq +.h_w2: + movzx mxd, mxb + sub srcq, 2 + mova ym2, [spel_h_shuf2a] + pmovsxbw xmm4, [base+subpel_filters+mxq*8] + pshufd xmm3, xmm4, q1111 + pshufd xmm4, xmm4, q2222 +.h_w2_loop: + movu xm1, [srcq+ssq*0] + vinserti32x4 ym1, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova xmm0, xm8 + vpermb ym1, ym2, ym1 + vpdpwssd xmm0, xmm3, xm1 + vextracti32x4 xm1, ym1, 1 + vpdpwssd xmm0, xmm4, xm1 + psrad xmm0, 6 + packusdw xmm0, xmm0 + pminsw xmm0, xm9 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + vbroadcasti32x4 ym4, [spel_h_shufA] + vbroadcasti32x4 ym5, [spel_h_shufB] + pshufd xmm0, xmm0, q2211 + vpbroadcastq ym6, xmm0 + vpermq ym7, ymm0, q1111 +.h_w4_loop: + movu xm2, [srcq+ssq*0] + vinserti32x4 ym2, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova ym0, ym8 + pshufb ym1, ym2, ym4 + vpdpwssd ym0, ym6, ym1 + pshufb ym2, ym5 + vpdpwssd ym0, ym7, ym2 + psrad ym0, 6 + vextracti32x4 xm1, ym0, 1 + packusdw xm0, xm1 + pminsw xmm0, xm0, xm9 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + mov r7d, r8m + vpbroadcastw m9, r8m + shr r7d, 11 + vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] + cmp wd, 4 + je .h_w4 + jl .h_w2 + shr mxd, 16 + sub srcq, 6 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + mova [buf], xmm0 + vpbroadcastd m10, xmm0 + vpbroadcastd m11, [buf+ 4] + vpbroadcastd m12, [buf+ 8] + vpbroadcastd m13, [buf+12] + sub wd, 16 + je .h_w16 + jg .h_w32 +.h_w8: + mova m4, [spel_h_shufA] + movu m5, [spel_h_shufB] + movu m6, [spel_h_shufC] + mova m7, [spel_h_shufD] +.h_w8_loop: + movu ym2, [srcq+ssq*0] + vinserti32x8 m2, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova m0, m8 + vpermb m1, m4, m2 + vpdpwssd m0, m10, m1 + vpermb m1, m5, m2 + vpdpwssd m0, m11, m1 + vpermb m1, m6, m2 + vpdpwssd m0, m12, m1 + vpermb m1, m7, m2 + vpdpwssd m0, m13, m1 + psrad m0, 6 + vextracti32x8 ym1, m0, 1 + packusdw ym0, ym1 + pminsw ym0, ym9 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8_loop + RET +.h_w16: + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] +.h_w16_loop: + movu ym2, [srcq+ssq*0+ 0] + vinserti32x8 m2, [srcq+ssq*1+ 0], 1 + movu ym3, [srcq+ssq*0+16] + vinserti32x8 m3, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + mova m0, m8 + mova m1, m8 + pshufb m4, m2, m6 + vpdpwssd m0, m10, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m12, m4 ; b2 + pshufb m4, m2, m7 + vpdpwssd m0, m11, m4 ; a1 + pshufb m4, m3, m7 + vpdpwssd m1, m13, m4 ; b3 + shufpd m2, m3, 0x55 + pshufb m4, m2, m6 + vpdpwssd m0, m12, m4 ; a2 + vpdpwssd m1, m10, m4 ; b0 + pshufb m2, m7 + vpdpwssd m0, m13, m2 ; a3 + vpdpwssd m1, m11, m2 ; b1 + psrad m0, 6 + psrad m1, 6 + packusdw m0, m1 + pminsw m0, m9 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + lea srcq, [srcq+wq*2] + vbroadcasti32x4 m6, [spel_h_shufA] + lea dstq, [dstq+wq*2] + vbroadcasti32x4 m7, [spel_h_shufB] + neg wq +.h_w32_loop0: + mov r6, wq +.h_w32_loop: + movu m2, [srcq+r6*2+ 0] + movu m3, [srcq+r6*2+ 8] + mova m0, m8 + mova m1, m8 + pshufb m4, m2, m6 + vpdpwssd m0, m10, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m10, m4 ; b0 + vpdpwssd m0, m12, m4 ; a2 + movu m4, [srcq+r6*2+16] + pshufb m3, m7 + vpdpwssd m1, m11, m3 ; b1 + vpdpwssd m0, m13, m3 ; a3 + pshufb m3, m4, m6 + vpdpwssd m1, m12, m3 ; b2 + pshufb m2, m7 + vpdpwssd m0, m11, m2 ; a1 + pshufb m4, m7 + vpdpwssd m1, m13, m4 ; b3 + psrad m0, 6 + psrad m1, 6 + packusdw m0, m1 + pminsw m0, m9 + mova [dstq+r6*2], m0 + add r6, 32 + jl .h_w32_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w32_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastd m10, [pd_32] + pmovsxbw xmm0, [base+subpel_filters+myq*8] + tzcnt r7d, wd + vpbroadcastw m11, r8m + lea r6, [ssq*3] + movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] + sub srcq, r6 + mova [rsp+stack_offset+8], xmm0 + vpbroadcastd m12, xmm0 + add r7, r8 + vpbroadcastd m13, [rsp+stack_offset+12] + vpbroadcastd m14, [rsp+stack_offset+16] + vpbroadcastd m15, [rsp+stack_offset+20] + jmp r7 +.v_w2: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + pinsrd xmm2, [srcq+ssq*2], 2 + add srcq, r6 + pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, r6 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklwd xmm3, xmm1 ; 45 56 + punpcklwd xmm1, xmm2, xmm4 ; 01 12 + punpckhwd xmm2, xmm4 ; 23 34 +.v_w2_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xmm5, xm10 + vpdpwssd xmm5, xm12, xmm1 ; a0 b0 + mova xmm1, xmm2 + vpdpwssd xmm5, xm13, xmm2 ; a1 b1 + mova xmm2, xmm3 + vpdpwssd xmm5, xm14, xmm3 ; a2 b2 + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm0, 0x02 ; 7 8 + punpcklwd xmm3, xmm4 ; 67 78 + vpdpwssd xmm5, xm15, xmm3 ; a3 b3 + psrad xmm5, 6 + packusdw xmm5, xmm5 + pminsw xmm5, xm11 + movd [dstq+dsq*0], xmm5 + pextrd [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xmm1, [srcq+ssq*0] + vpbroadcastq ymm0, [srcq+ssq*1] + vpbroadcastq ymm2, [srcq+ssq*2] + add srcq, r6 + vpbroadcastq ymm4, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm5, [srcq+ssq*2] + add srcq, r6 + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklwd ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm4, 0x30 + vpblendd ymm4, ymm3, 0x30 + punpcklwd ymm2, ymm4 ; 23 34 + vpblendd ymm3, ymm5, 0x30 + vpblendd ymm5, ymm0, 0x30 + punpcklwd ymm3, ymm5 ; 45 56 +.v_w4_loop: + vpbroadcastq ymm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova ymm4, ym10 + vpdpwssd ymm4, ym12, ymm1 ; a0 b0 + mova ymm1, ymm2 + vpdpwssd ymm4, ym13, ymm2 ; a1 b1 + mova ymm2, ymm3 + vpdpwssd ymm4, ym14, ymm3 ; a2 b2 + vpblendd ymm3, ymm0, ymm5, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm5, ymm0, 0x30 + punpcklwd ymm3, ymm5 ; 67 78 + vpdpwssd ymm4, ym15, ymm3 ; a3 b3 + psrad ymm4, 6 + vextracti128 xmm5, ymm4, 1 + packusdw xmm4, xmm5 + pminsw xmm4, xm11 + movq [dstq+dsq*0], xmm4 + movhps [dstq+dsq*1], xmm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + vbroadcasti32x4 m2, [srcq+ssq*2] + vinserti32x4 m1, m2, [srcq+ssq*0], 0 + vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 + add srcq, r6 + vinserti32x4 ym2, [srcq+ssq*0], 1 + vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 + mova m6, [spel_v_shuf8] + movu xm0, [srcq+ssq*1] + vinserti32x4 ym0, [srcq+ssq*2], 1 + add srcq, r6 + vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 + vpermb m1, m6, m1 ; 01 12 + vpermb m2, m6, m2 ; 23 34 + vpermb m3, m6, m0 ; 45 56 +.v_w8_loop: + vinserti32x4 m0, [srcq+ssq*1], 3 + lea srcq, [srcq+ssq*2] + movu xm5, [srcq+ssq*0] + mova m4, m10 + vpdpwssd m4, m12, m1 ; a0 b0 + mova m1, m2 + vshufi32x4 m0, m5, q1032 ; 6 7 8 + vpdpwssd m4, m13, m2 ; a1 b1 + mova m2, m3 + vpdpwssd m4, m14, m3 ; a2 b2 + vpermb m3, m6, m0 ; 67 78 + vpdpwssd m4, m15, m3 ; a3 b3 + psrad m4, 6 + vextracti32x8 ym5, m4, 1 + packusdw ym4, ym5 + pminsw ym4, ym11 + mova [dstq+dsq*0], xm4 + vextracti32x4 [dstq+dsq*1], ym4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti32x8 m1, [srcq+ssq*1] + vinserti32x8 m0, m1, [srcq+ssq*0], 0 + vinserti32x8 m1, [srcq+ssq*2], 1 + mova m8, [spel_v_shuf16] + add srcq, r6 + movu ym3, [srcq+ssq*0] + vinserti32x8 m3, [srcq+ssq*1], 1 + movu ym5, [srcq+ssq*2] + add srcq, r6 + vinserti32x8 m5, [srcq+ssq*0], 1 + vpermb m0, m8, m0 ; 01 + vpermb m1, m8, m1 ; 12 + vpermb m3, m8, m3 ; 34 + vpermb m5, m8, m5 ; 56 + mova m9, [deint_q_shuf] + vpshrdd m2, m1, m3, 16 ; 23 + vpshrdd m4, m3, m5, 16 ; 45 +.v_w16_loop: + mova m6, m10 + mova m7, m10 + vpdpwssd m6, m12, m0 ; a0 + mova m0, m2 + vpdpwssd m7, m12, m1 ; b0 + mova m1, m3 + vpdpwssd m6, m13, m2 ; a1 + mova m2, m4 + vpdpwssd m7, m13, m3 ; b1 + mova m3, m5 + vpdpwssd m6, m14, m4 ; a2 + mova m4, m5 + vpdpwssd m7, m14, m5 ; b2 + movu ym5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m5, [srcq+ssq*0], 1 + vpermb m5, m8, m5 ; 78 + vpshrdd m4, m5, 16 ; 67 + vpdpwssd m6, m15, m4 ; a3 + vpdpwssd m7, m15, m5 ; b3 + psrad m6, 6 + psrad m7, 6 + packusdw m6, m7 + pminsw m6, m11 + vpermq m6, m9, m6 + mova [dstq+dsq*0], ym6 + vextracti32x8 [dstq+dsq*1], m6, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + movaps [rsp+stack_offset+8], xmm6 +%endif + lea wd, [hq+wq*8-256] + mov r7, srcq + mov r8, dstq +.v_w32_loop0: + movu m16, [srcq+ssq*0] + movu m17, [srcq+ssq*1] + movu m18, [srcq+ssq*2] + add srcq, r6 + movu m19, [srcq+ssq*0] + movu m20, [srcq+ssq*1] + movu m21, [srcq+ssq*2] + add srcq, r6 + movu m22, [srcq+ssq*0] + punpcklwd m0, m16, m17 ; 01l + punpckhwd m16, m17 ; 01h + punpcklwd m1, m17, m18 ; 12l + punpckhwd m17, m18 ; 12h + punpcklwd m2, m18, m19 ; 23l + punpckhwd m18, m19 ; 23h + punpcklwd m3, m19, m20 ; 34l + punpckhwd m19, m20 ; 34h + punpcklwd m4, m20, m21 ; 45l + punpckhwd m20, m21 ; 45h + punpcklwd m5, m21, m22 ; 56l + punpckhwd m21, m22 ; 56h +.v_w32_loop: + mova m6, m10 + vpdpwssd m6, m12, m0 ; a0l + mova m8, m10 + vpdpwssd m8, m12, m16 ; a0h + mova m7, m10 + vpdpwssd m7, m12, m1 ; b0l + mova m9, m10 + vpdpwssd m9, m12, m17 ; b0h + mova m0, m2 + vpdpwssd m6, m13, m2 ; a1l + mova m16, m18 + vpdpwssd m8, m13, m18 ; a1h + mova m1, m3 + vpdpwssd m7, m13, m3 ; b1l + mova m17, m19 + vpdpwssd m9, m13, m19 ; b1h + mova m2, m4 + vpdpwssd m6, m14, m4 ; a2l + mova m18, m20 + vpdpwssd m8, m14, m20 ; a2h + mova m3, m5 + vpdpwssd m7, m14, m5 ; b2l + mova m19, m21 + vpdpwssd m9, m14, m21 ; b2h + movu m21, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m4, m22, m21 ; 67l + punpckhwd m20, m22, m21 ; 67h + movu m22, [srcq+ssq*0] + vpdpwssd m6, m15, m4 ; a3l + vpdpwssd m8, m15, m20 ; a3h + punpcklwd m5, m21, m22 ; 78l + punpckhwd m21, m22 ; 78h + vpdpwssd m7, m15, m5 ; b3l + vpdpwssd m9, m15, m21 ; b3h + REPX {psrad x, 6}, m6, m8, m7, m9 + packusdw m6, m8 + packusdw m7, m9 + pminsw m6, m11 + pminsw m7, m11 + mova [dstq+dsq*0], m6 + mova [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + add r7, 64 + add r8, 64 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .v_w32_loop0 +%if WIN64 + movaps xmm6, [rsp+stack_offset+8] +%endif + vzeroupper + RET +.hv: + vpbroadcastw m11, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + pmovsxbw xmm1, [base+subpel_filters+myq*8] + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 + test dword r8m, 0x800 + jnz .hv_12bit + vpbroadcastd m10, [pd_2176] + psllw xmm0, 6 + jmp .hv_main +.hv_12bit: + vpbroadcastd m10, [pd_640] + psllw xmm0, 4 + psllw xmm1, 2 +.hv_main: + mova [buf+ 0], xmm0 + mova [buf+16], xmm1 + vpbroadcastd m8, [buf+ 4] + vpbroadcastd m9, [buf+ 8] + vpbroadcastd ym12, xmm1 + vpbroadcastd ym13, [buf+20] + vpbroadcastd ym14, [buf+24] + vpbroadcastd ym15, [buf+28] + movu xm4, [srcq+ssq*0] + vinserti32x4 ym4, [srcq+ssq*1], 1 + vinserti32x4 m4, [srcq+ssq*2], 2 + add srcq, r6 + vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 + movu xm0, [srcq+ssq*1] + vinserti32x4 ym0, [srcq+ssq*2], 1 + add srcq, r6 + vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 + cmp wd, 4 + je .hv_w4 + vbroadcasti32x4 m2, [spel_h_shufA] + mova m3, [spel_h_shuf2b] + mova ym6, [spel_h_shuf2a] + mova xm7, [spel_shuf2] + mova m1, m10 + pshufb m4, m2 + pshufb m0, m2 + punpcklqdq m2, m4, m0 + vpdpwssd m1, m8, m2 ; 04 15 26 3_ + punpckhqdq m4, m0 + vpdpwssd m1, m9, m4 + vpermb m1, m3, m1 ; 01 12 + vextracti32x4 xm2, ym1, 1 ; 23 34 + vextracti32x4 xm3, m1, 2 ; 45 56 +.hv_w2_loop: + movu xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym5, [srcq+ssq*0], 1 + mova xm4, xm10 + vpermb ym5, ym6, ym5 + pmaddwd xmm0, xm12, xm1 ; a0 b0 + vpdpwssd xm4, xm8, xm5 + vextracti32x4 xm5, ym5, 1 + mova xm1, xm2 + vpdpwssd xmm0, xm13, xm2 ; a1 b1 + vpdpwssd xm4, xm9, xm5 ; 7 8 + mova xm2, xm3 + vpdpwssd xmm0, xm14, xm3 ; a2 b2 + vpermt2b xm3, xm7, xm4 ; 67 78 + vpdpwssd xmm0, xm15, xm3 ; a3 b3 + psrad xmm0, 10 + packusdw xmm0, xmm0 + pminsw xmm0, xm11 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + vbroadcasti32x4 m19, [spel_h_shufA] + vbroadcasti32x4 m20, [spel_h_shufB] + mova ym6, [spel_shuf4a] + mova ym7, [spel_shuf4b] + mova m2, m10 + mova m3, m10 + pshufb m1, m4, m19 + vpdpwssd m2, m8, m1 + pshufb m1, m0, m19 + vpdpwssd m3, m8, m1 + pshufb m4, m20 + vpdpwssd m2, m9, m4 + pshufb m0, m20 + vpdpwssd m3, m9, m0 + vpermb m1, m6, m2 ; 01 12 + vshufi32x4 m2, m3, q1032 + vpermb m3, m6, m3 ; 45 56 + vpermb m2, m6, m2 ; 23 34 +.hv_w4_loop: + movu xm18, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 ym18, [srcq+ssq*0], 1 + mova ym4, ym10 + pshufb ym17, ym18, ym19 + pmaddwd ym16, ym12, ym1 ; a0 b0 + vpdpwssd ym4, ym8, ym17 + pshufb ym18, ym20 + mova ym1, ym2 + vpdpwssd ym16, ym13, ym2 ; a1 b1 + vpdpwssd ym4, ym9, ym18 ; 7 8 + mova ym2, ym3 + vpdpwssd ym16, ym14, ym3 ; a2 b2 + vpermt2b ym3, ym7, ym4 ; 67 78 + vpdpwssd ym16, ym15, ym3 ; a3 b3 + psrad ym16, 10 + vextracti128 xm17, ym16, 1 + packusdw xm16, xm17 + pminsw xm16, xm11 + movq [dstq+dsq*0], xm16 + movhps [dstq+dsq*1], xm16 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + vzeroupper + RET +.hv_w8: + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + pmovsxbw xmm1, [base+subpel_filters+myq*8] + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + test dword r8m, 0x800 + jnz .hv_w8_12bit + vpbroadcastd m10, [pd_2176] + psllw xmm0, 6 + jmp .hv_w8_main +.hv_w8_12bit: + vpbroadcastd m10, [pd_640] + psllw xmm0, 4 + psllw xmm1, 2 +.hv_w8_main: + mova [buf+ 0], xmm0 + mova [buf+16], xmm1 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [buf+ 4] + vpbroadcastd m14, [buf+ 8] + vpbroadcastd m15, [buf+12] + vpbroadcastd m16, xmm1 + vpbroadcastd m17, [buf+20] + vpbroadcastd m18, [buf+24] + vpbroadcastd m19, [buf+28] + cmp wd, 16 + je .hv_w16 + jg .hv_w32 + mova m5, [spel_h_shufA] + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 + movu ym9, [srcq+ssq*2] + add srcq, r6 + vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 + movu ym20, [srcq+ssq*1] + vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 + add srcq, r6 + movu ym21, [srcq+ssq*0] ; 6 + movu m6, [spel_h_shufB] + movu m7, [spel_h_shufC] + vpermb m8, m5, m0 + mova m1, m10 + vpdpwssd m1, m12, m8 ; a0 b0 + vpermb m8, m5, m9 + mova m2, m10 + vpdpwssd m2, m12, m8 ; c0 d0 + vpermb m8, m5, m20 + mova m3, m10 + vpdpwssd m3, m12, m8 ; e0 f0 + vpermb m8, m5, m21 + mova m4, m10 + vpdpwssd m4, m12, m8 ; g0 + vpermb m8, m6, m0 + vpdpwssd m1, m13, m8 ; a1 b1 + vpermb m8, m6, m9 + vpdpwssd m2, m13, m8 ; c1 d1 + vpermb m8, m6, m20 + vpdpwssd m3, m13, m8 ; e1 f1 + vpermb m8, m6, m21 + vpdpwssd m4, m13, m8 ; g1 + vpermb m8, m7, m0 + vpdpwssd m1, m14, m8 ; a2 b2 + vpermb m8, m7, m9 + vpdpwssd m2, m14, m8 ; c2 d2 + vpermb m8, m7, m20 + vpdpwssd m3, m14, m8 ; e2 f2 + vpermb m8, m7, m21 + vpdpwssd m4, m14, m8 ; g2 + mova m8, [spel_h_shufD] + vpermb m0, m8, m0 + vpdpwssd m1, m15, m0 ; a3 b3 + mova m0, [spel_shuf8a] + vpermb m9, m8, m9 + vpdpwssd m2, m15, m9 ; c3 d3 + mova m9, [spel_shuf8b] + vpermb m20, m8, m20 + vpdpwssd m3, m15, m20 ; e3 f3 + vpermb m21, m8, m21 + vpdpwssd m4, m15, m21 ; g3 + vpermt2b m1, m0, m2 ; 01 12 + vpermt2b m2, m0, m3 ; 23 34 + vpermt2b m3, m0, m4 ; 45 56 +.hv_w8_loop: + movu ym0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m0, [srcq+ssq*0], 1 + mova m4, m10 + vpermb m21, m5, m0 + vpdpwssd m4, m12, m21 ; h0 i0 + vpermb m21, m6, m0 + pmaddwd m20, m16, m1 ; A0 B0 + vpdpwssd m4, m13, m21 ; h1 i1 + vpermb m21, m7, m0 + mova m1, m2 + vpdpwssd m20, m17, m2 ; A1 B1 + vpdpwssd m4, m14, m21 ; h2 i2 + vpermb m21, m8, m0 + mova m2, m3 + vpdpwssd m20, m18, m3 ; A2 B2 + vpdpwssd m4, m15, m21 ; h3 i3 + vpermt2b m3, m9, m4 ; 67 78 + vpdpwssd m20, m19, m3 ; A3 B3 + psrad m20, 10 + vextracti32x8 ym21, m20, 1 + packusdw ym20, ym21 + pminsw ym20, ym11 + mova [dstq+dsq*0], xm20 + vextracti128 [dstq+dsq*1], ym20, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + vzeroupper + RET +.hv_w16: + WIN64_SPILL_XMM 26 + vbroadcasti32x8 m5, [srcq+ssq*0+ 8] + vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 + vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 + movu ym6, [srcq+ssq*1+ 0] + movu ym7, [srcq+ssq*1+16] + vinserti32x8 m6, [srcq+ssq*2+ 0], 1 + vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 + add srcq, r6 + movu ym22, [srcq+ssq*0+ 0] + movu ym23, [srcq+ssq*0+16] + vinserti32x8 m22, [srcq+ssq*1+ 0], 1 + vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4 + movu ym24, [srcq+ssq*2+ 0] + movu ym25, [srcq+ssq*2+16] + add srcq, r6 + vinserti32x8 m24, [srcq+ssq*0+ 0], 1 + vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6 + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + mova m9, [spel_shuf16] + pshufb m0, m4, m20 + mova m1, m10 + vpdpwssd m1, m12, m0 ; a0 + pshufb m0, m6, m20 + mova m2, m10 + vpdpwssd m2, m12, m0 ; b0 + pshufb m0, m7, m20 + mova m3, m10 + vpdpwssd m3, m14, m0 ; c2 + pshufb m0, m4, m21 + vpdpwssd m1, m13, m0 ; a1 + pshufb m0, m6, m21 + vpdpwssd m2, m13, m0 ; b1 + pshufb m0, m7, m21 + vpdpwssd m3, m15, m0 ; c3 + pshufb m0, m5, m20 + vpdpwssd m1, m14, m0 ; a2 + shufpd m6, m7, 0x55 + pshufb m7, m6, m20 + vpdpwssd m2, m14, m7 ; b2 + vpdpwssd m3, m12, m7 ; c0 + pshufb m5, m21 + vpdpwssd m1, m15, m5 ; a3 + pshufb m6, m21 + vpdpwssd m2, m15, m6 ; b3 + vpdpwssd m3, m13, m6 ; c1 + pshufb m0, m22, m20 + mova m4, m10 + vpdpwssd m4, m12, m0 ; d0 + pshufb m0, m23, m20 + mova m5, m10 + vpdpwssd m5, m14, m0 ; e2 + pshufb m0, m24, m20 + mova m6, m10 + vpdpwssd m6, m12, m0 ; f0 + pshufb m0, m25, m20 + mova m7, m10 + vpdpwssd m7, m14, m0 ; g2 + pshufb m0, m22, m21 + vpdpwssd m4, m13, m0 ; d1 + pshufb m0, m23, m21 + vpdpwssd m5, m15, m0 ; e3 + pshufb m0, m24, m21 + vpdpwssd m6, m13, m0 ; f1 + pshufb m0, m25, m21 + vpdpwssd m7, m15, m0 ; g3 + shufpd m22, m23, 0x55 + pshufb m23, m22, m20 + vpdpwssd m4, m14, m23 ; d2 + vpdpwssd m5, m12, m23 ; e0 + shufpd m24, m25, 0x55 + pshufb m25, m24, m20 + vpdpwssd m6, m14, m25 ; f2 + vpdpwssd m7, m12, m25 ; g0 + pshufb m22, m21 + vpdpwssd m4, m15, m22 ; d3 + vpdpwssd m5, m13, m22 ; e1 + pshufb m24, m21 + vpdpwssd m6, m15, m24 ; f3 + vpdpwssd m7, m13, m24 ; g1 + pslldq m1, 1 + vpermt2b m2, m9, m3 ; 12 + vpermt2b m4, m9, m5 ; 34 + vpermt2b m6, m9, m7 ; 56 + vpshrdd m1, m2, 16 ; 01 + vpshrdd m3, m2, m4, 16 ; 23 + vpshrdd m5, m4, m6, 16 ; 45 +.hv_w16_loop: + movu ym24, [srcq+ssq*1+ 0] + movu ym25, [srcq+ssq*1+16] + lea srcq, [srcq+ssq*2] + vinserti32x8 m24, [srcq+ssq*0+ 0], 1 + vinserti32x8 m25, [srcq+ssq*0+16], 1 + mova m7, m10 + mova m8, m10 + pshufb m0, m24, m20 + vpdpwssd m7, m12, m0 ; h0 + pshufb m0, m25, m20 + vpdpwssd m8, m14, m0 ; i2 + pmaddwd m22, m16, m1 ; A0 + mova m1, m3 + pmaddwd m23, m16, m2 ; B0 + mova m2, m4 + pshufb m0, m24, m21 + vpdpwssd m7, m13, m0 ; h1 + pshufb m0, m25, m21 + vpdpwssd m8, m15, m0 ; i3 + vpdpwssd m22, m17, m3 ; A1 + mova m3, m5 + vpdpwssd m23, m17, m4 ; B1 + mova m4, m6 + shufpd m24, m25, 0x55 + pshufb m25, m24, m20 + vpdpwssd m7, m14, m25 ; h2 + vpdpwssd m8, m12, m25 ; i0 + vpdpwssd m22, m18, m5 ; A2 + vpdpwssd m23, m18, m6 ; B2 + pshufb m24, m21 + vpdpwssd m7, m15, m24 ; h3 + vpdpwssd m8, m13, m24 ; i1 + vpermt2b m7, m9, m8 ; 78 + vpshrdd m5, m6, m7, 16 ; 67 + vpdpwssd m22, m19, m5 ; A3 + vpdpwssd m23, m19, m7 ; B3 + mova m6, m7 + psrad m22, 10 + psrad m23, 10 + vshufi32x4 m0, m22, m23, q3232 + vinserti32x8 m22, ym23, 1 + packusdw m22, m0 + pminsw m22, m11 + mova [dstq+dsq*0], ym22 + vextracti32x8 [dstq+dsq*1], m22, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 32 + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + mova m22, [spel_shuf32] + lea wd, [hq+wq*8-256] + mov r7, srcq + mov r8, dstq +.hv_w32_loop0: + movu m6, [srcq+ssq*0+ 0] + movu m7, [srcq+ssq*0+ 8] + movu m8, [srcq+ssq*0+16] + mova m0, m10 + mova m23, m10 + pshufb m9, m6, m20 + vpdpwssd m0, m12, m9 ; a0l + pshufb m9, m7, m20 + vpdpwssd m23, m12, m9 ; a0h + vpdpwssd m0, m14, m9 ; a2l + pshufb m7, m21 + vpdpwssd m23, m13, m7 ; a1h + vpdpwssd m0, m15, m7 ; a3l + pshufb m7, m8, m20 + vpdpwssd m23, m14, m7 ; a2h + pshufb m6, m21 + vpdpwssd m0, m13, m6 ; a1l + pshufb m8, m21 + vpdpwssd m23, m15, m8 ; a3h +%macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2] + movu m6, [srcq+%3*%4+ 0] + movu m7, [srcq+%3*%4+ 8] + movu m8, [srcq+%3*%4+16] +%if %4 == 2 + add srcq, r6 +%endif + movu m29, [srcq+%3*%5+ 0] + movu m30, [srcq+%3*%5+ 8] + movu m31, [srcq+%3*%5+16] +%if %5 == 2 + add srcq, r6 +%endif + mova m%1, m10 + mova m9, m10 + pshufb m%2, m6, m20 + vpdpwssd m%1, m12, m%2 ; x0l + pshufb m%2, m29, m20 + vpdpwssd m9, m12, m%2 ; y0l + pshufb m6, m21 + vpdpwssd m%1, m13, m6 ; x1l + pshufb m29, m21 + vpdpwssd m9, m13, m29 ; y1l + pshufb m6, m7, m20 + mova m%2, m10 + vpdpwssd m%2, m12, m6 ; x0h + pshufb m29, m30, m20 + vpdpwssd m%1, m14, m6 ; y2l + mova m6, m10 + vpdpwssd m6, m12, m29 ; x0h + pshufb m7, m21 + vpdpwssd m9, m14, m29 ; y2l + pshufb m30, m21 + vpdpwssd m%2, m13, m7 ; x1h + vpdpwssd m%1, m15, m7 ; x3l + pshufb m7, m8, m20 + vpdpwssd m6, m13, m30 ; y1h + vpdpwssd m9, m15, m30 ; y3l + pshufb m30, m31, m20 + vpdpwssd m%2, m14, m7 ; x2h + pshufb m8, m21 + vpdpwssd m6, m14, m30 ; y2h + pshufb m31, m21 + vpdpwssd m%2, m15, m8 ; x3h + vpdpwssd m6, m15, m31 ; y3h +%if %1 == 1 + vpermt2b m0, m22, m%1 ; 01l + vpermt2b m23, m22, m%2 ; 01h +%endif + vpermt2b m%1, m22, m9 ; xyl + vpermt2b m%2, m22, m6 ; xyh +%endmacro + PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12 + PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34 + PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56 + vpshrdd m2, m1, m3, 16 ; 23l + vpshrdd m25, m24, m26, 16 ; 23h + vpshrdd m4, m3, m5, 16 ; 45l + vpshrdd m27, m26, m28, 16 ; 45h +.hv_w32_loop: + movu m7, [srcq+ssq*1+ 0] + movu m9, [srcq+ssq*2+ 0] + movu m6, [srcq+ssq*1+ 8] + movu m8, [srcq+ssq*2+ 8] + mova m29, m10 + mova m31, m10 + pshufb m30, m7, m20 + vpdpwssd m29, m12, m30 ; h0l + pshufb m30, m9, m20 + vpdpwssd m31, m12, m30 ; i0l + pshufb m7, m21 + vpdpwssd m29, m13, m7 ; h1l + pshufb m9, m21 + vpdpwssd m31, m13, m9 ; i1l + pshufb m7, m6, m20 + vpdpwssd m29, m14, m7 ; h2l + pshufb m9, m8, m20 + vpdpwssd m31, m14, m9 ; i2l + pshufb m6, m21 + vpdpwssd m29, m15, m6 ; h3l + pshufb m8, m21 + vpdpwssd m31, m15, m8 ; i3l + mova m30, m10 + vpdpwssd m30, m12, m7 ; h0h + movu m7, [srcq+ssq*1+16] + lea srcq, [srcq+ssq*2] + vpermt2b m29, m22, m31 ; 78l + mova m31, m10 + vpdpwssd m31, m12, m9 ; i0h + movu m9, [srcq+ssq*0+16] + vpdpwssd m30, m13, m6 ; h1h + pshufb m6, m7, m20 + vpdpwssd m31, m13, m8 ; i1h + pshufb m8, m9, m20 + vpdpwssd m30, m14, m6 ; h2h + pmaddwd m6, m16, m0 ; A0l + pshufb m7, m21 + vpdpwssd m31, m14, m8 ; i2h + pmaddwd m8, m16, m23 ; A0h + pshufb m9, m21 + vpdpwssd m30, m15, m7 ; h3h + pmaddwd m7, m16, m1 ; B0l + vpdpwssd m31, m15, m9 ; i3h + pmaddwd m9, m16, m24 ; B0h + mova m0, m2 + vpdpwssd m6, m17, m2 ; A1l + mova m23, m25 + vpdpwssd m8, m17, m25 ; A1h + mova m1, m3 + vpdpwssd m7, m17, m3 ; B1l + mova m24, m26 + vpdpwssd m9, m17, m26 ; B1h + vpermt2b m30, m22, m31 ; 78h + vpdpwssd m6, m18, m4 ; A2l + mova m2, m4 + vpdpwssd m8, m18, m27 ; A2h + mova m25, m27 + vpdpwssd m7, m18, m5 ; B2l + mova m3, m5 + vpdpwssd m9, m18, m28 ; B2h + mova m26, m28 + vpshrdd m4, m5, m29, 16 ; 67l + vpdpwssd m6, m19, m4 ; A3l + vpshrdd m27, m28, m30, 16 ; 67h + vpdpwssd m8, m19, m27 ; A3h + mova m5, m29 + vpdpwssd m7, m19, m29 ; B3l + mova m28, m30 + vpdpwssd m9, m19, m30 ; B3h + REPX {psrad x, 10}, m6, m8, m7, m9 + packusdw m6, m8 + packusdw m7, m9 + pminsw m6, m11 + pminsw m7, m11 + mova [dstq+dsq*0], m6 + mova [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w32_loop + add r7, 64 + add r8, 64 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .hv_w32_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +MC_8TAP_FN prep, sharp, SHARP, SHARP +MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH +MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR +MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP +MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN prep, regular, REGULAR, REGULAR + +cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 +%define base r7-prep_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + mov wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [pw_8192] + movzx wd, word [r7+wq*2+table_offset(prep,)] + shr r5d, 11 + vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] + add wq, r7 + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + mov r5d, r7m + vbroadcasti32x4 m4, [spel_h_shufA] + vbroadcasti32x4 m5, [spel_h_shufB] + shr r5d, 11 + mova ym9, [prep_endA] + psllw xmm0, [base+prep_hv_shift+r5*8] + mova [tmpq], xmm0 + vpbroadcastd m6, [tmpq+4] + vpbroadcastd m7, [tmpq+8] +.h_w4_loop: + movu xm2, [srcq+strideq*0] + vinserti32x4 ym2, [srcq+strideq*1], 1 + vinserti32x4 m2, [srcq+strideq*2], 2 + vinserti32x4 m2, [srcq+r6 ], 3 + lea srcq, [srcq+strideq*4] + mova m0, m10 + pshufb m1, m2, m4 + vpdpwssd m0, m6, m1 + pshufb m2, m5 + vpdpwssd m0, m7, m2 + vpermb m0, m9, m0 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m10, [prep_8tap_rnd] + lea r6, [strideq*3] + cmp wd, 4 + je .h_w4 + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + mov r5d, r7m + sub srcq, 6 + shr r5d, 11 + psllw xmm0, [base+prep_hv_shift+r5*8] + mova [tmpq], xmm0 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, [tmpq+12] + cmp wd, 16 + je .h_w16 + jg .h_w32 +.h_w8: + mova m6, [spel_h_shufA] + movu m7, [spel_h_shufB] + movu m8, [spel_h_shufC] + mova m9, [spel_h_shufD] + mova m11, [prep_endB] +.h_w8_loop: + movu ym4, [srcq+strideq*0] + vinserti32x8 m4, [srcq+strideq*1], 1 + movu ym5, [srcq+strideq*2] + vinserti32x8 m5, [srcq+r6 ], 1 + lea srcq, [srcq+strideq*4] + mova m0, m10 + mova m1, m10 + vpermb m2, m6, m4 + vpermb m3, m6, m5 + vpdpwssd m0, m12, m2 + vpdpwssd m1, m12, m3 + vpermb m2, m7, m4 + vpermb m3, m7, m5 + vpdpwssd m0, m13, m2 + vpdpwssd m1, m13, m3 + vpermb m2, m8, m4 + vpermb m3, m8, m5 + vpdpwssd m0, m14, m2 + vpdpwssd m1, m14, m3 + vpermb m2, m9, m4 + vpermb m3, m9, m5 + vpdpwssd m0, m15, m2 + vpdpwssd m1, m15, m3 + vpermt2b m0, m11, m1 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] + mova m11, [prep_endC] +.h_w16_loop: + movu ym2, [srcq+strideq*0+ 0] + vinserti32x8 m2, [srcq+strideq*1+ 0], 1 + movu ym3, [srcq+strideq*0+16] + vinserti32x8 m3, [srcq+strideq*1+16], 1 + lea srcq, [srcq+strideq*2] + mova m0, m10 + mova m1, m10 + pshufb m4, m2, m6 + vpdpwssd m0, m12, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m14, m4 ; b2 + pshufb m4, m2, m7 + vpdpwssd m0, m13, m4 ; a1 + pshufb m4, m3, m7 + vpdpwssd m1, m15, m4 ; b3 + shufpd m2, m3, 0x55 + pshufb m4, m2, m6 + vpdpwssd m0, m14, m4 ; a2 + vpdpwssd m1, m12, m4 ; b0 + pshufb m2, m7 + vpdpwssd m0, m15, m2 ; a3 + vpdpwssd m1, m13, m2 ; b1 + vpermt2b m0, m11, m1 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + vbroadcasti32x4 m6, [spel_h_shufA] + lea srcq, [srcq+wq*2] + vbroadcasti32x4 m7, [spel_h_shufB] + neg wq + mova m11, [prep_endC] +.h_w32_loop0: + mov r6, wq +.h_w32_loop: + movu m2, [srcq+r6*2+ 0] + movu m3, [srcq+r6*2+ 8] + mova m0, m10 + mova m1, m10 + pshufb m4, m2, m6 + vpdpwssd m0, m12, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m12, m4 ; b0 + vpdpwssd m0, m14, m4 ; a2 + movu m4, [srcq+r6*2+16] + pshufb m3, m7 + vpdpwssd m1, m13, m3 ; b1 + vpdpwssd m0, m15, m3 ; a3 + pshufb m3, m4, m6 + vpdpwssd m1, m14, m3 ; b2 + pshufb m2, m7 + vpdpwssd m0, m13, m2 ; a1 + pshufb m4, m7 + vpdpwssd m1, m15, m4 ; b3 + vpermt2b m0, m11, m1 + mova [tmpq], m0 + add tmpq, 64 + add r6, 32 + jl .h_w32_loop + add srcq, strideq + dec hd + jg .h_w32_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mov r5d, r7m + vpbroadcastd m10, [prep_8tap_rnd] + pmovsxbw xmm0, [base+subpel_filters+myq*8] + tzcnt r6d, wd + shr r5d, 11 + movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] + psllw xmm0, [base+prep_hv_shift+r5*8] + add r7, r6 + lea r6, [strideq*3] + sub srcq, r6 + mova [tmpq], xmm0 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, [tmpq+12] + jmp r7 +.v_w4: + movq xmm1, [srcq+strideq*0] + vpbroadcastq ymm0, [srcq+strideq*1] + vpbroadcastq ymm2, [srcq+strideq*2] + add srcq, r6 + vpbroadcastq ymm4, [srcq+strideq*0] + vpbroadcastq ymm3, [srcq+strideq*1] + vpbroadcastq ymm5, [srcq+strideq*2] + mova xm11, [prep_endA] + add srcq, r6 + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklwd ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+strideq*0] + vpblendd ymm2, ymm4, 0x30 + vpblendd ymm4, ymm3, 0x30 + punpcklwd ymm2, ymm4 ; 23 34 + vpblendd ymm3, ymm5, 0x30 + vpblendd ymm5, ymm0, 0x30 + punpcklwd ymm3, ymm5 ; 45 56 +.v_w4_loop: + vpbroadcastq ymm5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + mova ymm4, ym10 + vpdpwssd ymm4, ym12, ymm1 ; a0 b0 + mova ymm1, ymm2 + vpdpwssd ymm4, ym13, ymm2 ; a1 b1 + mova ymm2, ymm3 + vpdpwssd ymm4, ym14, ymm3 ; a2 b2 + vpblendd ymm3, ymm0, ymm5, 0x30 + vpbroadcastq ymm0, [srcq+strideq*0] + vpblendd ymm5, ymm0, 0x30 + punpcklwd ymm3, ymm5 ; 67 78 + vpdpwssd ymm4, ym15, ymm3 ; a3 b3 + vpermb ymm4, ym11, ymm4 + mova [tmpq], xmm4 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + vbroadcasti32x4 m2, [srcq+strideq*2] + vinserti32x4 m1, m2, [srcq+strideq*0], 0 + vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2 + add srcq, r6 + vinserti32x4 ym2, [srcq+strideq*0], 1 + vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4 + mova m6, [spel_v_shuf8] + movu xm0, [srcq+strideq*1] + vinserti32x4 ym0, [srcq+strideq*2], 1 + add srcq, r6 + vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 + mova ym11, [prep_endB] + vpermb m1, m6, m1 ; 01 12 + vpermb m2, m6, m2 ; 23 34 + vpermb m3, m6, m0 ; 45 56 +.v_w8_loop: + vinserti32x4 m0, [srcq+strideq*1], 3 + lea srcq, [srcq+strideq*2] + movu xm5, [srcq+strideq*0] + mova m4, m10 + vpdpwssd m4, m12, m1 ; a0 b0 + mova m1, m2 + vshufi32x4 m0, m5, q1032 ; 6 7 8 + vpdpwssd m4, m13, m2 ; a1 b1 + mova m2, m3 + vpdpwssd m4, m14, m3 ; a2 b2 + vpermb m3, m6, m0 ; 67 78 + vpdpwssd m4, m15, m3 ; a3 b3 + vpermb m4, m11, m4 + mova [tmpq], ym4 + add tmpq, 32 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti32x8 m1, [srcq+strideq*1] + vinserti32x8 m0, m1, [srcq+strideq*0], 0 + vinserti32x8 m1, [srcq+strideq*2], 1 + mova m8, [spel_v_shuf16] + add srcq, r6 + movu ym3, [srcq+strideq*0] + vinserti32x8 m3, [srcq+strideq*1], 1 + movu ym5, [srcq+strideq*2] + add srcq, r6 + vinserti32x8 m5, [srcq+strideq*0], 1 + mova m11, [prep_endA] + vpermb m0, m8, m0 ; 01 + vpermb m1, m8, m1 ; 12 + vpermb m3, m8, m3 ; 34 + vpermb m5, m8, m5 ; 56 + vpshrdd m2, m1, m3, 16 ; 23 + vpshrdd m4, m3, m5, 16 ; 45 +.v_w16_loop: + mova m6, m10 + mova m7, m10 + vpdpwssd m6, m12, m0 ; a0 + mova m0, m2 + vpdpwssd m7, m12, m1 ; b0 + mova m1, m3 + vpdpwssd m6, m13, m2 ; a1 + mova m2, m4 + vpdpwssd m7, m13, m3 ; b1 + mova m3, m5 + vpdpwssd m6, m14, m4 ; a2 + mova m4, m5 + vpdpwssd m7, m14, m5 ; b2 + movu ym5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti32x8 m5, [srcq+strideq*0], 1 + vpermb m5, m8, m5 ; 78 + vpshrdd m4, m5, 16 ; 67 + vpdpwssd m6, m15, m4 ; a3 + vpdpwssd m7, m15, m5 ; b3 + vpermt2b m6, m11, m7 + mova [tmpq], m6 + add tmpq, 64 + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + PUSH r8 + movaps [rsp+stack_offset+8], xmm6 +%endif + lea r5, [hq+wq*8-256] + mov r7, srcq + mov r8, tmpq +.v_w32_loop0: + movu m16, [srcq+strideq*0] + movu m17, [srcq+strideq*1] + movu m18, [srcq+strideq*2] + add srcq, r6 + movu m19, [srcq+strideq*0] + movu m20, [srcq+strideq*1] + movu m21, [srcq+strideq*2] + add srcq, r6 + movu m22, [srcq+strideq*0] + mova m11, [prep_endC] + punpcklwd m0, m16, m17 ; 01l + punpckhwd m16, m17 ; 01h + punpcklwd m1, m17, m18 ; 12l + punpckhwd m17, m18 ; 12h + punpcklwd m2, m18, m19 ; 23l + punpckhwd m18, m19 ; 23h + punpcklwd m3, m19, m20 ; 34l + punpckhwd m19, m20 ; 34h + punpcklwd m4, m20, m21 ; 45l + punpckhwd m20, m21 ; 45h + punpcklwd m5, m21, m22 ; 56l + punpckhwd m21, m22 ; 56h +.v_w32_loop: + mova m6, m10 + vpdpwssd m6, m12, m0 ; a0l + mova m8, m10 + vpdpwssd m8, m12, m16 ; a0h + mova m7, m10 + vpdpwssd m7, m12, m1 ; b0l + mova m9, m10 + vpdpwssd m9, m12, m17 ; b0h + mova m0, m2 + vpdpwssd m6, m13, m2 ; a1l + mova m16, m18 + vpdpwssd m8, m13, m18 ; a1h + mova m1, m3 + vpdpwssd m7, m13, m3 ; b1l + mova m17, m19 + vpdpwssd m9, m13, m19 ; b1h + mova m2, m4 + vpdpwssd m6, m14, m4 ; a2l + mova m18, m20 + vpdpwssd m8, m14, m20 ; a2h + mova m3, m5 + vpdpwssd m7, m14, m5 ; b2l + mova m19, m21 + vpdpwssd m9, m14, m21 ; b2h + movu m21, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklwd m4, m22, m21 ; 67l + punpckhwd m20, m22, m21 ; 67h + movu m22, [srcq+strideq*0] + vpdpwssd m6, m15, m4 ; a3l + vpdpwssd m8, m15, m20 ; a3h + punpcklwd m5, m21, m22 ; 78l + punpckhwd m21, m22 ; 78h + vpdpwssd m7, m15, m5 ; b3l + vpdpwssd m9, m15, m21 ; b3h + vpermt2b m6, m11, m8 + vpermt2b m7, m11, m9 + mova [tmpq+wq*0], m6 + mova [tmpq+wq*2], m7 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w32_loop + add r7, 64 + add r8, 64 + movzx hd, r5b + mov srcq, r7 + mov tmpq, r8 + sub r5d, 1<<8 + jg .v_w32_loop0 +%if WIN64 + movaps xmm6, [rsp+stack_offset+8] + POP r8 +%endif + vzeroupper + RET +.hv: + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mov r5d, r7m + pmovsxbw xmm1, [base+subpel_filters+myq*8] + lea r6, [strideq*3] + sub srcq, 2 + shr r5d, 11 + sub srcq, r6 + psllw xmm0, [base+prep_hv_shift+r5*8] + psllw xmm1, 2 + vpbroadcastd m10, [prep_8tap_rnd] + vpbroadcastd ym11, [pd_128] + mova xm21, [prep_endA] + mova [tmpq+ 0], xmm0 + mova [tmpq+16], xmm1 + vpbroadcastd m8, [tmpq+ 4] + vpbroadcastd m9, [tmpq+ 8] + vpbroadcastd ym12, xmm1 + vpbroadcastd ym13, [tmpq+20] + vpbroadcastd ym14, [tmpq+24] + vpbroadcastd ym15, [tmpq+28] + movu xm4, [srcq+strideq*0] + vinserti32x4 ym4, [srcq+strideq*1], 1 + vinserti32x4 m4, [srcq+strideq*2], 2 + add srcq, r6 + vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3 + movu xm0, [srcq+strideq*1] + vinserti32x4 ym0, [srcq+strideq*2], 1 + add srcq, r6 + vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 + vbroadcasti32x4 m19, [spel_h_shufA] + vbroadcasti32x4 m20, [spel_h_shufB] + mova ym6, [spel_shuf4a] + mova ym7, [spel_shuf4b] + mova m2, m10 + mova m3, m10 + pshufb m1, m4, m19 + vpdpwssd m2, m8, m1 + pshufb m1, m0, m19 + vpdpwssd m3, m8, m1 + pshufb m4, m20 + vpdpwssd m2, m9, m4 + pshufb m0, m20 + vpdpwssd m3, m9, m0 + vpermb m1, m6, m2 ; 01 12 + vshufi32x4 m2, m3, q1032 + vpermb m3, m6, m3 ; 45 56 + vpermb m2, m6, m2 ; 23 34 +.hv_w4_loop: + movu xm18, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti128 ym18, [srcq+strideq*0], 1 + mova ym16, ym11 + mova ym4, ym10 + pshufb ym17, ym18, ym19 + vpdpwssd ym16, ym12, ym1 ; a0 b0 + vpdpwssd ym4, ym8, ym17 + pshufb ym18, ym20 + mova ym1, ym2 + vpdpwssd ym16, ym13, ym2 ; a1 b1 + vpdpwssd ym4, ym9, ym18 ; 7 8 + mova ym2, ym3 + vpdpwssd ym16, ym14, ym3 ; a2 b2 + vpermt2b ym3, ym7, ym4 ; 67 78 + vpdpwssd ym16, ym15, ym3 ; a3 b3 + vpermb ym16, ym21, ym16 + mova [tmpq], xm16 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + vzeroupper + RET +.hv_w8: + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + mov r5d, r7m + pmovsxbw xmm1, [base+subpel_filters+myq*8] + lea r6, [strideq*3] + sub srcq, 6 + shr r5d, 11 + sub srcq, r6 + vpbroadcastd m10, [prep_8tap_rnd] + vpbroadcastd m11, [pd_128] + psllw xmm0, [base+prep_hv_shift+r5*8] + psllw xmm1, 2 + mova [tmpq+ 0], xmm0 + mova [tmpq+16], xmm1 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, [tmpq+12] + vpbroadcastd m16, xmm1 + vpbroadcastd m17, [tmpq+20] + vpbroadcastd m18, [tmpq+24] + vpbroadcastd m19, [tmpq+28] + cmp wd, 16 + je .hv_w16 + jg .hv_w32 + WIN64_SPILL_XMM 23 + mova m5, [spel_h_shufA] + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 + movu ym9, [srcq+strideq*2] + add srcq, r6 + vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 + movu ym20, [srcq+strideq*1] + vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 + add srcq, r6 + movu ym21, [srcq+strideq*0] ; 6 + movu m6, [spel_h_shufB] + movu m7, [spel_h_shufC] + mova ym22, [prep_endB] + vpermb m8, m5, m0 + mova m1, m10 + vpdpwssd m1, m12, m8 ; a0 b0 + vpermb m8, m5, m9 + mova m2, m10 + vpdpwssd m2, m12, m8 ; c0 d0 + vpermb m8, m5, m20 + mova m3, m10 + vpdpwssd m3, m12, m8 ; e0 f0 + vpermb m8, m5, m21 + mova m4, m10 + vpdpwssd m4, m12, m8 ; g0 + vpermb m8, m6, m0 + vpdpwssd m1, m13, m8 ; a1 b1 + vpermb m8, m6, m9 + vpdpwssd m2, m13, m8 ; c1 d1 + vpermb m8, m6, m20 + vpdpwssd m3, m13, m8 ; e1 f1 + vpermb m8, m6, m21 + vpdpwssd m4, m13, m8 ; g1 + vpermb m8, m7, m0 + vpdpwssd m1, m14, m8 ; a2 b2 + vpermb m8, m7, m9 + vpdpwssd m2, m14, m8 ; c2 d2 + vpermb m8, m7, m20 + vpdpwssd m3, m14, m8 ; e2 f2 + vpermb m8, m7, m21 + vpdpwssd m4, m14, m8 ; g2 + mova m8, [spel_h_shufD] + vpermb m0, m8, m0 + vpdpwssd m1, m15, m0 ; a3 b3 + mova m0, [spel_shuf8a] + vpermb m9, m8, m9 + vpdpwssd m2, m15, m9 ; c3 d3 + mova m9, [spel_shuf8b] + vpermb m20, m8, m20 + vpdpwssd m3, m15, m20 ; e3 f3 + vpermb m21, m8, m21 + vpdpwssd m4, m15, m21 ; g3 + vpermt2b m1, m0, m2 ; 01 12 + vpermt2b m2, m0, m3 ; 23 34 + vpermt2b m3, m0, m4 ; 45 56 +.hv_w8_loop: + movu ym0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti32x8 m0, [srcq+strideq*0], 1 + mova m4, m10 + mova m20, m11 + vpermb m21, m5, m0 + vpdpwssd m4, m12, m21 ; h0 i0 + vpermb m21, m6, m0 + vpdpwssd m20, m16, m1 ; A0 B0 + vpdpwssd m4, m13, m21 ; h1 i1 + vpermb m21, m7, m0 + mova m1, m2 + vpdpwssd m20, m17, m2 ; A1 B1 + vpdpwssd m4, m14, m21 ; h2 i2 + vpermb m21, m8, m0 + mova m2, m3 + vpdpwssd m20, m18, m3 ; A2 B2 + vpdpwssd m4, m15, m21 ; h3 i3 + vpermt2b m3, m9, m4 ; 67 78 + vpdpwssd m20, m19, m3 ; A3 B3 + vpermb m20, m22, m20 + mova [tmpq], ym20 + add tmpq, 32 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 27 + vbroadcasti32x8 m5, [srcq+strideq*0+ 8] + vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 + vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 + movu ym6, [srcq+strideq*1+ 0] + movu ym7, [srcq+strideq*1+16] + vinserti32x8 m6, [srcq+strideq*2+ 0], 1 + vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 + add srcq, r6 + movu ym22, [srcq+strideq*0+ 0] + movu ym23, [srcq+strideq*0+16] + vinserti32x8 m22, [srcq+strideq*1+ 0], 1 + vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4 + movu ym24, [srcq+strideq*2+ 0] + movu ym25, [srcq+strideq*2+16] + add srcq, r6 + vinserti32x8 m24, [srcq+strideq*0+ 0], 1 + vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6 + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + mova m9, [spel_shuf16] + mova m26, [prep_endB] + pshufb m0, m4, m20 + mova m1, m10 + vpdpwssd m1, m12, m0 ; a0 + pshufb m0, m6, m20 + mova m2, m10 + vpdpwssd m2, m12, m0 ; b0 + pshufb m0, m7, m20 + mova m3, m10 + vpdpwssd m3, m14, m0 ; c2 + pshufb m0, m4, m21 + vpdpwssd m1, m13, m0 ; a1 + pshufb m0, m6, m21 + vpdpwssd m2, m13, m0 ; b1 + pshufb m0, m7, m21 + vpdpwssd m3, m15, m0 ; c3 + pshufb m0, m5, m20 + vpdpwssd m1, m14, m0 ; a2 + shufpd m6, m7, 0x55 + pshufb m7, m6, m20 + vpdpwssd m2, m14, m7 ; b2 + vpdpwssd m3, m12, m7 ; c0 + pshufb m5, m21 + vpdpwssd m1, m15, m5 ; a3 + pshufb m6, m21 + vpdpwssd m2, m15, m6 ; b3 + vpdpwssd m3, m13, m6 ; c1 + pshufb m0, m22, m20 + mova m4, m10 + vpdpwssd m4, m12, m0 ; d0 + pshufb m0, m23, m20 + mova m5, m10 + vpdpwssd m5, m14, m0 ; e2 + pshufb m0, m24, m20 + mova m6, m10 + vpdpwssd m6, m12, m0 ; f0 + pshufb m0, m25, m20 + mova m7, m10 + vpdpwssd m7, m14, m0 ; g2 + pshufb m0, m22, m21 + vpdpwssd m4, m13, m0 ; d1 + pshufb m0, m23, m21 + vpdpwssd m5, m15, m0 ; e3 + pshufb m0, m24, m21 + vpdpwssd m6, m13, m0 ; f1 + pshufb m0, m25, m21 + vpdpwssd m7, m15, m0 ; g3 + shufpd m22, m23, 0x55 + pshufb m23, m22, m20 + vpdpwssd m4, m14, m23 ; d2 + vpdpwssd m5, m12, m23 ; e0 + shufpd m24, m25, 0x55 + pshufb m25, m24, m20 + vpdpwssd m6, m14, m25 ; f2 + vpdpwssd m7, m12, m25 ; g0 + pshufb m22, m21 + vpdpwssd m4, m15, m22 ; d3 + vpdpwssd m5, m13, m22 ; e1 + pshufb m24, m21 + vpdpwssd m6, m15, m24 ; f3 + vpdpwssd m7, m13, m24 ; g1 + pslldq m1, 1 + vpermt2b m2, m9, m3 ; 12 + vpermt2b m4, m9, m5 ; 34 + vpermt2b m6, m9, m7 ; 56 + vpshrdd m1, m2, 16 ; 01 + vpshrdd m3, m2, m4, 16 ; 23 + vpshrdd m5, m4, m6, 16 ; 45 +.hv_w16_loop: + movu ym24, [srcq+strideq*1+ 0] + movu ym25, [srcq+strideq*1+16] + lea srcq, [srcq+strideq*2] + vinserti32x8 m24, [srcq+strideq*0+ 0], 1 + vinserti32x8 m25, [srcq+strideq*0+16], 1 + mova m7, m10 + mova m8, m10 + pshufb m0, m24, m20 + vpdpwssd m7, m12, m0 ; h0 + mova m22, m11 + pshufb m0, m25, m20 + vpdpwssd m8, m14, m0 ; i2 + mova m23, m11 + vpdpwssd m22, m16, m1 ; A0 + mova m1, m3 + vpdpwssd m23, m16, m2 ; B0 + mova m2, m4 + pshufb m0, m24, m21 + vpdpwssd m7, m13, m0 ; h1 + pshufb m0, m25, m21 + vpdpwssd m8, m15, m0 ; i3 + vpdpwssd m22, m17, m3 ; A1 + mova m3, m5 + vpdpwssd m23, m17, m4 ; B1 + mova m4, m6 + shufpd m24, m25, 0x55 + pshufb m25, m24, m20 + vpdpwssd m7, m14, m25 ; h2 + vpdpwssd m8, m12, m25 ; i0 + vpdpwssd m22, m18, m5 ; A2 + vpdpwssd m23, m18, m6 ; B2 + pshufb m24, m21 + vpdpwssd m7, m15, m24 ; h3 + vpdpwssd m8, m13, m24 ; i1 + vpermt2b m7, m9, m8 ; 78 + vpshrdd m5, m6, m7, 16 ; 67 + vpdpwssd m22, m19, m5 ; A3 + vpdpwssd m23, m19, m7 ; B3 + mova m6, m7 + vpermt2b m22, m26, m23 + mova [tmpq], m22 + add tmpq, 64 + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + PUSH r8 + %assign regs_used regs_used + 1 + WIN64_SPILL_XMM 32 +%endif + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + mova m22, [spel_shuf32] + lea r5d, [hq+wq*8-256] + mov r7, srcq + mov r8, tmpq +.hv_w32_loop0: + movu m6, [srcq+strideq*0+ 0] + movu m7, [srcq+strideq*0+ 8] + movu m8, [srcq+strideq*0+16] + mova m0, m10 + mova m23, m10 + pshufb m9, m6, m20 + vpdpwssd m0, m12, m9 ; a0l + pshufb m9, m7, m20 + vpdpwssd m23, m12, m9 ; a0h + vpdpwssd m0, m14, m9 ; a2l + pshufb m7, m21 + vpdpwssd m23, m13, m7 ; a1h + vpdpwssd m0, m15, m7 ; a3l + pshufb m7, m8, m20 + vpdpwssd m23, m14, m7 ; a2h + pshufb m6, m21 + vpdpwssd m0, m13, m6 ; a1l + pshufb m8, m21 + vpdpwssd m23, m15, m8 ; a3h + PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12 + PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34 + PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56 + vpshrdd m2, m1, m3, 16 ; 23l + vpshrdd m25, m24, m26, 16 ; 23h + vpshrdd m4, m3, m5, 16 ; 45l + vpshrdd m27, m26, m28, 16 ; 45h +.hv_w32_loop: + movu m7, [srcq+strideq*1+ 0] + movu m9, [srcq+strideq*2+ 0] + movu m6, [srcq+strideq*1+ 8] + movu m8, [srcq+strideq*2+ 8] + mova m29, m10 + mova m31, m10 + pshufb m30, m7, m20 + vpdpwssd m29, m12, m30 ; h0l + pshufb m30, m9, m20 + vpdpwssd m31, m12, m30 ; i0l + pshufb m7, m21 + vpdpwssd m29, m13, m7 ; h1l + pshufb m9, m21 + vpdpwssd m31, m13, m9 ; i1l + pshufb m7, m6, m20 + vpdpwssd m29, m14, m7 ; h2l + pshufb m9, m8, m20 + vpdpwssd m31, m14, m9 ; i2l + pshufb m6, m21 + vpdpwssd m29, m15, m6 ; h3l + pshufb m8, m21 + vpdpwssd m31, m15, m8 ; i3l + mova m30, m10 + vpdpwssd m30, m12, m7 ; h0h + movu m7, [srcq+strideq*1+16] + lea srcq, [srcq+strideq*2] + vpermt2b m29, m22, m31 ; 78l + mova m31, m10 + vpdpwssd m31, m12, m9 ; i0h + movu m9, [srcq+strideq*0+16] + vpdpwssd m30, m13, m6 ; h1h + pshufb m6, m7, m20 + vpdpwssd m31, m13, m8 ; i1h + pshufb m8, m9, m20 + vpdpwssd m30, m14, m6 ; h2h + mova m6, m11 + vpdpwssd m6, m16, m0 ; A0l + pshufb m7, m21 + vpdpwssd m31, m14, m8 ; i2h + mova m8, m11 + vpdpwssd m8, m16, m23 ; A0h + pshufb m9, m21 + vpdpwssd m30, m15, m7 ; h3h + mova m7, m11 + vpdpwssd m7, m16, m1 ; B0l + vpdpwssd m31, m15, m9 ; i3h + mova m9, m11 + vpdpwssd m9, m16, m24 ; B0h + mova m0, m2 + vpdpwssd m6, m17, m2 ; A1l + mova m23, m25 + vpdpwssd m8, m17, m25 ; A1h + mova m1, m3 + vpdpwssd m7, m17, m3 ; B1l + mova m24, m26 + vpdpwssd m9, m17, m26 ; B1h + vpermt2b m30, m22, m31 ; 78h + mova m31, [prep_endC] + vpdpwssd m6, m18, m4 ; A2l + mova m2, m4 + vpdpwssd m8, m18, m27 ; A2h + mova m25, m27 + vpdpwssd m7, m18, m5 ; B2l + mova m3, m5 + vpdpwssd m9, m18, m28 ; B2h + mova m26, m28 + vpshrdd m4, m5, m29, 16 ; 67l + vpdpwssd m6, m19, m4 ; A3l + vpshrdd m27, m28, m30, 16 ; 67h + vpdpwssd m8, m19, m27 ; A3h + mova m5, m29 + vpdpwssd m7, m19, m29 ; B3l + mova m28, m30 + vpdpwssd m9, m19, m30 ; B3h + vpermt2b m6, m31, m8 + vpermt2b m7, m31, m9 + mova [tmpq+wq*0], m6 + mova [tmpq+wq*2], m7 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w32_loop + add r7, 64 + add r8, 64 + movzx hd, r5b + mov srcq, r7 + mov tmpq, r8 + sub r5d, 1<<8 + jg .hv_w32_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts +%define base r6-pd_0to7 + mov t0d, r7m + lea r6, [pd_0to7] + shr t0d, 11 + vpbroadcastd m8, [base+warp_8x8t_rnd_v] + vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main + psrad m14, m16, 15 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + psrad m16, 15 + packssdw m14, m16 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + psrad m15, m16, 15 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + add tsq, tsq + psrad m16, 15 + packssdw m15, m16 + jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end + +cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd + mov t0d, r7m ; pixel_max + lea r6, [pd_0to7] + shr t0d, 11 + vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] + vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] + call .main + psrad m14, m16, 13 + call .main2 + psrad m16, 13 + packusdw m14, m16 + call .main2 + psrad m15, m16, 13 + call .main2 + vpbroadcastd m0, [base+bidir_shift+t0*4] + vpsrlvw m14, m0 + psrad m16, 13 + packusdw m15, m16 + vpsrlvw m15, m0 +.end: + mova m0, [base+warp8x8_end] + vpermb m16, m0, m14 + lea r2, [dsq*3] + mova [dstq+dsq*0], xm16 + vextracti128 [dstq+dsq*1], ym16, 1 + vextracti32x4 [dstq+dsq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + vpermb m16, m0, m15 + lea dstq, [dstq+dsq*4] + mova [dstq+dsq*0], xm16 + vextracti128 [dstq+dsq*1], ym16, 1 + vextracti32x4 [dstq+dsq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + RET +.main: + vpbroadcastd ym3, [base+pd_512] +%if WIN64 + mov abcdq, r5mp + vpaddd ym18, ym3, r6m {1to8} ; mx +%else + add r5d, 512 + vpbroadcastd ym18, r5d +%endif + vpaddd ym20, ym3, r7m {1to8} ; my + mova ym16, [base+pd_0to7] + vpbroadcastd ym19, [abcdq+4*0] ; alpha + vpbroadcastd ym21, [abcdq+4*1] ; gamma + lea r4, [ssq*3+6] + vpdpwssd ym18, ym19, ym16 ; tmx + vpdpwssd ym20, ym21, ym16 ; tmy + sub srcq, r4 + mova m10, [base+warp8x8_permA] + lea r4, [mc_warp_filter+64*8] + vbroadcasti32x4 m12, [base+warp8x8_permC] + kxnorb k1, k1, k1 + vbroadcasti32x4 m13, [base+warp8x8_permD] + movu ym5, [srcq+0] + vinserti32x8 m5, [srcq+8], 1 + psrad ym17, ym18, 10 + mova m11, [base+warp8x8_permB] + kmovb k2, k1 + vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 + psrad ym19, 16 ; beta + psrad ym21, 16 ; delta + paddd ym18, ym19 + vpermb m4, m10, m5 + vpbroadcastq m9, [base+warp_shift_h+t0*8] + pshufd m3, m3, q3120 + paddd m7, m1, m1 + pshufb m2, m3, m12 + vpdpwssd m1, m4, m2 + vpermb m5, m11, m5 + vshufi32x4 m4, m5, q1021 + pshufb m3, m13 + vpdpwssd m1, m4, m3 + call .h + psllq m2, m1, 32 + paddd m1, m2 + vpmultishiftqb m1, m9, m1 + vpshrdq m1, m0, 48 ; 01 12 + call .h + vpshrdq m2, m1, m0, 48 ; 23 34 + call .h + vpshrdq m3, m2, m0, 48 ; 45 56 +.main2: + call .h + psrad ym6, ym20, 10 + kmovb k1, k2 + paddd ym17, ym20, ym21 ; my += delta + vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 + psrad ym16, ym17, 10 + kmovb k2, k1 + vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 + shufps m5, m20, m6, q2020 + mova m16, m8 + pshufb m4, m5, m12 + vpdpwssd m16, m1, m4 ; a0 b0 + pshufb m5, m13 + mova m1, m2 + vpdpwssd m16, m2, m5 ; a1 b1 + shufps m6, m20, m6, q3131 + paddd ym20, ym17, ym21 + pshufb m4, m6, m12 + mova m2, m3 + vpdpwssd m16, m3, m4 ; a2 b2 + vpshrdq m3, m0, 48 ; 67 78 + pshufb m6, m13 + vpdpwssd m16, m3, m6 ; a3 b3 + ret +ALIGN function_align +.h: + movu ym16, [srcq+ssq*1] + psrad ym6, ym18, 10 + lea srcq, [srcq+ssq*2] + vinserti32x8 m5, m16, [srcq+ssq*0], 1 + kmovb k1, k2 + paddd ym17, ym18, ym19 ; mx += beta + vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 + psrad ym16, ym17, 10 + kmovb k2, k1 + vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 + vpermb m4, m10, m5 + shufps m16, m18, m6, q2020 + shufps m6, m18, m6, q3131 + mova m0, m7 + pshufb m18, m16, m12 + vpdpwssd m0, m4, m18 ; a0 b0 + vpermb m5, m11, m5 + pshufb m18, m6, m13 + vpdpwssd m0, m5, m18 ; a3 b3 + paddd ym18, ym17, ym19 + vshufi32x4 m17, m4, m5, q1021 + pshufb m16, m13 + vpdpwssd m0, m17, m16 ; a1 b1 + vshufi32x4 m4, m5, q2132 + pshufb m6, m12 + vpdpwssd m0, m4, m6 ; a2 b2 + vpmultishiftqb m0, m9, m0 ; a a b b + ret + +%macro BIDIR_FN 0 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + cmp hd, 8 + jl .w4_end + vextracti32x4 xm2, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq ], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xm0, ym1, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + vextracti32x4 xm0, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +%endmacro + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx512icl_table + lea r6, [avg_avx512icl_table] + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + vpbroadcastd m2, [base+avg_round+t0*4] + vpbroadcastd m3, [base+avg_shift+t0*4] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+64*0] + paddsw m0, [tmp2q+64*0] + mova m1, [tmp1q+64*1] + paddsw m1, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + pmaxsw m0, m2 + pmaxsw m1, m2 + psubsw m0, m2 + psubsw m1, m2 + vpsrlvw m0, m3 + vpsrlvw m1, m3 + ret + +cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-w_avg_avx512icl_table + lea r6, [w_avg_avx512icl_table] + tzcnt wd, wm + mov t0d, r7m ; pixel_max + shr t0d, 11 + movsxd wq, [r6+wq*4] + vpbroadcastd m5, [base+w_avg_round+t0*4] + vpbroadcastd m7, [base+bidir_shift+t0*4] + add wq, r6 + mov r6d, r6m ; weight + lea t0d, [r6-16] + shl r6d, 16 + sub r6d, t0d ; 16-weight, weight + movifnidn hd, hm + vpbroadcastd m6, r6d + BIDIR_FN +ALIGN function_align +.main: + mova m3, [tmp1q+64*0] + mova m1, [tmp2q+64*0] + mova m0, [tmp1q+64*1] + mova m4, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + punpcklwd m2, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + mova m0, m5 + vpdpwssd m0, m6, m2 + mova m2, m5 + vpdpwssd m2, m6, m1 + mova m1, m5 + vpdpwssd m1, m6, m3 + mova m3, m5 + vpdpwssd m3, m6, m4 + REPX {psrad x, 2}, m0, m2, m1, m3 + packusdw m0, m2 + packusdw m1, m3 + vpsrlvw m0, m7 + vpsrlvw m1, m7 + ret + +cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx512icl_table + lea r7, [mask_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_64] + vpbroadcastd m9, [base+mask_round+r6*4] + vpbroadcastd m10, [base+bidir_shift+r6*4] + mov maskq, maskmp + add wq, r7 + BIDIR_FN +ALIGN function_align +.main: + pmovzxbw m1, [maskq+32*0] + mova m4, [tmp1q+64*0] + mova m2, [tmp2q+64*0] + pmovzxbw m6, [maskq+32*1] + mova m5, [tmp1q+64*1] + mova m3, [tmp2q+64*1] + add maskq, 32*2 + add tmp1q, 64*2 + add tmp2q, 64*2 + punpcklwd m7, m4, m2 + punpckhwd m4, m2 + psubw m0, m8, m1 + punpcklwd m2, m1, m0 ; m, 64-m + punpckhwd m1, m0 + mova m0, m9 + vpdpwssd m0, m7, m2 + mova m2, m9 + vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m) + punpcklwd m7, m5, m3 + punpckhwd m5, m3 + psubw m1, m8, m6 + punpcklwd m3, m6, m1 + punpckhwd m6, m1 + mova m1, m9 + vpdpwssd m1, m7, m3 + mova m3, m9 + vpdpwssd m3, m5, m6 + REPX {psrad x, 4}, m0, m2, m1, m3 + packusdw m0, m2 + packusdw m1, m3 + vpsrlvw m0, m10 + vpsrlvw m1, m10 + ret + +cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx512icl_table + lea r7, [w_mask_420_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+mask_round+r6*4] + vpbroadcastd m13, [base+bidir_shift+r6*4] + mov r6d, r7m ; sign + vpbroadcastd m14, [base+w_mask_round+r6*4] + mova ym15, [w_mask_end42x] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + mova m4, [w_mask_shuf4] + vpermt2b m2, m4, m3 + mova m3, m14 + vpdpbusd m3, m2, [pb_64] {1to16} + vpermb m3, m15, m3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + mova [maskq], xm3 + cmp hd, 8 + jl .w4_end + vextracti32x4 xm2, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8: + mova m8, [w_mask_shuf8] + vpbroadcastd m9, [pb_64] + jmp .w8_start +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8_start: + vpermt2b m2, m8, m3 + mova m3, m14 + vpdpbusd m3, m2, m9 + vpermb m3, m15, m3 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + mova [maskq], xm3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16: + mova m8, [w_mask_shuf16] + vpbroadcastd m9, [pb_64] + jmp .w16_start +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16_start: + vpermt2b m2, m8, m3 + mova m3, m14 + vpdpbusd m3, m2, m9 + vpermb m3, m15, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + mova [maskq], xm3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w32: + paddw m2, m3 + mova m8, m14 + vpdpwssd m8, m11, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + call .main + paddw m2, m3 + mova m3, m14 + vpdpwssd m3, m11, m2 + vpermt2b m8, m15, m3 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + mova [maskq], ym8 + sub hd, 4 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w64: + mova m8, m2 + mova m9, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + call .main + paddw m8, m2 + paddw m9, m3 + mova m2, m14 + vpdpwssd m2, m11, m8 + mova m3, m14 + vpdpwssd m3, m11, m9 + vpermt2b m2, m15, m3 + mova [dstq+strideq*1+64*0], m0 + mova [dstq+strideq*1+64*1], m1 + mova [maskq], ym2 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 64 +.w128: + mova m16, m2 + mova m8, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + call .main + mova m17, m2 + mova m9, m3 + mova [dstq+strideq*0+64*2], m0 + mova [dstq+strideq*0+64*3], m1 + call .main + paddw m2, m16 + paddw m3, m8 + mova m16, m14 + vpdpwssd m16, m11, m2 + mova m8, m14 + vpdpwssd m8, m11, m3 + mova [dstq+strideq*1+64*0], m0 + mova [dstq+strideq*1+64*1], m1 + call .main + paddw m2, m17 + paddw m3, m9 + mova m17, m14 + vpdpwssd m17, m11, m2 + mova m9, m14 + vpdpwssd m9, m11, m3 + vpermt2b m16, m15, m8 + vpermt2b m17, m15, m9 + mova [dstq+strideq*1+64*2], m0 + mova [dstq+strideq*1+64*3], m1 + mova [maskq+32*0], ym16 + mova [maskq+32*1], ym17 + sub hd, 2 + jg .w128_loop + vzeroupper + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m10, m6 + psrlw m6, 10 ; 64-m + psubw m2, m11, m6 ; m + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m12 + vpdpwssd m0, m5, m1 + mova m1, m12 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m10, m5 + psrlw m5, 10 + psubw m3, m11, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m12 + vpdpwssd m1, m6, m4 + mova m4, m12 + vpdpwssd m4, m7, m5 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpsrlvw m0, m13 + vpsrlvw m1, m13 + ret + +cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx512icl_table + lea r7, [w_mask_422_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m9, [base+pw_64] + vpbroadcastd m10, [base+mask_round+r6*4] + vpbroadcastd m11, [base+bidir_shift+r6*4] + mov r6d, r7m ; sign + vpbroadcastd m12, [base+w_mask_round+r6*4] + mova ym13, [w_mask_end42x] + mov maskq, maskmp + add wq, r7 + paddw m14, m9, m9 ; pw_128 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + cmp hd, 8 + jl .w4_end + vextracti32x4 xm2, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m8, m6 + psrlw m6, 10 + psubw m2, m9, m6 + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m10 + vpdpwssd m0, m5, m1 + mova m1, m10 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m8, m5 + psrlw m5, 10 + psubw m3, m9, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m10 + vpdpwssd m1, m6, m4 + mova m4, m10 + vpdpwssd m4, m7, m5 + mova m5, m12 + vpdpwssd m5, m14, m2 + mova m2, m12 + vpdpwssd m2, m14, m3 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpermt2b m5, m13, m2 + vpsrlvw m0, m11 + vpsrlvw m1, m11 + mova [maskq], ym5 + add maskq, 32 + ret + +cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx512icl_table + lea r7, [w_mask_444_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m9, [base+pw_64] + vpbroadcastd m10, [base+mask_round+r6*4] + mova m11, [w_mask_end444] + vpbroadcastd m12, [base+bidir_shift+r6*4] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + cmp hd, 8 + jl .w4_end + vextracti32x4 xm2, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m8, m6 + psrlw m6, 10 + psubw m2, m9, m6 + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m10 + vpdpwssd m0, m5, m1 + mova m1, m10 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m8, m5 + psrlw m5, 10 + psubw m3, m9, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m10 + vpdpwssd m1, m6, m4 + mova m4, m10 + vpdpwssd m4, m7, m5 + vpermt2b m2, m11, m3 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpsrlvw m0, m12 + vpsrlvw m1, m12 + mova [maskq], m2 + add maskq, 64 + ret + +cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_avx512icl_table + lea r6, [blend_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + vpbroadcastd m6, [base+pw_m512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + pmovzxbw ym19, [maskq] + movq xm16, [dstq+dsq*0] + movhps xm16, [dstq+dsq*1] + vpbroadcastq ym17, [dstq+dsq*2] + vpbroadcastq ym18, [dstq+r6 ] + pmullw ym19, ym6 + vpblendd ym16, ym17, 0x30 + vpblendd ym16, ym18, 0xc0 + psubw ym17, ym16, [tmpq] + add maskq, 16 + add tmpq, 32 + pmulhrsw ym17, ym19 + paddw ym16, ym17 + vextracti128 xm17, ym16, 1 + movq [dstq+dsq*0], xm16 + movhps [dstq+dsq*1], xm16 + movq [dstq+dsq*2], xm17 + movhps [dstq+r6 ], xm17 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + vzeroupper + RET +.w8: + pmovzxbw m2, [maskq] + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + vinserti32x4 m0, [dstq+dsq*2], 2 + vinserti32x4 m0, [dstq+r6 ], 3 + pmullw m2, m6 + psubw m1, m0, [tmpq] + add maskq, 32 + add tmpq, 64 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + vextracti32x4 [dstq+dsq*2], m0, 2 + vextracti32x4 [dstq+r6 ], m0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pmovzxbw m4, [maskq+32*0] + pmovzxbw m5, [maskq+32*1] + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + mova ym1, [dstq+dsq*2] + vinserti32x8 m1, [dstq+r6 ], 1 + pmullw m4, m6 + pmullw m5, m6 + psubw m2, m0, [tmpq+64*0] + psubw m3, m1, [tmpq+64*1] + add maskq, 32*2 + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + mova [dstq+dsq*2], ym1 + vextracti32x8 [dstq+r6 ], m1, 1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w16 + RET +.w32: + pmovzxbw m4, [maskq+32*0] + pmovzxbw m5, [maskq+32*1] + mova m0, [dstq+dsq*0] + mova m1, [dstq+dsq*1] + pmullw m4, m6 + pmullw m5, m6 + psubw m2, m0, [tmpq+ 64*0] + psubw m3, m1, [tmpq+ 64*1] + add maskq, 32*2 + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32 + RET + +cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h + lea r5, [blend_v_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + vpbroadcastd xmm2, [obmc_masks_avx2+2*2] +.w2_loop: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movq xmm1, [tmpq] + add tmpq, 4*2 + psubw xmm1, xmm0, xmm1 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + vpbroadcastq xmm2, [obmc_masks_avx2+4*2] +.w4_loop: + movq xmm0, [dstq+dsq*0] + movhps xmm0, [dstq+dsq*1] + psubw xmm1, xmm0, [tmpq] + add tmpq, 8*2 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + psubw ym1, ym0, [tmpq] + add tmpq, 16*2 + pmulhrsw ym1, ym2 + paddw ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] +.w16_loop: + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + psubw m1, m0, [tmpq] + add tmpq, 32*2 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: + mova m4, [obmc_masks_avx2+32*2] +.w32_loop: + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 64*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32_loop + RET + +cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask +%define base r6-$$ + lea r6, [$$] + tzcnt wd, wm + mov hd, hm + movsxd wq, [base+blend_h_avx512icl_table+wq*4] + lea maskq, [base+obmc_masks_avx2+hq*2] + lea hd, [hq*3] + lea wq, [base+blend_h_avx512icl_table+wq] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movd xmm2, [maskq+hq*2] + movq xmm1, [tmpq] + add tmpq, 4*2 + punpcklwd xmm2, xmm2 + psubw xmm1, xmm0, xmm1 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova xmm3, [blend_shuf] +.w4_loop: + movq xmm0, [dstq+dsq*0] + movhps xmm0, [dstq+dsq*1] + movd xmm2, [maskq+hq*2] + psubw xmm1, xmm0, [tmpq] + add tmpq, 8*2 + pshufb xmm2, xmm3 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + vbroadcasti32x4 ym3, [blend_shuf] + shufpd ym3, ym3, 0x0c +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + vpbroadcastd ym2, [maskq+hq*2] + psubw ym1, ym0, [tmpq] + add tmpq, 16*2 + pshufb ym2, ym3 + pmulhrsw ym1, ym2 + paddw ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +.w16: + vbroadcasti32x4 m3, [blend_shuf] + shufpd m3, m3, 0xf0 +.w16_loop: + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + vpbroadcastd m2, [maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 32*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16_loop + RET +.w32: + vpbroadcastw m4, [maskq+hq*2] + vpbroadcastw m5, [maskq+hq*2+2] + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 64*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w32 + RET +.w64: + vpbroadcastw m4, [maskq+hq*2] + mova m0, [dstq+64*0] + psubw m2, m0, [tmpq+64*0] + mova m1, [dstq+64*1] + psubw m3, m1, [tmpq+64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + vpbroadcastw m8, [maskq+hq*2] + mova m0, [dstq+64*0] + psubw m4, m0, [tmpq+64*0] + mova m1, [dstq+64*1] + psubw m5, m1, [tmpq+64*1] + mova m2, [dstq+64*2] + psubw m6, m2, [tmpq+64*2] + mova m3, [dstq+64*3] + psubw m7, m3, [tmpq+64*3] + add tmpq, 64*4 + REPX {pmulhrsw x, m8}, m4, m5, m6, m7 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + inc hq + jl .w128 + RET + +cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0, pxmax + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + mov r6, ~0 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + kmovq k6, r6 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax + LEA r7, $$ +%define base r7-$$ + vpbroadcastd m3, [base+pd_16384] + vpbroadcastd m7, [base+pd_63] + mova m24, [base+resize_permA] + mova m25, [base+resize_permB] + mova m26, [base+resize_permC] + mova m27, [base+resize_permD] + vbroadcasti32x4 m28, [base+resize_shufA] + vbroadcasti32x4 m29, [base+resize_shufB] + mova m30, [base+resize_permE] + vpbroadcastw ym31, pxmaxm + vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] + pslld m5, 4 ; dx*16 + pslld m6, 14 + pxor m2, m2 +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + vptestmd k5, m1, m1 + pand m9, m7 ; filter offset (masked) + ktestw k5, k5 + jz .load + vpbroadcastq m14, [base+pd_0_4] + vpermq m10, m0, q1100 + vpermq m11, m0, q3322 + vpermq m20, m1, q1100 + vpermq m21, m1, q3322 + punpckldq m10, m10 + punpckldq m11, m11 + punpckldq m20, m20 + punpckldq m21, m21 + paddd m10, m14 + paddd m11, m14 + paddd m20, m14 + paddd m21, m14 + vextracti32x8 ym12, m10, 1 + vextracti32x8 ym13, m11, 1 + vextracti32x8 ym22, m20, 1 + vextracti32x8 ym23, m21, 1 + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3 + vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7 + vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B + vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2] + vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2] + vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2] + vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2] + pshufb m16, m0 + pshufb m17, m1 + pshufb m18, m14 + pshufb m19, m15 + mova m20, m24 + mova m22, m24 + mova m21, m25 + mova m23, m25 + vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b + vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d + vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb + vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd + mova m15, m26 + mova m17, m26 + mova m16, m27 + mova m18, m27 + vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa + vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb + vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc + vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd + kmovq k1, k6 + kmovq k2, k6 + vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] + vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] + pshufb m10, m11, m28 + pshufb m11, m11, m29 + pshufb m12, m13, m28 + pshufb m13, m13, m29 + jmp .filter +.load: + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] + vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] + pshufb m10, m11, m28 + pshufb m11, m11, m29 + pshufb m12, m13, m28 + pshufb m13, m13, m29 + vpgatherdd m15{k3}, [srcq+m0*2+ 0] + vpgatherdd m16{k4}, [srcq+m0*2+ 4] + kmovq k1, k6 + kmovq k2, k6 + vpgatherdd m17{k1}, [srcq+m0*2+ 8] + vpgatherdd m18{k2}, [srcq+m0*2+12] +.filter: + mova m14, m2 + vpdpwssd m14, m15, m10 + vpdpwssd m14, m16, m11 + vpdpwssd m14, m17, m12 + vpdpwssd m14, m18, m13 + psubd m14, m3, m14 + psrad m14, 15 + packusdw m14, m14 + vpermq m14, m30, m14 + pminsw ym14, ym31 + mova [dstq+xq*2], ym14 + paddd m4, m5 + add xd, 16 + cmp xd, dst_wd + jl .loop_x + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +%endif ; ARCH_X86_64 -- cgit v1.2.3