path: root/third_party/dav1d/src/x86/mc_avx512.asm
Diffstat (limited to 'third_party/dav1d/src/x86/mc_avx512.asm')
-rw-r--r--  third_party/dav1d/src/x86/mc_avx512.asm  4538
1 file changed, 4538 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
new file mode 100644
index 0000000000..7897f1decc
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -0,0 +1,4538 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+obmc_masks:
+pw_512: times 2 dw 512
+ ; 2
+ db 45, 19, 64, 0
+ ; 4
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+ db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+
+warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20
+ db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22
+ db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24
+ db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
+warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24
+ db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26
+ db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28
+ db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30
+warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
+warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
+pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
+warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43
+pd_16384: dd 16384
+pd_262144: dd 262144
+warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54
+warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
+ db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
+bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
+ db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
+ db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31
+ db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
+ db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47
+ db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
+ db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+ db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
+ db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
+bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
+ db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
+bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
+ db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
+bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
+ db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
+ db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
+bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
+ db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
+bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
+spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
+ db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
+spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+ db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
+spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
+ db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
+spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+ db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
+ db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
+spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
+ db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
+spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
+ db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
+spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13
+ db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
+spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
+ db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
+ db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
+ db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
+spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55
+ db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63
+ db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71
+ db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79
+spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
+ db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
+ db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45
+ db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
+spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
+ db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
+spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36
+ db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38
+spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
+ db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
+ db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52
+ db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54
+spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40
+ db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42
+ db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48
+ db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50
+spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
+ db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10
+ db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
+ db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
+spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12
+ db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14
+ db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20
+ db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22
+spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+resize_permC: dd 0, 4, 8, 12
+pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
+
+wm_420_perm64: dq 0xfedcba9876543210
+wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
+
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+pb_127: times 4 db 127
+pw_m128: times 2 dw -128
+pw_m256: times 2 dw -256
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_34: dd 34
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+
+%define pb_m64 (wm_sign+4)
+%define pb_64 (wm_sign+8)
+%define pd_2 (pd_0to7+8)
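+; the three defines above reuse bytes already present in wm_sign and
+; pd_0to7 rather than emitting separate constants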
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
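+; the -8 bias lets [subpel_filters+idx*8] address entry idx-1, since the
+; subpel positions used as indices start at 1 rather than 0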
+cextern mc_warp_filter
+cextern resize_filter
+
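+; The macros below build per-width jump tables. BASE_JMP_TABLE and
+; HV_JMP_TABLE emit 16-bit offsets relative to a base label (re-added at
+; dispatch time), while BIDIR_JMP_TABLE emits 32-bit offsets; the bitmask
+; argument of HV_JMP_TABLE selects which of the h/v/hv tables are emitted.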
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
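+; temporarily switch the register set to 256-bit for the wrapped
+; instruction(s), then restore the 512-bit defaults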
+%macro WRAP_YMM 1+
+INIT_YMM cpuname
+ %1
+INIT_ZMM cpuname
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea r7, [put_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu xmm0, [srcq+ssq*0]
+ movu xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], xmm0
+ mova [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu ym0, [srcq+ssq*0]
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], ym0
+ mova [dstq+dsq*1], ym1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+64*0], m0
+ mova [dstq+dsq*0+64*1], m1
+ mova [dstq+dsq*1+64*0], m2
+ mova [dstq+dsq*1+64*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
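+ ; A rough scalar sketch (not part of the original source) of one output
+ ; row, with mx in 1..15:
+ ;   for (int x = 0; x < w; x++)
+ ;       dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4;
+ ; The 0xff01 multiply below packs (16-mx) and mx into one coefficient
+ ; word for pmaddubsw; pmulhrsw with pw_2048 then applies the (+8)>>4.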
+ imul mxyd, 0xff01
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m3, [pw_2048]
+ add wq, r7
+ jmp wq
+.h_w2:
+ movd xmm0, [srcq+ssq*0]
+ pinsrd xmm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm4
+ pmaddubsw xmm0, xm5
+ pmulhrsw xmm0, xm3
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xmm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xm5
+ pmulhrsw xmm0, xm3
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ pmulhrsw ym0, ym3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+ pmulhrsw m0, m3
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ add srcq, ssq
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m6, [srcq+8*9]
+ add srcq, ssq
+ REPX {pshufb x, m4}, m0, m2, m1, m6
+ REPX {pmaddubsw x, m5}, m0, m2, m1, m6
+ REPX {pmulhrsw x, m3}, m0, m2, m1, m6
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0xff01
+ vpbroadcastd m5, [pw_2048]
+ add mxyd, 16 << 8
+ add wq, r7
+ vpbroadcastw m4, mxyd
+ jmp wq
+.v_w2:
+ movd xmm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xmm1, xmm1, q2301 ; 1 0
+ punpcklbw xmm1, xmm0, xmm1
+ pmaddubsw xmm1, xm4
+ pmulhrsw xmm1, xm5
+ packuswb xmm1, xmm1
+ pextrw [dstq+dsq*0], xmm1, 1
+ pextrw [dstq+dsq*1], xmm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xmm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm1, xmm0, 0x02 ; 1 2
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xm4
+ pmulhrsw xmm1, xm5
+ packuswb xmm1, xmm1
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xmm0, [srcq+ssq*0]
+.v_w8_loop:
+ movq xmm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw xmm1, xmm3, xmm0
+ movq xmm0, [srcq+ssq*0]
+ punpcklbw xmm2, xmm0, xmm3
+ pmaddubsw xmm1, xm4
+ pmaddubsw xmm2, xm4
+ pmulhrsw xmm1, xm5
+ pmulhrsw xmm2, xm5
+ packuswb xmm1, xmm2
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xmm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 ymm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1
+ vbroadcasti128 ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2
+ punpcklbw ymm1, ymm2, ymm3
+ punpckhbw ymm2, ymm3
+ pmaddubsw ymm1, ym4
+ pmaddubsw ymm2, ym4
+ pmulhrsw ymm1, ym5
+ pmulhrsw ymm2, ym5
+ packuswb ymm1, ymm2
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ vzeroupper
+ RET
+.v_w32:
+ movu ym0, [srcq+ssq*0]
+ kxnorb k1, k1, k1
+.v_w32_loop:
+ vbroadcasti32x8 m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendmd m3{k1}, m2, m0 ; 0 1
+ vbroadcasti32x8 m0, [srcq+ssq*0]
+ vpblendmd m2{k1}, m0, m2 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+ssq*0]
+.v_w64_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m3, m0
+ punpckhbw m6, m3, m0
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m6, m4
+ punpcklbw m2, m0, m3
+ punpckhbw m7, m0, m3
+ pmaddubsw m2, m4
+ pmaddubsw m7, m4
+ REPX {pmulhrsw x, m5}, m1, m6, m2, m7
+ packuswb m1, m6
+ packuswb m2, m7
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+.v_w128_loop:
+ add srcq, ssq
+ movu m2, [srcq+64*0]
+ movu m3, [srcq+64*1]
+ punpcklbw m6, m2, m0
+ pmaddubsw m6, m4
+ punpckhbw m0, m2, m0
+ pmaddubsw m0, m4
+ punpcklbw m7, m3, m1
+ pmaddubsw m7, m4
+ punpckhbw m1, m3, m1
+ pmaddubsw m1, m4
+ REPX {pmulhrsw x, m5}, m6, m0, m7, m1
+ packuswb m6, m0
+ mova m0, m2
+ packuswb m7, m1
+ mova m1, m3
+ mova [dstq+64*0], m6
+ mova [dstq+64*1], m7
+ add dstq, dsq
+ dec hd
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
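+ ; Here src[] denotes the 16x-scaled intermediate from the horizontal
+ ; pass, i.e. (16 - mx) * p[x] + mx * p[x + 1]. A rough scalar sketch
+ ; (not part of the original source):
+ ;   int a = h[x], b = h[x + src_stride];
+ ;   dst[x] = (a + ((my * (b - a)) >> 4) + 8) >> 4;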
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
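+ ; the missing factor of two is restored in the loops below by doubling
+ ; the difference (paddw x, x) before the pmulhw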
+ vpbroadcastd m7, [pw_2048]
+ add wq, r7
+ vpbroadcastw m6, mxyd
+ jmp wq
+.hv_w2:
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ pshufb xmm0, xm4
+ pmaddubsw xmm0, xm5
+.hv_w2_loop:
+ movd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xmm1, [srcq+ssq*0], 1
+ pshufb xmm1, xm4
+ pmaddubsw xmm1, xm5 ; 1 _ 2 _
+ shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm6
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm7
+ packuswb xmm1, xmm1
+ pextrw [dstq+dsq*0], xmm1, 0
+ pextrw [dstq+dsq*1], xmm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xmm4, [bilin_h_shuf4]
+ movddup xmm0, [srcq+ssq*0]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xm5
+.hv_w4_loop:
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm1, [srcq+ssq*0]
+ pshufb xmm1, xmm4
+ pmaddubsw xmm1, xm5 ; 1 2
+ shufps xmm2, xmm0, xmm1, q1032 ; 0 1
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm6
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm7
+ packuswb xmm1, xmm1
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 ym0, [srcq+ssq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 ym1, [srcq+ssq*0], 1
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2
+ valignq ym2, ym1, ym0, 2
+ mova ym0, ym1
+ psubw ym1, ym2
+ paddw ym1, ym1
+ pmulhw ym1, ym6
+ paddw ym1, ym2
+ pmulhrsw ym1, ym7
+ vpmovuswb xm1, ym1
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ vbroadcasti32x8 m0, [srcq+ssq*0]
+ mova m4, [bilin_h_perm16]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m1, [srcq+ssq*0], 1
+ vpermb m1, m4, m1
+ pmaddubsw m1, m5 ; 1 2
+ valignq m2, m1, m0, 4 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ vpmovuswb ym1, m1
+ mova [dstq+dsq*0], xm1
+ vextracti32x4 [dstq+dsq*1], ym1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+ssq*0]
+ pmovzxbq m8, [pb_02461357]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m2, m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpermb m3, m4, [srcq+ssq*0]
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m2
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ vpermq m1, m8, m1
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ psubw m9, m3, m1
+ paddw m8, m8
+ pmulhw m8, m6
+ paddw m9, m9
+ pmulhw m9, m6
+ paddw m8, m0
+ pmulhrsw m8, m7
+ paddw m9, m1
+ pmulhrsw m9, m7
+ mova m0, m2
+ mova m1, m3
+ packuswb m8, m9
+ mova [dstq], m8
+ add dstq, dsq
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ movu m2, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ REPX {pshufb x, m4}, m0, m1, m2, m3
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, ssq
+ movu m8, [srcq+8*0]
+ movu m9, [srcq+8*1]
+ movu m10, [srcq+8*8]
+ movu m11, [srcq+8*9]
+ REPX {pshufb x, m4}, m8, m9, m10, m11
+ REPX {pmaddubsw x, m5}, m8, m9, m10, m11
+ psubw m12, m8, m0
+ psubw m13, m9, m1
+ psubw m14, m10, m2
+ psubw m15, m11, m3
+ paddw m12, m12
+ pmulhw m12, m6
+ paddw m13, m13
+ pmulhw m13, m6
+ paddw m14, m14
+ pmulhw m14, m6
+ paddw m15, m15
+ pmulhw m15, m6
+ paddw m12, m0
+ pmulhrsw m12, m7
+ paddw m13, m1
+ pmulhrsw m13, m7
+ paddw m14, m2
+ pmulhrsw m14, m7
+ paddw m15, m3
+ pmulhrsw m15, m7
+ mova m0, m8
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+ packuswb m12, m13
+ packuswb m14, m15
+ mova [dstq+64*0], m12
+ mova [dstq+64*1], m14
+ add dstq, dsq
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+DECLARE_REG_TMP 3, 5, 6
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea t2, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [t2+wq*2+table_offset(prep,)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xmm0, [srcq+strideq*0]
+ pinsrd xmm0, [srcq+strideq*1], 1
+ pinsrd xmm0, [srcq+strideq*2], 2
+ pinsrd xmm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw ym0, xmm0
+ psllw ym0, 4
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti128 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pmovzxbw m0, ym0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu xmm0, [srcq+strideq*0]
+ vinserti128 ym0, ymm0, [srcq+strideq*1], 1
+ movu xmm1, [srcq+strideq*2]
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, ym0
+ pmovzxbw m1, ym1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+strideq*0+32*0]
+ pmovzxbw m1, [srcq+strideq*0+32*1]
+ pmovzxbw m2, [srcq+strideq*1+32*0]
+ pmovzxbw m3, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+32*0]
+ pmovzxbw m1, [srcq+32*1]
+ pmovzxbw m2, [srcq+32*2]
+ pmovzxbw m3, [srcq+32*3]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
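+ ; unlike the put path there is no rounding or downshift here: the
+ ; 16x-scaled pmaddubsw result is stored directly as a 16-bit intermediate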
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+.h_w8_loop:
+ movu xmm0, [srcq+strideq*0]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpermb m0, m4, m0
+ vpermb m1, m4, m1
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m4, [bilin_h_perm32]
+.h_w32_loop:
+ vpermb m0, m4, [srcq+strideq*0]
+ vpermb m1, m4, [srcq+strideq*1]
+ vpermb m2, m4, [srcq+strideq*2]
+ vpermb m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .h_w32_loop
+ RET
+.h_w64:
+ mova m4, [bilin_h_perm32]
+.h_w64_loop:
+ vpermb m0, m4, [srcq+strideq*0+32*0]
+ vpermb m1, m4, [srcq+strideq*0+32*1]
+ vpermb m2, m4, [srcq+strideq*1+32*0]
+ vpermb m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .h_w64_loop
+ RET
+.h_w128:
+ mova m4, [bilin_h_perm32]
+.h_w128_loop:
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .h_w128_loop
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ add wq, t2
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, mxyd
+ jmp wq
+.v_w4:
+ vpbroadcastd xm0, [srcq+strideq*0]
+ mov r3d, 0x29
+ vbroadcasti32x4 ym3, [bilin_v_shuf4]
+ kmovb k1, r3d
+.v_w4_loop:
+ vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
+ vpbroadcastd ym2, [srcq+strideq*2]
+ vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ym0, [srcq+strideq*0]
+ punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
+ pshufb ym2, ym3
+ pmaddubsw ym2, ym6
+ mova [tmpq], ym2
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ mova m5, [bilin_v_perm8]
+ vbroadcasti32x4 ym0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpbroadcastq ym0, [srcq+strideq*2]
+ vinserti32x4 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym0, [srcq+strideq*0], 0
+ vpermt2b m1, m5, m0
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mova m5, [bilin_v_perm16]
+ movu xm0, [srcq+strideq*0]
+.v_w16_loop:
+ movu xm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpermt2b m1, m5, m2
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ vpermt2b m2, m5, m0
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m5, [bilin_v_perm32]
+ movu ym0, [srcq+strideq*0]
+.v_w32_loop:
+ movu ym2, [srcq+strideq*1]
+ movu ym3, [srcq+strideq*2]
+ movu ym4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpermt2b m0, m5, m2
+ vpermt2b m2, m5, m3
+ vpermt2b m3, m5, m4
+ pmaddubsw m1, m0, m6
+ movu ym0, [srcq+strideq*0]
+ vpermt2b m4, m5, m0
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m4
+ add tmpq, 64*4
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0]
+.v_w64_loop:
+ vpermq m1, m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m1, m0
+ punpckhbw m2, m1, m0
+ vpermq m0, m5, [srcq+strideq*0]
+ punpcklbw m3, m0, m1
+ punpckhbw m1, m0, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m1
+ add tmpq, 64*4
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+.v_w128_loop:
+ vpermq m2, m5, [srcq+strideq*1+ 0]
+ vpermq m3, m5, [srcq+strideq*1+64]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ punpckhbw m0, m2, m0
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m0
+ punpcklbw m4, m3, m1
+ punpckhbw m1, m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*2], m4
+ mova [tmpq+64*3], m1
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+ punpcklbw m4, m0, m2
+ punpckhbw m2, m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*4], m4
+ mova [tmpq+64*5], m2
+ punpcklbw m4, m1, m3
+ punpckhbw m3, m1, m3
+ pmaddubsw m4, m6
+ pmaddubsw m3, m6
+ mova [tmpq+64*6], m4
+ mova [tmpq+64*7], m3
+ add tmpq, 64*8
+ sub hd, 2
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
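+ ; pmulhrsw with (my << 11) computes exactly (d * my + 8) >> 4 for a
+ ; 16-bit difference d, so the second form above maps directly onto the
+ ; psubw/pmulhrsw/paddw sequences in the loops below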
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastw m6, mxyd
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+ vpbroadcastq ym0, [srcq+strideq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w4_loop:
+ movq xmm1, [srcq+strideq*1]
+ movq xmm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1
+ punpcklqdq ym1, ym2
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym6
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xmm1, [srcq+strideq*1]
+ vinserti128 ym1, ymm1, [srcq+strideq*2], 1
+ vinserti128 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m1, [srcq+strideq*0], 3
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mova m4, [bilin_h_perm16]
+ vbroadcasti32x8 m0, [srcq+strideq*0]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1]
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ movu ym2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti32x8 m2, [srcq+strideq*0], 1
+ vpermb m1, m4, m1
+ vpermb m2, m4, m2
+ pmaddubsw m1, m5 ; 1 2
+ vshufi32x4 m3, m0, m1, q1032 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vshufi32x4 m2, m1, m0, q1032 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ sub hd, 4
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+strideq*0]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m1, m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermb m2, m4, [srcq+strideq*0]
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+64*0], m3
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, strideq
+ vpermb m2, m4, [srcq+32*0]
+ vpermb m3, m4, [srcq+32*1]
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m7, m2, m0
+ psubw m8, m3, m1
+ pmulhrsw m7, m6
+ pmulhrsw m8, m6
+ paddw m7, m0
+ mova m0, m2
+ paddw m8, m1
+ mova m1, m3
+ mova [tmpq+64*0], m7
+ mova [tmpq+64*1], m8
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ vpermb m7, m4, [srcq+32*0]
+ vpermb m8, m4, [srcq+32*1]
+ vpermb m9, m4, [srcq+32*2]
+ vpermb m10, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m7, m8, m9, m10
+ psubw m11, m7, m0
+ psubw m12, m8, m1
+ psubw m13, m9, m2
+ psubw m14, m10, m3
+ REPX {pmulhrsw x, m6}, m11, m12, m13, m14
+ paddw m11, m0
+ mova m0, m7
+ paddw m12, m1
+ mova m1, m8
+ paddw m13, m2
+ mova m2, m9
+ paddw m14, m3
+ mova m3, m10
+ mova [tmpq+64*0], m11
+ mova [tmpq+64*1], m12
+ mova [tmpq+64*2], m13
+ mova [tmpq+64*3], m14
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
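+; Each FILTER_* constant packs two byte offsets into the filter table: the
+; upper half selects the 8-tap filter set and the lower half the 4-tap set
+; used for small block sizes. The imul by 0x010101 at function entry
+; replicates mx/my into the matching byte lanes, so a single add produces
+; both candidate table indices at once.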
+
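+; FN emits a small entry point per (h,v) filter-type combination that loads
+; the packed filter-table offsets into t0d/t1d and then jumps into (or, for
+; the last one, falls through to) the shared implementation that follows.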
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
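+; Horizontal 8-tap filter for the pixels in m%1 using vpdpbusd, with m5 as
+; the rounding bias and m9/m10 holding the low/high four taps; the result
+; is left in m%1 as unsigned words already shifted down by 6.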
+%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
+%if %5
+ vpermb m%2, m6, m%1
+ vpermb m%3, m7, m%1
+ vpermb m%4, m8, m%1
+%else
+%if %2 < %4 ; reuse a previous value if possible
+ pshufb m%2, m%1, m6
+%endif
+ pshufb m%3, m%1, m7
+ pshufb m%4, m%1, m8
+%endif
+ mova m%1, m5
+ vpdpbusd m%1, m%2, m9
+ mova m%2, m5
+ vpdpbusd m%2, m%3, m9
+ vpdpbusd m%1, m%3, m10
+ vpdpbusd m%2, m%4, m10
+ packusdw m%1, m%2
+ psrlw m%1, 6
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%define base r8-put_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [base+mxq*8+subpel_filters+0]
+ vpbroadcastd m10, [base+mxq*8+subpel_filters+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xmm4, [subpel_h_shuf4]
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w2_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ mova xmm1, xm5
+ vpdpbusd xmm1, xmm0, xmm3
+ packssdw xmm0, xmm1, xmm1
+ psraw xmm0, 6
+ packuswb xmm0, xm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm6
+ pshufb xmm1, xm6
+ mova xmm2, xm5
+ vpdpbusd xmm2, xmm0, xmm3
+ mova xmm0, xm5
+ vpdpbusd xmm0, xmm1, xmm3
+ packssdw xmm0, xmm2, xmm0
+ psraw xmm0, 6
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m6, [spel_h_perm16a]
+ mova m7, [spel_h_perm16b]
+ mova m8, [spel_h_perm16c]
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3, 1
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ add srcq, ssq
+ PUT_8TAP_H 0, 4, 11, 12
+ PUT_8TAP_H 2, 12, 11, 4
+ PUT_8TAP_H 1, 4, 11, 12
+ PUT_8TAP_H 3, 12, 11, 4
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
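+ ; blocks with h < 6 use the 4-tap variant (low byte of the packed index)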
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [base+subpel_filters+myq*8]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xmm2, [srcq+ssq*0]
+ pinsrw xmm2, [srcq+ssq*1], 2
+ pinsrw xmm2, [srcq+ssq*2], 4
+ add srcq, ss3q
+ pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm9 ; a1 b1
+ paddw xmm5, xmm2
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm5, xmm4
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+ pextrw [dstq+dsq*0], xmm5, 0
+ pextrw [dstq+dsq*1], xmm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, ss3q
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+.v_w4_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm9 ; a1 b1
+ paddw xmm5, xmm2
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm5, xmm4
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq ymm5, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklbw ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm5, 0x30
+ vpblendd ymm5, ymm3, 0x30
+ punpcklbw ymm2, ymm5 ; 23 34
+ vpblendd ymm3, ymm4, 0x30
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 45 56
+.v_w8_loop:
+ vpbroadcastq ymm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw ymm5, ymm1, ym8 ; a0 b0
+ mova ymm1, ymm2
+ pmaddubsw ymm2, ym9 ; a1 b1
+ paddw ymm5, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym10 ; a2 b2
+ paddw ymm5, ymm3
+ vpblendd ymm3, ymm0, ymm4, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm4, ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 67 78
+ pmaddubsw ymm4, ymm3, ym11 ; a3 b3
+ paddw ymm5, ymm4
+ pmulhrsw ymm5, ym7
+ vextracti128 xmm4, ymm5, 1
+ packuswb xmm5, xmm4
+ movq [dstq+dsq*0], xmm5
+ movhps [dstq+dsq*1], xmm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ vzeroupper
+ RET
+.v_w16:
+ mova m12, [spel_v_perm16]
+ vbroadcasti32x4 m1, [srcq+ssq*0]
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ mov r6d, 0x0f
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti32x4 ym5, [srcq+ssq*0]
+ kmovb k1, r6d
+ vbroadcasti32x4 m3, [srcq+ssq*1]
+ vbroadcasti32x4 ym6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m1{k1}, m4, m2, 0xcc
+ vshufpd m2{k1}, m5, m3, 0xcc
+ vshufpd m3{k1}, m6, m0, 0xcc
+ vpermb m1, m12, m1 ; 01 12
+ vpermb m2, m12, m2 ; 23 34
+ vpermb m3, m12, m3 ; 45 56
+.v_w16_loop:
+ pmaddubsw m4, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m5, m2, m9 ; a1 b1
+ mova m2, m3
+ pmaddubsw m6, m3, m10 ; a2 b2
+ mova m3, m0
+ paddw m4, m5
+ vbroadcasti32x4 ym5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m3{k1}, m5, m0, 0xcc
+ vpermb m3, m12, m3 ; 67 78
+ pmaddubsw m5, m3, m11 ; a3 b3
+ paddw m4, m6
+ paddw m4, m5
+ pmulhrsw m4, m7
+ vextracti32x8 ym5, m4, 1
+ packuswb ym4, ym5
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m12, [spel_v_perm32]
+ pmovzxbq m14, [pb_02461357]
+ vpshrdw m13, m12, m12, 8
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ vpermb m1, m12, m0 ; 01
+ vinserti32x8 m0, [srcq+ssq*2], 0
+ add srcq, ss3q
+ vpermb m2, m13, m0 ; 12
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ vpermb m3, m12, m0 ; 23
+ vinserti32x8 m0, [srcq+ssq*1], 0
+ vpermb m4, m13, m0 ; 34
+ vinserti32x8 m0, [srcq+ssq*2], 1
+ add srcq, ss3q
+ vpermb m5, m12, m0 ; 45
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m6, m13, m0 ; 56
+.v_w32_loop:
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m15, m1, m8
+ mova m1, m3
+ pmaddubsw m16, m2, m8
+ mova m2, m4
+ pmaddubsw m17, m3, m9
+ mova m3, m5
+ pmaddubsw m18, m4, m9
+ mova m4, m6
+ pmaddubsw m19, m5, m10
+ vpermb m5, m12, m0 ; 67
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ pmaddubsw m20, m6, m10
+ vpermb m6, m13, m0 ; 78
+ paddw m15, m17
+ pmaddubsw m17, m5, m11
+ paddw m16, m18
+ pmaddubsw m18, m6, m11
+ paddw m15, m19
+ paddw m16, m20
+ paddw m15, m17
+ paddw m16, m18
+ pmulhrsw m15, m7
+ pmulhrsw m16, m7
+ packuswb m15, m16
+ vpermq m15, m14, m15
+ mova [dstq+dsq*0], ym15
+ vextracti32x8 [dstq+dsq*1], m15, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+.v_w128:
+ lea r6d, [hq+wq*4-256]
+ mov r4, srcq
+ mov r7, dstq
+.v_loop0:
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movu m13, [srcq+ssq*0]
+ movu m15, [srcq+ssq*1]
+ movu m17, [srcq+ssq*2]
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0]
+ punpcklbw m1, m2, m4 ; 01l
+ punpckhbw m2, m4 ; 01h
+ punpcklbw m3, m4, m6 ; 12l
+ punpckhbw m4, m6 ; 12h
+ punpcklbw m5, m6, m13 ; 23l
+ punpckhbw m6, m13 ; 23h
+ punpcklbw m12, m13, m15 ; 34l
+ punpckhbw m13, m15 ; 34h
+ punpcklbw m14, m15, m17 ; 45l
+ punpckhbw m15, m17 ; 45h
+ punpcklbw m16, m17, m0 ; 56l
+ punpckhbw m17, m0 ; 56h
+.v_loop:
+ pmaddubsw m18, m1, m8 ; a0l
+ mova m1, m5
+ pmaddubsw m19, m2, m8 ; a0h
+ mova m2, m6
+ pmaddubsw m20, m3, m8 ; b0l
+ mova m3, m12
+ pmaddubsw m21, m4, m8 ; b0h
+ mova m4, m13
+ pmaddubsw m5, m9 ; a1l
+ pmaddubsw m6, m9 ; a1h
+ pmaddubsw m12, m9 ; b1l
+ pmaddubsw m13, m9 ; b1h
+ paddw m18, m5
+ mova m5, m14
+ pmaddubsw m14, m10 ; a2l
+ paddw m19, m6
+ mova m6, m15
+ pmaddubsw m15, m10 ; a2h
+ paddw m20, m12
+ mova m12, m16
+ pmaddubsw m16, m10 ; b2l
+ paddw m21, m13
+ mova m13, m17
+ pmaddubsw m17, m10 ; b2h
+ paddw m18, m14
+ paddw m19, m15
+ paddw m20, m16
+ paddw m21, m17
+ movu m17, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m14, m0, m17 ; 67l
+ punpckhbw m15, m0, m17 ; 67h
+ pmaddubsw m16, m14, m11 ; a3l
+ pmaddubsw m0, m15, m11 ; a3h
+ paddw m18, m16
+ paddw m19, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m16, m17, m0 ; 78l
+ punpckhbw m17, m0 ; 78h
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+ mova [dstq+dsq*0], m18
+ pmaddubsw m18, m16, m11 ; b3l
+ pmaddubsw m19, m17, m11 ; b3h
+ paddw m18, m20
+ paddw m19, m21
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+ mova [dstq+dsq*1], m18
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_loop
+ add r4, 64
+ add r7, 64
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 256
+ jg .v_loop0
+ vzeroupper
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastq ym0, [base+subpel_filters+myq*8]
+ lea ss3q, [ssq*3]
+ vpbroadcastd ym9, [pd_32768]
+ mov r6, srcq
+ punpcklbw ym0, ym8, ym0
+ sub r6, ss3q
+ psraw ym0, 2 ; << 6
+ mova xm14, [spel_hv_end]
+ pshufd ym10, ym0, q0000
+ pshufd ym11, ym0, q1111
+ pshufd ym12, ym0, q2222
+ pshufd ym13, ym0, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 ym6, [subpel_h_shuf4]
+ movq xmm2, [r6+ssq*0]
+ movhps xmm2, [r6+ssq*1]
+ movq xmm0, [r6+ssq*2]
+ movhps xmm0, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq ymm1, [srcq+ssq*0]
+ vpblendd ymm2, ymm3, 0x30
+ vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _
+ vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5
+ pshufb ymm2, ym6
+ pshufb ymm0, ym6
+ mova ymm1, ym8
+ vpdpbusd ymm1, ymm2, ym7
+ mova ymm2, ym8
+ vpdpbusd ymm2, ymm0, ym7
+ packssdw ymm2, ymm1, ymm2
+ psraw ymm2, 2
+ vextracti128 xmm3, ymm2, 1
+ palignr xmm4, xmm3, xmm2, 4
+ punpcklwd xmm1, xmm2, xmm4 ; 01 12
+ punpckhwd xmm2, xmm4 ; 23 34
+ pshufd xmm0, xmm3, q2121
+ punpcklwd xmm3, xmm0 ; 45 56
+.hv_w2_loop:
+ movq xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm4, [srcq+ssq*0]
+ mova xmm5, xm9
+ vpdpwssd xmm5, xmm1, xm10 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm5, xmm2, xm11 ; a1 b1
+ pshufb xmm4, xm6
+ mova xmm2, xmm3
+ vpdpwssd xmm5, xmm3, xm12 ; a2 b2
+ mova xmm3, xm8
+ vpdpbusd xmm3, xmm4, xm7
+ packssdw xmm4, xmm3, xmm3
+ psraw xmm4, 2
+ palignr xmm3, xmm4, xmm0, 12
+ mova xmm0, xmm4
+ punpcklwd xmm3, xmm4 ; 67 78
+ vpdpwssd xmm5, xmm3, xm13 ; a3 b3
+ packuswb xmm5, xmm5
+ pshufb xmm5, xm14
+ pextrw [dstq+dsq*0], xmm5, 0
+ pextrw [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ vzeroupper
+ RET
+.hv_w4:
+ movq xmm1, [r6+ssq*0]
+ vpbroadcastq ym2, [r6+ssq*1]
+ vinserti32x4 ym1, ymm1, [r6+ssq*2], 1
+ vinserti32x4 m2, [srcq+ssq*0], 2
+ vinserti32x4 m1, [srcq+ssq*1], 2
+ vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5
+ vbroadcasti32x4 m6, [subpel_h_shufA]
+ add srcq, ss3q
+ vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6
+ pshufb m2, m6
+ pshufb m1, m6
+ mova m0, m8
+ vpdpbusd m0, m2, m7
+ mova m4, m8
+ vpdpbusd m4, m1, m7
+ mova ym1, [spel_hv_perm4a]
+ mova ym2, [spel_hv_perm4b]
+ mova ym3, [spel_hv_perm4c]
+ packssdw m0, m4
+ psraw m0, 2 ; _ 0 1 2 3 4 5 6
+ mov r6d, 0x5555
+ vpermb ym1, ym1, ym0 ; 01 12
+ vpermb m2, m2, m0 ; 23 34
+ vpermb m3, m3, m0 ; 45 56
+ kmovw k1, r6d
+ mova ym15, [spel_hv_perm4d]
+.hv_w4_loop:
+ movq xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1
+ mova ym5, ym9
+ vpdpwssd ym5, ym1, ym10 ; a0 b0
+ mova ym1, ym2
+ pshufb ym4, ym6
+ mova ym0, ym8
+ vpdpbusd ym0, ym4, ym7
+ vpdpwssd ym5, ym2, ym11 ; a1 b1
+ mova ym2, ym3
+ vpdpwssd ym5, ym3, ym12 ; a2 b2
+ vpsraw ym3{k1}, ym0, 2 ; 7 8
+ vpermb ym3, ym15, ym3 ; 67 78
+ vpdpwssd ym5, ym3, ym13 ; a3 b3
+ packuswb ym5, ym5
+ vpermb ym5, ym14, ym5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ vpbroadcastd m9, [pd_32768]
+ punpcklbw m0, m8, m0
+ lea ss3q, [ssq*3]
+ psraw m0, 2 ; << 6
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ cmp wd, 8
+ jne .hv_w16
+ mov r6, srcq
+ sub r6, ss3q
+ movu xmm1, [r6+ssq*0]
+ vinserti128 ymm1, [r6+ssq*1], 1
+ movu xmm2, [srcq+ssq*1]
+ vinserti32x4 m6, zmm1, [r6+ssq*2], 2
+ vinserti128 ymm2, [srcq+ssq*2], 1
+ vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3
+ add srcq, ss3q
+ vbroadcasti32x4 m4, [subpel_h_shufA]
+ vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
+ vbroadcasti32x4 m7, [subpel_h_shufB]
+ vbroadcasti32x4 m17, [subpel_h_shufC]
+ pshufb m1, m6, m4 ; 0 1 2 3 0123
+ mova m2, m8
+ vpdpbusd m2, m1, m10
+ pshufb m5, m6, m7 ; 0 1 2 3 4567
+ mova m1, m8
+ vpdpbusd m1, m5, m10
+ pshufb m4, m0, m4 ; 4 5 6 _ 0123
+ mova m3, m8
+ vpdpbusd m3, m4, m10
+ pshufb m7, m0, m7 ; 4 5 6 _ 4567
+ mova m4, m8
+ vpdpbusd m4, m7, m10
+ pshufb m6, m17
+ vpdpbusd m2, m5, m11
+ vpdpbusd m1, m6, m11
+ pshufb m6, m0, m17
+ vpdpbusd m3, m7, m11
+ vpdpbusd m4, m6, m11
+ mova m5, [spel_hv_perm8a]
+ mova m0, [spel_hv_perm8b]
+ mov r6, 0x55555555ff00
+ packssdw m2, m1
+ packssdw m3, m4
+ mova m18, [spel_hv_perm8c]
+ psraw m2, 2 ; 0 1 2 3
+ psraw m3, 2 ; 4 5 6 _
+ vpermb m1, m5, m2 ; 01 12
+ vbroadcasti32x8 m6, [subpel_h_shufA]
+ kmovq k1, r6
+ vpermt2b m2, m0, m3 ; 23 34
+ vbroadcasti32x8 m7, [subpel_h_shufB]
+ kshiftrq k2, k1, 16
+ mova xm16, [spel_hv_end]
+ vpermb m3, m5, m3 ; 45 56
+.hv_w8_loop:
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m4{k1}, [srcq+ssq*0]
+ mova m0, m9
+ vpdpwssd m0, m1, m12 ; a0 b0
+ pshufb m1, m4, m6 ; 7 8 0123 4567
+ mova m5, m8
+ vpdpbusd m5, m1, m10
+ pshufb m4, m7 ; 7 8 4567 89ab
+ vpdpwssd m0, m2, m13 ; a1 b1
+ mova m1, m2
+ vpdpbusd m5, m4, m11
+ mova m2, m3
+ vpdpwssd m0, m3, m14 ; a2 b2
+ psraw m3{k2}, m5, 2 ; 75 86
+ vpermb m3, m18, m3 ; 67 78
+ vpdpwssd m0, m3, m15 ; a3 b3
+ packuswb m0, m0
+ vpermb zmm1, m16, m0
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16:
+ movu m7, [spel_hv_perm16a]
+ sub srcq, ss3q
+ mova m20, [spel_hv_perm16b]
+ lea r6d, [wq*2-32]
+ mova m21, [spel_hv_perm16c]
+ mov r4, srcq
+ mov r7, dstq
+ mova ym16, [spel_hv_end16]
+ lea r6d, [hq+r6*8]
+.hv_w16_loop0:
+ movu ym17, [srcq+ssq*0]
+ vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1
+ movu ym18, [srcq+ssq*2]
+ add srcq, ss3q
+ vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3
+ movu ym19, [srcq+ssq*1]
+ vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5
+ add srcq, ss3q
+ vpermb m2, m7, m17 ; 0 1 0123 89ab
+ vpermb m0, m20, m17 ; 0 1 4567 cdef
+ vpermb m4, m7, m18 ; 2 3 0123 89ab
+ mova m1, m8
+ vpdpbusd m1, m2, m10
+ vpermb m5, m20, m18 ; 2 3 4567 cdef
+ mova m2, m8
+ vpdpbusd m2, m0, m10
+ vpermb m17, m21, m17 ; 0 1 89ab ghij
+ mova m3, m8
+ vpdpbusd m3, m4, m10
+ vpermb m6, m7, m19 ; 4 5 0123 89ab
+ mova m4, m8
+ vpdpbusd m4, m5, m10
+ vpermb m18, m21, m18 ; 2 3 89ab ghij
+ vpdpbusd m1, m0, m11
+ movu ym0, [srcq+ssq*0] ; 6
+ vpdpbusd m2, m17, m11
+ vpermb m17, m20, m19 ; 4 5 4567 cdef
+ vpdpbusd m3, m5, m11
+ mova m5, m8
+ vpdpbusd m5, m6, m10
+ mova m6, m8
+ vpdpbusd m6, m17, m10
+ vpdpbusd m4, m18, m11
+ mova m18, [spel_hv_perm16d]
+ vpermb m18, m18, m0 ; 6 0145 2367 89cd abef
+ vpdpbusd m5, m17, m11
+ vpermb m19, m21, m19 ; 4 5 89ab ghij
+ mova m17, m8
+ vpdpbusd m17, m18, m10
+ mova m18, [spel_hv_perm16e]
+ vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij
+ packssdw m1, m2 ; 01
+ vpdpbusd m6, m19, m11
+ packssdw m3, m4 ; 23
+ vpdpbusd m17, m0, m11
+ psraw m1, 2
+ packssdw m5, m6 ; 45
+ psraw m3, 2
+ vpshrdd m2, m1, m3, 16 ; 12
+ psraw m5, 2
+ vpshrdd m4, m3, m5, 16 ; 34
+ psraw m17, 2
+ vpshrdd m6, m5, m17, 16 ; 56
+.hv_w16_loop:
+ movu ym18, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m18, [srcq+ssq*0], 1
+ mova m0, m9
+ vpdpwssd m0, m1, m12 ; a0
+ vpermb m1, m7, m18 ; 7 8 0123 89ab
+ mova m17, m9
+ vpdpwssd m17, m2, m12 ; b0
+ vpermb m2, m20, m18 ; 7 8 4567 cdef
+ mova m19, m8
+ vpdpbusd m19, m1, m10
+ vpermb m18, m21, m18
+ mova m1, m8
+ vpdpbusd m1, m2, m10
+ vpdpwssd m0, m3, m13 ; a1
+ vpdpwssd m17, m4, m13 ; b1
+ vpdpbusd m19, m2, m11
+ mova m2, m4
+ vpdpbusd m1, m18, m11
+ mova m4, m6
+ vpdpwssd m0, m5, m14 ; a2
+ vpdpwssd m17, m6, m14 ; b2
+ packssdw m19, m1
+ mova m1, m3
+ mova m3, m5
+ psraw m6, m19, 2 ; 7 8
+ vpshrdd m5, m4, m6, 16 ; 6 7
+ vpdpwssd m17, m6, m15 ; b3
+ vpdpwssd m0, m5, m15 ; a3
+ packuswb m0, m17
+ vpermb zmm1, m16, m0
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ vzeroupper
+ RET
+
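+; PREP_8TAP_H filters two rows of source bytes (m0/m1) horizontally with the
+; 8-tap kernel in m8/m9: vpermb gathers the tap groups, vpdpbusd accumulates,
+; and the result is rounded to the 16-bit intermediate format. In scalar terms
+; this is roughly tmp[x] = (sum of the 8 taps around x + 2) >> 2.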
+%macro PREP_8TAP_H 0
+ vpermb m10, m5, m0
+ vpermb m11, m5, m1
+ vpermb m12, m6, m0
+ vpermb m13, m6, m1
+ vpermb m14, m7, m0
+ vpermb m15, m7, m1
+ mova m0, m4
+ vpdpbusd m0, m10, m8
+ mova m2, m4
+ vpdpbusd m2, m12, m8
+ mova m1, m4
+ vpdpbusd m1, m11, m8
+ mova m3, m4
+ vpdpbusd m3, m13, m8
+ vpdpbusd m0, m12, m9
+ vpdpbusd m2, m14, m9
+ vpdpbusd m1, m13, m9
+ vpdpbusd m3, m15, m9
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pd_2]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ vbroadcasti128 ym5, [subpel_h_shufA]
+ mov r3d, 0x4
+ dec srcq
+ vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ kmovb k1, r3d
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm2, [srcq+strideq*0]
+ movq xm3, [srcq+strideq*1]
+ vpbroadcastq ym2{k1}, [srcq+strideq*2]
+ vpbroadcastq ym3{k1}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pshufb ym2, ym5
+ pshufb ym3, ym5
+ mova ym0, ym4
+ vpdpbusd ym0, ym2, ym6
+ mova ym1, ym4
+ vpdpbusd ym1, ym3, ym6
+ packssdw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti128 m5, [subpel_h_shufA]
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu xmm3, [srcq+strideq*0]
+ vinserti128 ym3, ymm3, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*2], 2
+ vinserti128 m3, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ mova m0, m4
+ vpdpbusd m0, m1, m8
+ mova m1, m4
+ vpdpbusd m1, m2, m8
+ pshufb m3, m7
+ vpdpbusd m0, m2, m9
+ vpdpbusd m1, m3, m9
+ packssdw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m5, [spel_h_perm16a]
+ mova m6, [spel_h_perm16b]
+ mova m7, [spel_h_perm16c]
+ lea stride3q, [strideq*3]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+.h_w32_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ xor r6d, r6d
+ jmp .h_start
+.h_w128:
+ mov r6, -64*1
+.h_start:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu m0, [srcq+r6+32*0]
+ movu m1, [srcq+r6+32*1]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ add r6, 64
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only; having
+ tzcnt wd, wd
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ lea myq, [r7+myq*8+subpel_filters-prep_avx512icl]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
+ add wq, r7
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ jmp wq
+.v_w4:
+ movd xmm0, [srcq+strideq*0]
+ vpbroadcastd ymm1, [srcq+strideq*2]
+ vpbroadcastd xmm2, [srcq+strideq*1]
+ vpbroadcastd ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd ymm0, [srcq+strideq*0]
+ vpbroadcastd ymm2, [srcq+strideq*1]
+ vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vbroadcasti128 ymm5, [deint_shuf4]
+ vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34
+ vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw ymm2, ymm3 ; 23 34 45 56
+.v_w4_loop:
+ pinsrd xmm0, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ymm3, [srcq+strideq*0]
+ vpbroadcastd ymm4, [srcq+strideq*1]
+ vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _
+ vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb ymm3, ymm5 ; 67 78 89 9a
+ pmaddubsw ymm4, ymm1, ym8
+ vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78
+ pmaddubsw ymm2, ym9
+ paddw ymm4, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym11
+ paddw ymm3, ymm4
+ pmaddubsw ymm4, ymm1, ym10
+ paddw ymm3, ymm4
+ pmulhrsw ymm3, ym7
+ mova [tmpq], ymm3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ mov r3d, 0xf044
+ kmovw k1, r3d
+ kshiftrw k2, k1, 8
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq ym1, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpbroadcastq m6, [srcq+strideq*2]
+ vmovdqa64 ym0{k1}, ym1
+ vmovdqa64 ym1{k1}, ym2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ punpcklbw ym0, ym1 ; 01 12 __ __
+ punpcklbw m2, m3 ; 23 34 23 34
+ punpcklbw m4, m5 ; 45 56 45 56
+ vmovdqa64 m0{k2}, m2 ; 01 12 23 34
+ vmovdqa64 m2{k2}, m4 ; 23 34 45 56
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m15, m2, m9
+ vpblendmq m0{k1}, m6, m1
+ vpblendmq m2{k1}, m1, m3
+ vpbroadcastq m6, [srcq+strideq*2]
+ paddw m14, m15
+ punpcklbw m2, m0, m2 ; 67 78 67 78
+ vpblendmq m12{k1}, m3, m5
+ vpblendmq m13{k1}, m5, m6
+ vpblendmq m0{k2}, m4, m2 ; 45 56 67 78
+ punpcklbw m4, m12, m13 ; 89 9a 89 9a
+ vmovdqa64 m2{k2}, m4 ; 67 78 89 9a
+ pmaddubsw m12, m0, m10
+ pmaddubsw m13, m2, m11
+ paddw m14, m12
+ paddw m14, m13
+ pmulhrsw m14, m7
+ mova [tmpq], m14
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mov r3d, 0xf0
+ kmovb k1, r3d
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ vmovdqa64 m0{k1}, m1
+ vmovdqa64 m1{k1}, m2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
+ shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
+ shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
+ shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
+ punpckhbw m2, m0, m1 ; 23a 23b 34a 34b
+ punpcklbw m0, m1 ; 01a 01b 12a 12b
+ punpcklbw m4, m5 ; 45a 45b 56a 56b
+.v_w16_loop:
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m5, [srcq+strideq*0]
+ vpblendmq m1{k1}, m6, m3
+ vmovdqa64 m3{k1}, m5
+ pmaddubsw m12, m0, m8
+ pmaddubsw m13, m2, m8
+ pmaddubsw m14, m2, m9
+ pmaddubsw m15, m4, m9
+ pmaddubsw m0, m4, m10
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ paddw m12, m14
+ paddw m13, m15
+ paddw m12, m0
+ vmovdqa64 m5{k1}, m2
+ vmovdqa64 m2{k1}, m6
+ mova m0, m4
+ shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
+ shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
+ punpcklbw m2, m1, m3 ; 67a 67b 78a 78b
+ punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab
+ pmaddubsw m14, m2, m10
+ pmaddubsw m15, m2, m11
+ paddw m13, m14
+ paddw m12, m15
+ pmaddubsw m14, m4, m11
+ paddw m13, m14
+ pmulhrsw m12, m7
+ pmulhrsw m13, m7
+ mova [tmpq+ 0], m12
+ mova [tmpq+64], m13
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m18, [bilin_v_perm64]
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ movu ym3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym4, [srcq+strideq*0]
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym6, [srcq+strideq*0]
+ vpermq m0, m18, m0
+ vpermq m1, m18, m1
+ vpermq m2, m18, m2
+ vpermq m3, m18, m3
+ vpermq m4, m18, m4
+ vpermq m5, m18, m5
+ vpermq m6, m18, m6
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+ punpcklbw m3, m4
+ punpcklbw m4, m5
+ punpcklbw m5, m6
+.v_w32_loop:
+ movu ym12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym13, [srcq+strideq*0]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m16, m2, m9
+ pmaddubsw m15, m1, m8
+ pmaddubsw m17, m3, m9
+ mova m0, m2
+ mova m1, m3
+ vpermq m12, m18, m12
+ vpermq m13, m18, m13
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m4, m10
+ pmaddubsw m17, m5, m10
+ punpcklbw m6, m12
+ punpcklbw m12, m13
+ mova m2, m4
+ mova m3, m5
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m6, m11
+ pmaddubsw m17, m12, m11
+ mova m4, m6
+ mova m5, m12
+ paddw m14, m16
+ paddw m15, m17
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova m6, m13
+ mova [tmpq+ 0], m14
+ mova [tmpq+64], m15
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+ mov wd, 64
+ jmp .v_start
+.v_w128:
+ mov wd, 128
+.v_start:
+ WIN64_SPILL_XMM 27
+ mova m26, [bilin_v_perm64]
+ lea r6d, [hq+wq*2]
+ mov r5, srcq
+ mov r7, tmpq
+.v_loop0:
+ vpermq m0, m26, [srcq+strideq*0]
+ vpermq m1, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m2, m26, [srcq+strideq*0]
+ vpermq m3, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m4, m26, [srcq+strideq*0]
+ vpermq m5, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m6, m26, [srcq+strideq*0]
+ punpckhbw m12, m0, m1
+ punpcklbw m0, m1
+ punpckhbw m13, m1, m2
+ punpcklbw m1, m2
+ punpckhbw m14, m2, m3
+ punpcklbw m2, m3
+ punpckhbw m15, m3, m4
+ punpcklbw m3, m4
+ punpckhbw m16, m4, m5
+ punpcklbw m4, m5
+ punpckhbw m17, m5, m6
+ punpcklbw m5, m6
+.v_loop:
+ vpermq m18, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m19, m26, [srcq+strideq*0]
+ pmaddubsw m20, m0, m8
+ pmaddubsw m21, m12, m8
+ pmaddubsw m22, m1, m8
+ pmaddubsw m23, m13, m8
+ mova m0, m2
+ mova m12, m14
+ mova m1, m3
+ mova m13, m15
+ pmaddubsw m2, m9
+ pmaddubsw m14, m9
+ pmaddubsw m3, m9
+ pmaddubsw m15, m9
+ punpckhbw m24, m6, m18
+ punpcklbw m6, m18
+ paddw m20, m2
+ paddw m21, m14
+ paddw m22, m3
+ paddw m23, m15
+ mova m2, m4
+ mova m14, m16
+ mova m3, m5
+ mova m15, m17
+ pmaddubsw m4, m10
+ pmaddubsw m16, m10
+ pmaddubsw m5, m10
+ pmaddubsw m17, m10
+ punpckhbw m25, m18, m19
+ punpcklbw m18, m19
+ paddw m20, m4
+ paddw m21, m16
+ paddw m22, m5
+ paddw m23, m17
+ mova m4, m6
+ mova m16, m24
+ mova m5, m18
+ mova m17, m25
+ pmaddubsw m6, m11
+ pmaddubsw m24, m11
+ pmaddubsw m18, m11
+ pmaddubsw m25, m11
+ paddw m20, m6
+ paddw m21, m24
+ paddw m22, m18
+ paddw m23, m25
+ pmulhrsw m20, m7
+ pmulhrsw m21, m7
+ pmulhrsw m22, m7
+ pmulhrsw m23, m7
+ mova m6, m19
+ mova [tmpq+wq*0+ 0], m20
+ mova [tmpq+wq*0+64], m21
+ mova [tmpq+wq*2+ 0], m22
+ mova [tmpq+wq*2+64], m23
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 64
+ add r7, 128
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ tzcnt wd, wd
+ vpbroadcastd m8, [pd_2]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
+ vpbroadcastd m9, [pd_32]
+ add wq, r7
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp wq
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mov r3d, 0x04
+ kmovb k1, r3d
+ kshiftlb k2, k1, 2
+ kshiftlb k3, k1, 4
+ vpbroadcastd m10, [pd_2]
+ vbroadcasti128 m16, [subpel_h_shufA]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ movq xm3, [srcq+strideq*0]
+ vpbroadcastq ym2, [srcq+strideq*1]
+ vpbroadcastq ym3{k1}, [srcq+strideq*2]
+ vpbroadcastq m2{k2}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3{k2}, [srcq+strideq*0]
+ vpbroadcastq m2{k3}, [srcq+strideq*1]
+ vpbroadcastq m3{k3}, [srcq+strideq*2]
+ mova m17, [spel_hv_perm4a]
+ movu m18, [spel_hv_perm4b]
+ mova m0, m10
+ mova m1, m10
+ pshufb m2, m16
+ pshufb m3, m16
+ vpdpbusd m0, m2, m8
+ vpdpbusd m1, m3, m8
+ packssdw m0, m1 ; _ 0 1 2 3 4 5 6
+ psraw m0, 2
+ vpermb m1, m17, m0 ; 01 12 23 34
+ vpermb m2, m18, m0 ; 23 34 45 56
+.hv_w4_loop:
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movq xm4, [srcq+strideq*0]
+ vpbroadcastq ym3{k1}, [srcq+strideq*1]
+ vpbroadcastq ym4{k1}, [srcq+strideq*2]
+ mova ym5, ym10
+ mova ym6, ym10
+ pshufb ym3, ym16
+ pshufb ym4, ym16
+ vpdpbusd ym5, ym3, ym8
+ vpdpbusd ym6, ym4, ym8
+ mova m7, m11
+ packssdw ym5, ym6 ; 7 8 9 a _ _ _ _
+ psraw ym5, 2
+ valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a
+ vpdpwssd m7, m1, m12
+ vpdpwssd m7, m2, m13
+ vpermb m1, m17, m0 ; 45 56 67 78
+ vpermb m2, m18, m0 ; 67 78 89 9a
+ vpdpwssd m7, m1, m14
+ vpdpwssd m7, m2, m15
+ psrad m7, 6
+ vpmovdw [tmpq], m7
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ WIN64_SPILL_XMM 24
+ vbroadcasti128 m16, [subpel_h_shufA]
+ vbroadcasti128 m17, [subpel_h_shufB]
+ vbroadcasti128 m18, [subpel_h_shufC]
+ vinserti128 ym0, [srcq+strideq*0], 1
+ vinserti128 m0, [srcq+strideq*1], 2
+ vinserti128 m0, [srcq+strideq*2], 3
+ movu xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym1, [srcq+strideq*0], 1
+ vinserti128 m1, [srcq+strideq*1], 2
+ vinserti128 m1, [srcq+strideq*2], 3
+ mova m2, m8
+ mova m4, m8
+ mova m3, m8
+ mova m5, m8
+ pshufb m20, m0, m16
+ pshufb m21, m0, m17
+ pshufb m22, m0, m18
+ pshufb m23, m1, m16
+ pshufb m6, m1, m17
+ pshufb m7, m1, m18
+ vpdpbusd m2, m20, m10
+ vpdpbusd m4, m21, m10
+ vpdpbusd m2, m21, m11
+ vpdpbusd m4, m22, m11
+ vpdpbusd m3, m23, m10
+ vpdpbusd m5, m6, m10
+ vpdpbusd m3, m6, m11
+ vpdpbusd m5, m7, m11
+ packssdw m2, m4
+ packssdw m3, m5
+ psraw m2, 2 ; _ 0 1 2
+ psraw m3, 2 ; 3 4 5 6
+ valignq m0, m3, m2, 2 ; 0 1 2 3
+ valignq m1, m3, m2, 4 ; 1 2 3 4
+ valignq m2, m3, m2, 6 ; 2 3 4 5
+ punpcklwd m4, m0, m1 ; 01a 12a 23a 34a
+ punpckhwd m5, m0, m1 ; 01b 12b 23b 34b
+ punpcklwd m6, m2, m3 ; 23a 34a 45a 56a
+ punpckhwd m7, m2, m3 ; 23b 34b 45b 56b
+.hv_w8_loop:
+ movu xm19, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym19, [srcq+strideq*0], 1
+ vinserti128 m19, [srcq+strideq*1], 2
+ vinserti128 m19, [srcq+strideq*2], 3
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m4, m12
+ vpdpwssd m21, m5, m12
+ vpdpwssd m20, m6, m13
+ vpdpwssd m21, m7, m13
+ pshufb m0, m19, m16
+ pshufb m1, m19, m17
+ pshufb m2, m19, m18
+ vpdpbusd m22, m0, m10
+ vpdpbusd m23, m1, m10
+ vpdpbusd m22, m1, m11
+ vpdpbusd m23, m2, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7 8 9 A
+ valignq m0, m22, m3, 2 ; 4 5 6 7
+ valignq m1, m22, m3, 4 ; 5 6 7 8
+ valignq m2, m22, m3, 6 ; 6 7 8 9
+ mova m3, m22
+ punpcklwd m4, m0, m1 ; 45a 56a 67a 78a
+ punpckhwd m5, m0, m1 ; 45b 56b 67b 78b
+ punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa
+ punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq], m20
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mov wd, 16*2
+ jmp .hv_start
+.hv_w32:
+ mov wd, 32*2
+ jmp .hv_start
+.hv_w64:
+ mov wd, 64*2
+ jmp .hv_start
+.hv_w128:
+ mov wd, 128*2
+.hv_start:
+ WIN64_SPILL_XMM 31
+ mova m16, [spel_h_perm16a]
+ mova m17, [spel_h_perm16b]
+ mova m18, [spel_h_perm16c]
+ lea r6d, [hq+wq*8-256]
+ mov r5, srcq
+ mov r7, tmpq
+.hv_loop0:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym1, [srcq+strideq*0]
+ vinserti32x8 m1, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ vinserti32x8 m2, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym3, [srcq+strideq*0]
+ mova m4, m8
+ mova m5, m8
+ mova m6, m8
+ mova m7, m8
+ vpermb m19, m16, m0
+ vpermb m20, m17, m0
+ vpermb m21, m18, m0
+ vpermb m22, m16, m1
+ vpermb m23, m17, m1
+ vpermb m24, m18, m1
+ vpermb m25, m16, m2
+ vpermb m26, m17, m2
+ vpermb m27, m18, m2
+ vpermb ym28, ym16, ym3
+ vpermb ym29, ym17, ym3
+ vpermb ym30, ym18, ym3
+ mova m0, m8
+ mova m1, m8
+ mova ym2, ym8
+ mova ym3, ym8
+ vpdpbusd m4, m19, m10
+ vpdpbusd m5, m20, m10
+ vpdpbusd m6, m22, m10
+ vpdpbusd m7, m23, m10
+ vpdpbusd m0, m25, m10
+ vpdpbusd m1, m26, m10
+ vpdpbusd ym2, ym28, ym10
+ vpdpbusd ym3, ym29, ym10
+ vpdpbusd m4, m20, m11
+ vpdpbusd m5, m21, m11
+ vpdpbusd m6, m23, m11
+ vpdpbusd m7, m24, m11
+ vpdpbusd m0, m26, m11
+ vpdpbusd m1, m27, m11
+ vpdpbusd ym2, ym29, ym11
+ vpdpbusd ym3, ym30, ym11
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw ym2, ym3
+ psraw m4, 2 ; 0a 0b 1a 1b
+ psraw m6, 2 ; 2a 2b 3a 3b
+ psraw m0, 2 ; 4a 4b 5a 5b
+ psraw ym2, 2 ; 6a 6b __ __
+ vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b
+ vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b
+ vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b
+ punpcklwd m2, m4, m5 ; 01a 01c 12a 12c
+ punpckhwd m3, m4, m5 ; 01b 01d 12b 12d
+ punpcklwd m4, m6, m7 ; 23a 23c 34a 34c
+ punpckhwd m5, m6, m7 ; 23b 23d 34b 34d
+ punpcklwd m6, m0, m1 ; 45a 45c 56a 56c
+ punpckhwd m7, m0, m1 ; 45b 45d 56b 56d
+.hv_loop:
+ movu ym19, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m19, [srcq+strideq*0], 1
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m2, m12
+ vpdpwssd m21, m3, m12
+ vpdpwssd m20, m4, m13
+ vpdpwssd m21, m5, m13
+ vpermb m24, m16, m19
+ vpermb m25, m17, m19
+ vpermb m26, m18, m19
+ vpdpbusd m22, m24, m10
+ vpdpbusd m23, m25, m10
+ vpdpbusd m22, m25, m11
+ vpdpbusd m23, m26, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7a 7b 8a 8b
+ vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b
+ mova m2, m4
+ mova m3, m5
+ mova m1, m22
+ mova m4, m6
+ mova m5, m7
+ punpcklwd m6, m0, m1 ; 67a 67c 78a 78c
+ punpckhwd m7, m0, m1 ; 67b 67d 78b 78d
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq+wq*0], ym20
+ vextracti32x8 [tmpq+wq*1], m20, 1
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_loop0
+ RET
+
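+; Affine warp of an 8x8 block: the horizontal filter position starts at mx and
+; advances by alpha per column and beta per row, the vertical position starts
+; at my and advances by gamma/delta, each step selecting a per-pixel 8-tap
+; kernel from mc_warp_filter. warp_affine_8x8t keeps the 16-bit intermediate
+; (compound) result, warp_affine_8x8 rounds and packs it down to pixels.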
+cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
+ vpbroadcastd m9, [pd_16384]
+ mova ym15, [warp_8x8t_end]
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
+ jmp .start
+.loop:
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
+ lea tmpq, [tmpq+tsq*4]
+.start:
+ paddd m16, m16
+ vpermb m16, m15, m16
+ mova [tmpq+tsq*0], xm16
+ vextracti128 [tmpq+tsq*2], ym16, 1
+ sub r6d, 0x1800
+ jg .loop
+ RET
+
+cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
+ vpbroadcastd m9, [pd_262144]
+ mova xm15, [warp_8x8_end]
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m16, 19
+ packuswb m16, m16
+ vpermb m16, m15, m16
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ sub r6d, 0x1800
+ jg .loop
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m1, [pd_512]
+%if WIN64
+ mov abcdq, r5mp
+ vpaddd ym18, ym1, r6m {1to8} ; mx
+%else
+ add r5d, 512
+ vpbroadcastd ym18, r5d
+%endif
+ vpaddd ym20, ym1, r7m {1to8} ; my
+ mova ym16, [pd_0to7]
+ vpbroadcastd ym19, [abcdq+4*0]
+ vpbroadcastd ym21, [abcdq+4*1]
+ lea r4, [ssq*3+3]
+ mova m10, [warp_8x8_permA]
+ mov r6d, 0x5555
+ mova m11, [warp_8x8_permB]
+ lea filterq, [mc_warp_filter+64*8]
+ vpbroadcastq m12, [warp_8x8_hpack]
+ sub srcq, r4 ; src -= src_stride*3 + 3
+ vbroadcasti32x4 m13, [warp_8x8_permC]
+ kxnorb k2, k2, k2
+ vbroadcasti32x4 m14, [warp_8x8_permD]
+ vpdpwssd ym18, ym19, ym16 ; alpha
+ vpdpwssd ym20, ym21, ym16 ; gamma
+ vbroadcasti32x4 m0, [srcq]
+ psrad ym19, 16 ; beta
+ psrad ym21, 16 ; delta
+ kmovw k1, r6d
+ psrad ym16, ym18, 10
+ kmovb k3, k2
+ paddd ym18, ym19
+ vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0
+ psrld m1, 8 ; pd_2
+ pshufb m0, m11
+ paddd m8, m1, m1 ; pd_4
+ vpdpbusd m1, m0, m2
+ call .h
+ psllq m2, m1, 45
+ pslld m1, 13
+ paddd m1, m2
+ vpshrdq m1, m0, 48 ; 01 12
+ call .h
+ vpshrdq m2, m1, m0, 48 ; 23 34
+ call .h
+ vpshrdq m3, m2, m0, 48 ; 45 56
+.main2:
+ call .h
+ psrad ym17, ym20, 10
+ kmovb k2, k3
+ paddd ym20, ym21
+ vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0
+ psrad ym16, ym20, 10
+ kmovb k3, k2
+ paddd ym20, ym21
+ vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1
+ shufps m5, m7, m17, q2020 ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
+ mova m16, m9
+ pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1
+ vpdpwssd m16, m1, m4
+ pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3
+ mova m1, m2
+ vpdpwssd m16, m2, m5
+ shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
+ mova m2, m3
+ pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5
+ vpdpwssd m16, m3, m4
+ vpshrdq m3, m0, 48 ; 67 78
+ pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7
+ vpdpwssd m16, m3, m5
+ ret
+ALIGN function_align
+.h:
+ movu xm5, [srcq+ssq*1]
+ psrad ym16, ym18, 10
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym5, [srcq+ssq*0], 1
+ kmovb k2, k3
+ paddd ym18, ym19
+ vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1
+ psrad ym17, ym18, 10
+ kmovb k3, k2
+ paddd ym18, ym19
+ vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2
+ mova m0, m8
+ vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7
+ vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3
+ vpdpbusd m0, m4, m17
+ vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb
+ vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7
+ vpdpbusd m0, m5, m16
+ vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3)
+ ret
+
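+; BIDIR_FN provides the store/loop logic shared by avg, w_avg and mask: each
+; invocation of the %1 op macro leaves a register of packed output pixels in
+; m0, and this wrapper writes them out according to the block width.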
+%macro BIDIR_FN 1 ; op
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM %1 0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_ret:
+ RET
+.w4_h16:
+ vpbroadcastd m7, strided
+ pmulld m7, [bidir_sctr_w4]
+ %1 0
+ kxnorw k1, k1, k1
+ vpscatterdd [dstq+m7]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM %1 0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ %1 0
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq ], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w16:
+ %1 0
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m7, [pb_02461357]
+.w32_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m7, [pb_02461357]
+.w64_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m7, [pb_02461357]
+.w128_loop:
+ %1 0
+ vpermq m6, m7, m0
+ %1 2
+ mova [dstq+64*0], m6
+ %1_INC_PTR 4
+ vpermq m6, m7, m0
+ mova [dstq+64*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ paddw m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
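+; avg: dst = (tmp1 + tmp2 + 16) >> 5, clamped to pixel range; the pmulhrsw
+; with pw_1024 is the rounded shift by 5.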
+cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx512icl_table
+ lea r6, [avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
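+ ; e.g. a = 100, b = 60, weight = 12:
+ ; (100*12 + 60*4 + 128) >> 8 = 6, and ((((60-100) * (-12 << 12)) >> 16) + 60 + 8) >> 4 = 6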
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ psubw m2, m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ psubw m3, m1, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg_avx512icl_table
+ lea r6, [w_avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
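+ ; -m << 10 does not fit in a signed word, so it is applied below as
+ ; ((b - a) << 1) * (-m << 9)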
+%if mmsize == 64
+ vpermq m3, m8, [maskq+%1*32]
+%else
+ vpermq m3, [maskq+%1*16], q3120
+%endif
+ mova m0, [tmp2q+(%1+0)*mmsize]
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*mmsize]
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*32
+ add tmp2q, %1*64
+ add tmp1q, %1*64
+%endmacro
+
+cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx512icl_table
+ lea r7, [mask_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ pxor m4, m4
+ mova m8, [base+bilin_v_perm64]
+ vpbroadcastd m5, [base+pw_2048]
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
+ mova m%1, [tmp1q+mmsize*%3]
+ mova m1, [tmp2q+mmsize*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+mmsize*%4]
+ mova m2, [tmp2q+mmsize*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ vpshldw m%2, m3, 8
+ psllw m3, m%2, 10
+%if %5
+ psubb m%2, m5, m%2
+%endif
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
+
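+; w_mask_NNN blends the two intermediate sources with weights m and 64 - m,
+; where m = min(38 + ((abs(tmp1 - tmp2) + 8) >> 8), 64); the pw_6903 saturating
+; subtract yields 64 - m directly. The 444 variant stores m per pixel, while
+; 422 and 420 fold each 2x1 pair or 2x2 block of mask values into one entry
+; (with a sign-dependent rounding from wm_sign).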
+cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pb_m64] ; -1 << 6
+ mova ym10, [base+wm_420_mask+32]
+ vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m5, [wm_420_perm4]
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm4+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ vpermb ym8, ym10, ym8
+ movq [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m11, strided
+ pmulld m11, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ vpdpbusd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vpscatterdd [dstq+m11]{k1}, m0
+ RET
+.w8:
+ mova m5, [wm_420_perm8]
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm8+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 16
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ vpermb m1, m10, m1
+ mova [maskq], xm1
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16:
+ mova m5, [wm_420_perm16]
+.w16_loop:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
+ psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
+.w64_loop:
+ W_MASK 0, 4, 0, 2
+ W_MASK 11, 5, 1, 3
+ mova m2, m8
+ vpdpbusd m2, m4, m9
+ mova m3, m8
+ vpdpbusd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ mova m1, m0
+ vpermt2q m0, m12, m11
+ vpermt2q m1, m13, m11
+ mova [maskq], ym2
+ add maskq, 32
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m14, [wm_420_perm64]
+ mova m10, [wm_420_mask]
+ psrlq m15, m14, 4
+.w128_loop:
+ W_MASK 0, 12, 0, 4
+ W_MASK 11, 13, 1, 5
+ mova m4, m8
+ vpdpbusd m4, m12, m9
+ mova m5, m8
+ vpdpbusd m5, m13, m9
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*1+64*0], m1
+ W_MASK 0, 12, 2, 6
+ W_MASK 11, 13, 3, 7
+ vprold m4, 16
+ vprold m5, 16
+ vpdpbusd m4, m12, m9
+ vpdpbusd m5, m13, m9
+ add tmp1q, 512
+ add tmp2q, 512
+ vpermt2b m4, m10, m5
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0+64*1], m0
+ mova [dstq+strideq*1+64*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+ lea r7, [w_mask_422_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pw_m128]
+ mova m10, [base+wm_422_mask]
+ vpbroadcastd m11, [base+pb_127]
+ add wq, r7
+ vpbroadcastd m8, [base+wm_sign+4+r6*4]
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ pand xm8, xm11
+ mova [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m5, strided
+ pmulld m5, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpdpwssd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ pand ym8, ym11
+ mova [maskq], ym8
+ vpscatterdd [dstq+m5]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ pand xm8, xm11
+ mova [maskq], xm8
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ pand ym1, ym11
+ mova [maskq], ym1
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ pand ym1, ym11
+ mova [maskq], ym1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m5, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m13, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ W_MASK 12, 5, 2, 3
+ mova m2, m8
+ vpdpwssd m2, m4, m9
+ mova m3, m8
+ vpdpwssd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ vpermq m0, m13, m0
+ vpermq m1, m13, m12
+ pand m2, m11
+ mova [maskq], m2
+ add maskq, 64
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+ lea r7, [w_mask_444_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ mova m8, [base+wm_444_mask]
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ RET
+.w4_h16:
+ vpbroadcastd m9, strided
+ pmulld m9, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ kxnorw k1, k1, k1
+ mova [maskq], m4
+ vpscatterdd [dstq+m9]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ mova [maskq], m4
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ vpermq m0, m0, q3120
+ mova [maskq], m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m9, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m9, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m11, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1, 1
+ W_MASK 10, 9, 2, 3, 1
+ vpermb m4, m8, m4
+ vpermb m9, m8, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermq m0, m11, m0
+ vpermq m10, m11, m10
+ mova [maskq+64*0], m4
+ mova [maskq+64*1], m9
+ add maskq, 128
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m10
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
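+; blend: dst = (dst * (64 - m) + tmp * m + 32) >> 6 with a per-pixel mask,
+; done as punpcklbw + pmaddubsw pairs and a rounding pmulhrsw by pw_512.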
+cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx512icl_table
+ lea r6, [blend_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn maskq, maskmp
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m6, [base+pb_64]
+ vpbroadcastd m7, [base+pw_512]
+ sub tmpq, maskq
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ vpbroadcastd xmm1, [dstq+dsq*2]
+ pinsrd xmm1, [dstq+r6 ], 3
+ mova xmm4, [maskq]
+ mova xmm5, [maskq+tmpq]
+ add maskq, 4*4
+ psubb xmm3, xm6, xmm4
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm3, xmm4
+ punpckhbw xmm1, xmm5
+ punpckhbw xmm3, xmm4
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm1, xmm3
+ pmulhrsw xmm0, xm7
+ pmulhrsw xmm1, xm7
+ packuswb xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ pextrd [dstq+dsq*2], xmm0, 2
+ pextrd [dstq+r6 ], xmm0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movq xmm0, [dstq+dsq*0]
+ vpbroadcastq xmm1, [dstq+dsq*1]
+ vpbroadcastq ymm2, [dstq+dsq*2]
+ vpbroadcastq ymm3, [dstq+r6 ]
+ mova ymm4, [maskq]
+ mova ymm5, [maskq+tmpq]
+ add maskq, 8*4
+ vpblendd ymm0, ymm2, 0x30
+ vpblendd ymm1, ymm3, 0xc0
+ psubb ymm3, ym6, ymm4
+ punpcklbw ymm0, ymm5
+ punpcklbw ymm2, ymm3, ymm4
+ punpckhbw ymm1, ymm5
+ punpckhbw ymm3, ymm4
+ pmaddubsw ymm0, ymm2
+ pmaddubsw ymm1, ymm3
+ pmulhrsw ymm0, ym7
+ pmulhrsw ymm1, ym7
+ packuswb ymm0, ymm1
+ vextracti128 xmm1, ymm0, 1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ movq [dstq+dsq*2], xmm1
+ movhps [dstq+r6 ], xmm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ vzeroupper
+ RET
+.w16:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ vinserti32x4 m1, [dstq+dsq*2], 2
+ mova m4, [maskq]
+ vinserti32x4 m1, [dstq+r6 ], 3
+ mova m5, [maskq+tmpq]
+ add maskq, 16*4
+ psubb m3, m6, m4
+ punpcklbw m0, m1, m5
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m5
+ punpckhbw m3, m4
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ vextracti32x4 [dstq+dsq*2], m0, 2
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ mova m4, [maskq]
+ mova m5, [maskq+tmpq]
+ add maskq, 32*2
+ psubb m3, m6, m4
+ punpcklbw m0, m1, m5
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m5
+ punpckhbw m3, m4
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32
+ RET
+
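+; blend_v uses the same blend arithmetic with a weight that depends only on
+; the column, taken from obmc_masks for the given block width.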
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx512icl_table
+ lea r5, [blend_v_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_avx512icl_table
+ jmp wq
+.w2:
+ vpbroadcastd xmm2, [maskq+2*2]
+.w2_s0_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrw xmm0, [dstq+dsq*1], 1
+ movd xmm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+.w4:
+ vpbroadcastq xmm2, [maskq+4*2]
+.w4_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova xmm3, [maskq+8*2]
+.w8_loop:
+ movq xmm0, [dstq+dsq*0]
+ vpbroadcastq xmm1, [dstq+dsq*1]
+ mova xmm2, [tmpq]
+ add tmpq, 8*2
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm3
+ pmaddubsw xmm1, xmm3
+ pmulhrsw xmm0, xm5
+ pmulhrsw xmm1, xm5
+ packuswb xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 ym3, [maskq+16*2]
+ vbroadcasti32x4 ym4, [maskq+16*3]
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ mova ym2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym4
+ pmulhrsw ym0, ym5
+ pmulhrsw ym1, ym5
+ packuswb ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ mova m4, [maskq+32*2]
+ vshufi32x4 m3, m4, m4, q2020
+ vshufi32x4 m4, m4, q3131
+.w32_loop:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+
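+; blend_h: same arithmetic with a per-row obmc_masks weight; only the first
+; h*3/4 rows are blended (see the setup below), the remainder is left as-is.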
+cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base r6-blend_h_avx512icl_table
+ lea r6, [blend_h_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea maskq, [base+obmc_masks+hq*2]
+ vpbroadcastd m5, [base+pw_512]
+ lea hd, [hq*3]
+ add wq, r6
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xmm0, [dstq+dsq*0]
+ pinsrw xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movd xmm1, [tmpq]
+ add tmpq, 2*2
+ punpcklwd xmm2, xmm2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova xmm3, [blend_shuf]
+.w4_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ pshufb xmm2, xmm3
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ vbroadcasti128 ymm4, [blend_shuf]
+ shufpd ymm4, ymm4, 0x03
+.w8_loop:
+ vpbroadcastq ymm1, [dstq+dsq*0]
+ movq xmm0, [dstq+dsq*1]
+ vpblendd ymm0, ymm1, 0x30
+ vpbroadcastd ymm3, [maskq+hq*2]
+ movq xmm1, [tmpq+8*1]
+ vinserti128 ymm1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ pshufb ymm3, ymm4
+ punpcklbw ymm0, ymm1
+ pmaddubsw ymm0, ymm3
+ pmulhrsw ymm0, ym5
+ vextracti128 xmm1, ymm0, 1
+ packuswb xmm0, xmm1
+ movhps [dstq+dsq*0], xmm0
+ movq [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ vzeroupper
+ RET
+.w16:
+ vbroadcasti32x4 ym4, [blend_shuf]
+ shufpd ym4, ym4, 0x0c
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ vpbroadcastd ym3, [maskq+hq*2]
+ mova ym2, [tmpq]
+ add tmpq, 16*2
+ pshufb ym3, ym4
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym3
+ pmulhrsw ym0, ym5
+ pmulhrsw ym1, ym5
+ packuswb ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+.w32:
+ vbroadcasti32x4 m4, [blend_shuf]
+ shufpd m4, m4, 0xf0
+.w32_loop:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ vpbroadcastd m3, [maskq+hq*2]
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ pshufb m3, m4
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ vpbroadcastw m3, [maskq+hq*2]
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m6, [maskq+hq*2]
+ mova m2, [dstq+64*0]
+ mova m1, [tmpq+64*0]
+ mova m3, [dstq+64*1]
+ mova m4, [tmpq+64*1]
+ add tmpq, 64*2
+ punpcklbw m0, m2, m1
+ punpckhbw m2, m1
+ pmaddubsw m0, m6
+ pmaddubsw m2, m6
+ punpcklbw m1, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ REPX {pmulhrsw x, m5}, m0, m2, m1, m3
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
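+; Horizontal resize: for each output pixel x the source position is
+; mx0 + x*dx in units of 1/16384 pixel; the top bits give the source offset
+; (clipped at the row edges, hence the gather + pshufb path) and bits 8-13
+; select one of 64 8-tap phases from resize_filter.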
+cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ mov r6, ~0
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ kmovq k3, r6
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pw_m256]
+ vpbroadcastd m7, [base+pd_63]
+ vbroadcasti32x4 m15, [base+pb_8x0_8x8]
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+ pslld m5, 4 ; dx*16
+ pslld m6, 14
+ pxor m2, m2
+ mova m16, [base+resize_permA]
+ mova m17, [base+resize_permB]
+ mova xm18, [base+resize_permC]
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k4, m1, m1
+ pand m9, m7 ; filter offset (masked)
+ ktestw k4, k4
+ jz .load
+ vextracti32x8 ym12, m0, 1
+ vextracti32x8 ym13, m1, 1
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdq m10{k1}, [srcq+ym0]
+ vpgatherdq m11{k2}, [srcq+ym12]
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdq m14{k1}, [base+resize_shuf+4+ym1]
+ vpgatherdq m0{k2}, [base+resize_shuf+4+ym13]
+ mova m12, m16
+ mova m13, m17
+ paddb m14, m15
+ paddb m0, m15
+ pshufb m10, m14
+ pshufb m11, m0
+ vpermi2d m12, m10, m11
+ vpermi2d m13, m10, m11
+ jmp .filter
+.load:
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdd m12{k1}, [srcq+m0+0]
+ vpgatherdd m13{k2}, [srcq+m0+4]
+.filter:
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdd m10{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m11{k2}, [base+resize_filter+m9*8+4]
+ mova m14, m2
+ vpdpbusd m14, m12, m10
+ vpdpbusd m14, m13, m11
+ packssdw m14, m14
+ pmulhrsw m14, m3
+ packuswb m14, m14
+ vpermd m14, m18, m14
+ mova [dstq+xq], xm14
+ paddd m4, m5
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64