diff options
Diffstat (limited to 'third_party/dav1d/src/x86/mc_avx512.asm')
-rw-r--r-- | third_party/dav1d/src/x86/mc_avx512.asm | 2953 |
1 files changed, 1971 insertions, 982 deletions
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm index f9043f1ad3..50e670ec25 100644 --- a/third_party/dav1d/src/x86/mc_avx512.asm +++ b/third_party/dav1d/src/x86/mc_avx512.asm @@ -89,55 +89,47 @@ wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 3 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 -bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 - db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 - db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 - db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47 -bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 - db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 - db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 - db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 -bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 - db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 - db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 - db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 -bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 - db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 - db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 - db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 -bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 - db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 - db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 - db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 -bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 -spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 
+bilin_h_perm16: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 + db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 +bilin_h_perm32: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 + db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 +bilin_v_perm8: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 + db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 + db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39 + db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71 +bilin_v_perm16: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 + db 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 + db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71 + db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79 +bilin_v_perm32: db 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71 + db 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79 + db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 + db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 +bilin_v_perm64: dd 0, 0, 4, 8, 1, 1, 5, 9, 2, 2, 6, 10, 3, 3, 7, 11 +spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 -spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 - db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 - db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 -spel_h_perm16c: db 8, 9, 10, 
11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 - db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 -spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +spel_h_perm32: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 -spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 - db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 - db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 -spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 - db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 -spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 +spel_v_perm8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 8, 16, 9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 + db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39 +spel_v_perm16a: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 +spel_v_perm16b: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 + db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 + db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 
29, 22, 30, 23, 31 spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 @@ -154,34 +146,20 @@ spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 2 db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 -spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55 - db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63 - db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71 - db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79 -spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 +spel_hv_perm8b: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 -spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 - db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 -spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 + db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 - db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52 - db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54 -spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40 - db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42 - db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48 - db 14, 15, 16, 17, 46, 
47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50 -spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 +spel_hv_perm16b:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18 -spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12 - db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14 - db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20 - db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22 +spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 + db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 @@ -189,15 +167,14 @@ subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 1 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 -bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 -bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_v_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 resize_permC: dd 0, 4, 
8, 12 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 wm_420_perm64: dq 0xfedcba9876543210 @@ -205,6 +182,8 @@ wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 pb_8x0_8x8: times 8 db 0 times 8 db 8 +pb_4: times 4 db 4 +pb_32: times 4 db 32 pb_127: times 4 db 127 pw_m128 times 2 dw -128 pw_m256: times 2 dw -256 @@ -216,7 +195,6 @@ pd_32: dd 32 pd_34: dd 34 pd_63: dd 63 pd_512: dd 512 -pd_32768: dd 32768 %define pb_m64 (wm_sign+4) %define pb_64 (wm_sign+8) @@ -289,8 +267,10 @@ BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 @@ -401,9 +381,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 - imul mxyd, 0xff01 - vbroadcasti128 m4, [bilin_h_shuf8] - add mxyd, 16 << 8 + imul mxyd, 255 + vbroadcasti128 m4, [bilin_h_perm16] + add mxyd, 16 vpbroadcastw m5, mxyd mov mxyd, r7m ; my test mxyd, mxyd @@ -526,9 +506,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] - imul mxyd, 0xff01 + imul mxyd, 255 vpbroadcastd m5, [pw_2048] - add mxyd, 16 << 8 + add mxyd, 16 add wq, r7 vpbroadcastw m4, mxyd jmp wq @@ -539,7 +519,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
h, mxy lea srcq, [srcq+ssq*2] pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xmm1, xmm1, q2301 ; 1 0 - punpcklbw xmm1, xmm0, xmm1 + punpcklbw xmm1, xmm0 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 packuswb xmm1, xmm1 @@ -552,11 +532,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w4: movd xmm0, [srcq+ssq*0] .v_w4_loop: - vpbroadcastd xmm1, [srcq+ssq*1] + vpbroadcastd xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1 + vpblendd xmm1, xmm2, xmm0, 0x01 ; 0 1 vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm1, xmm0, 0x02 ; 1 2 + vpblendd xmm2, xmm0, 0x02 ; 1 2 punpcklbw xmm1, xmm2 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 @@ -570,11 +550,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w8: movq xmm0, [srcq+ssq*0] .v_w8_loop: - movq xmm3, [srcq+ssq*1] + movq xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw xmm1, xmm3, xmm0 + punpcklbw xmm1, xmm0, xmm2 movq xmm0, [srcq+ssq*0] - punpcklbw xmm2, xmm0, xmm3 + punpcklbw xmm2, xmm0 pmaddubsw xmm1, xm4 pmaddubsw xmm2, xm4 pmulhrsw xmm1, xm5 @@ -589,11 +569,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w16: movu xmm0, [srcq+ssq*0] .v_w16_loop: - vbroadcasti128 ymm2, [srcq+ssq*1] + vbroadcasti128 ymm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1 + vpblendd ymm2, ymm3, ymm0, 0x0f ; 0 1 vbroadcasti128 ymm0, [srcq+ssq*0] - vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2 + vpblendd ymm3, ymm0, 0xf0 ; 1 2 punpcklbw ymm1, ymm2, ymm3 punpckhbw ymm2, ymm3 pmaddubsw ymm1, ym4 @@ -612,11 +592,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movu ym0, [srcq+ssq*0] kxnorb k1, k1, k1 .v_w32_loop: - vbroadcasti32x8 m2, [srcq+ssq*1] + vbroadcasti32x8 m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendmd m3{k1}, m2, m0 ; 0 1 + vpblendmd m2{k1}, m3, m0 ; 0 1 vbroadcasti32x8 m0, [srcq+ssq*0] - vpblendmd m2{k1}, m0, m2 ; 1 2 + vpblendmd m3{k1}, m0, m3 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 @@ 
-635,18 +615,18 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w64_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 - punpckhbw m6, m3, m0 + punpcklbw m1, m0, m3 + punpckhbw m6, m0, m3 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m6, m4 - punpcklbw m2, m0, m3 - punpckhbw m7, m0, m3 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 pmaddubsw m2, m4 - pmaddubsw m7, m4 - REPX {pmulhrsw x, m5}, m1, m6, m2, m7 + pmaddubsw m3, m4 + REPX {pmulhrsw x, m5}, m1, m6, m2, m3 packuswb m1, m6 - packuswb m2, m7 + packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] @@ -660,13 +640,13 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy add srcq, ssq movu m2, [srcq+64*0] movu m3, [srcq+64*1] - punpcklbw m6, m2, m0 + punpcklbw m6, m0, m2 pmaddubsw m6, m4 - punpckhbw m0, m2, m0 + punpckhbw m0, m2 pmaddubsw m0, m4 - punpcklbw m7, m3, m1 + punpcklbw m7, m1, m3 pmaddubsw m7, m4 - punpckhbw m1, m3, m1 + punpckhbw m1, m3 pmaddubsw m1, m4 REPX {pmulhrsw x, m5}, m6, m0, m7, m1 packuswb m6, m0 @@ -1005,8 +985,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] - imul mxyd, 0xff01 - add mxyd, 16 << 8 + imul mxyd, 255 + add mxyd, 16 vpbroadcastw m5, mxyd mov mxyd, r6m ; my test mxyd, mxyd @@ -1032,7 +1012,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .h_w4_loop RET .h_w8: - vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m4, [bilin_h_perm16] .h_w8_loop: movu xmm0, [srcq+strideq*0] vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 @@ -1127,8 +1107,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v: WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] - imul mxyd, 0xff01 - add mxyd, 16 << 8 + imul mxyd, 255 + add mxyd, 16 add wq, t2 lea stride3q, [strideq*3] vpbroadcastw m6, mxyd @@ -1218,11 +1198,11 @@ cglobal 
prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v_w64_loop: vpermq m1, m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - punpcklbw m4, m1, m0 - punpckhbw m2, m1, m0 + punpcklbw m4, m0, m1 + punpckhbw m2, m0, m1 vpermq m0, m5, [srcq+strideq*0] - punpcklbw m3, m0, m1 - punpckhbw m1, m0, m1 + punpcklbw m3, m1, m0 + punpckhbw m1, m0 pmaddubsw m4, m6 pmaddubsw m2, m6 pmaddubsw m3, m6 @@ -1243,28 +1223,28 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 vpermq m2, m5, [srcq+strideq*1+ 0] vpermq m3, m5, [srcq+strideq*1+64] lea srcq, [srcq+strideq*2] - punpcklbw m4, m2, m0 - punpckhbw m0, m2, m0 + punpcklbw m4, m0, m2 + punpckhbw m0, m2 pmaddubsw m4, m6 pmaddubsw m0, m6 mova [tmpq+64*0], m4 mova [tmpq+64*1], m0 - punpcklbw m4, m3, m1 - punpckhbw m1, m3, m1 + punpcklbw m4, m1, m3 + punpckhbw m1, m3 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+64*2], m4 mova [tmpq+64*3], m1 vpermq m0, m5, [srcq+strideq*0+ 0] vpermq m1, m5, [srcq+strideq*0+64] - punpcklbw m4, m0, m2 - punpckhbw m2, m0, m2 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 pmaddubsw m4, m6 pmaddubsw m2, m6 mova [tmpq+64*4], m4 mova [tmpq+64*5], m2 - punpcklbw m4, m1, m3 - punpckhbw m3, m1, m3 + punpcklbw m4, m3, m1 + punpckhbw m3, m1 pmaddubsw m4, m6 pmaddubsw m3, m6 mova [tmpq+64*6], m4 @@ -1308,7 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .hv_w4_loop RET .hv_w8: - vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m4, [bilin_h_perm16] vbroadcasti32x4 m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 @@ -1448,7 +1428,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; fn, type, type_h, type_v +%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1456,8 +1436,8 @@ cglobal %1_%2_8bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last 
filter - jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1489,24 +1469,22 @@ DECLARE_REG_TMP 4, 5 DECLARE_REG_TMP 7, 8 %endif +; Due to the use of vpdpbusd (which does 4 pixels per instruction) in +; the horizontal filter, 6-tap is only used for the vertical filter. %define PUT_8TAP_FN FN put_8tap, - -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_6tap_8bpc +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns %define base r8-put_avx512icl imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx512icl] movsxd wq, wm movifnidn hd, hm @@ -1514,6 +1492,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1523,158 +1502,577 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pop r8 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) - WIN64_SPILL_XMM 11 - cmp wd, 4 - jl .h_w2 - vbroadcasti128 m6, [subpel_h_shufA] - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m7, 
[subpel_h_shufB] - vbroadcasti128 m8, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] - vpbroadcastd m9, [base+mxq*8+subpel_filters+0] - vpbroadcastd m10, [base+mxq*8+subpel_filters+4] - add wq, r8 - jmp wq -.h_w2: - movzx mxd, mxb - dec srcq - mova xmm4, [subpel_h_shuf4] - vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] -.h_w2_loop: - movq xmm0, [srcq+ssq*0] - movhps xmm0, [srcq+ssq*1] +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)] + vpbroadcastd m6, [pw_512] + lea myq, [base+subpel_filters+1+myq*8] + vpbroadcastw m7, [myq+0] + add r6, r8 + vpbroadcastw m8, [myq+2] + mov nsq, ssq + vpbroadcastw m9, [myq+4] + neg nsq + jmp r6 +.v_w2: + movd xmm2, [srcq+nsq*2] + pinsrw xmm2, [srcq+nsq*1], 2 + pinsrw xmm2, [srcq+ssq*0], 4 + pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3 lea srcq, [srcq+ssq*2] - pshufb xmm0, xmm4 - mova xmm1, xm5 - vpdpbusd xmm1, xmm0, xmm3 - packssdw xmm0, xmm1, xmm1 - psraw xmm0, 6 - packuswb xmm0, xm0 - pextrw [dstq+dsq*0], xmm0, 0 - pextrw [dstq+dsq*1], xmm0, 1 + vpbroadcastd xmm0, [srcq+ssq*0] + palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm1, xmm2, xmm3 ; 01 12 + punpckhbw xmm2, xmm3 ; 23 34 +.v_w2_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xmm3, xmm1, xm7 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm8 ; a1 b1 + paddw xmm3, xmm2 + vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm0, 0x02 ; 5 6 + punpcklbw xmm2, xmm4 ; 45 56 + pmaddubsw xmm4, xmm2, xm9 ; a2 b2 + paddw xmm3, xmm4 + pmulhrsw xmm3, xm6 + packuswb xmm3, xmm3 + pextrw [dstq+dsq*0], xmm3, 0 + pextrw [dstq+dsq*1], xmm3, 2 lea dstq, [dstq+dsq*2] sub hd, 2 - jg .h_w2_loop + jg .v_w2_loop RET -.h_w4: - movzx mxd, mxb - dec srcq - vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] -.h_w4_loop: - movq xmm0, [srcq+ssq*0] - movq xmm1, [srcq+ssq*1] +.v_w4: + movd xmm2, [srcq+nsq*2] + 
pinsrd xmm2, [srcq+nsq*1], 1 + pinsrd xmm2, [srcq+ssq*0], 2 + pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3 lea srcq, [srcq+ssq*2] - pshufb xmm0, xm6 - pshufb xmm1, xm6 - mova xmm2, xm5 - vpdpbusd xmm2, xmm0, xmm3 - mova xmm0, xm5 - vpdpbusd xmm0, xmm1, xmm3 - packssdw xmm0, xmm2, xmm0 - psraw xmm0, 6 - packuswb xmm0, xmm0 - movd [dstq+dsq*0], xmm0 - pextrd [dstq+dsq*1], xmm0, 1 + vpbroadcastd xmm0, [srcq+ssq*0] + palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm1, xmm2, xmm3 ; 01 12 + punpckhbw xmm2, xmm3 ; 23 34 +.v_w4_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xmm3, xmm1, xm7 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm8 ; a1 b1 + paddw xmm3, xmm2 + vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm0, 0x02 ; 5 6 + punpcklbw xmm2, xmm4 ; 45 56 + pmaddubsw xmm4, xmm2, xm9 ; a2 b2 + paddw xmm3, xmm4 + pmulhrsw xmm3, xm6 + packuswb xmm3, xmm3 + movd [dstq+dsq*0], xmm3 + pextrd [dstq+dsq*1], xmm3, 1 lea dstq, [dstq+dsq*2] sub hd, 2 - jg .h_w4_loop + jg .v_w4_loop RET -.h_w8: - movu xm0, [srcq+ssq*0] - vinserti32x4 ym0, [srcq+ssq*1], 1 +.v_w8: + movq xmm1, [srcq+nsq*2] + vpbroadcastq ymm3, [srcq+nsq*1] + vpbroadcastq ymm2, [srcq+ssq*0] + vpbroadcastq ymm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 - vpmovuswb xm0, ym0 - movq [dstq+dsq*0], xm0 - movhps [dstq+dsq*1], xm0 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm1, ymm3, 0x30 + vpblendd ymm3, ymm2, 0x30 + punpcklbw ymm1, ymm3 ; 01 12 + vpblendd ymm2, ymm4, 0x30 + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm2, ymm4 ; 23 34 +.v_w8_loop: + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw ymm3, ymm1, ym7 ; a0 b0 + mova ymm1, ymm2 + pmaddubsw ymm2, ym8 ; a1 b1 + paddw ymm3, ymm2 + vpblendd ymm2, ymm0, ymm4, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm2, ymm4 ; 45 56 + pmaddubsw ymm4, ymm2, ym9 ; a2 b2 + paddw ymm3, ymm4 + pmulhrsw ymm3, ym6 + vextracti128 xmm4, 
ymm3, 1 + packuswb xmm3, xmm4 + movq [dstq+dsq*0], xmm3 + movhps [dstq+dsq*1], xmm3 lea dstq, [dstq+dsq*2] sub hd, 2 - jg .h_w8 + jg .v_w8_loop + vzeroupper RET -.h_w16: - mova m6, [spel_h_perm16a] - mova m7, [spel_h_perm16b] - mova m8, [spel_h_perm16c] -.h_w16_loop: - movu ym0, [srcq+ssq*0] +.v_w16: + mova m5, [spel_v_perm16a] + vbroadcasti32x4 m1, [srcq+nsq*2] + vbroadcasti32x4 ym3, [srcq+nsq*1] + mov r6d, 0x0f + vbroadcasti32x4 m2, [srcq+ssq*0] + kmovb k1, r6d + vbroadcasti32x4 ym4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m1{k1}, m3, m2, 0xcc + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m1, m5, m1 ; 01 12 + vpermb m2, m5, m2 ; 23 34 +.v_w16_loop: + vbroadcasti32x4 ym4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m3, m1, m7 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m8 ; a1 b1 + paddw m3, m2 + mova m2, m0 + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m2, m5, m2 ; 45 56 + pmaddubsw m4, m2, m9 ; a2 b2 + paddw m3, m4 + pmulhrsw m3, m6 + vextracti32x8 ym4, m3, 1 + packuswb ym3, ym4 + mova [dstq+dsq*0], xm3 + vextracti32x4 [dstq+dsq*1], ym3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: + mova m10, [spel_v_perm32] + pmovzxbq m5, [pb_02461357] + vpshrdw m11, m10, m10, 8 + movu ym0, [srcq+nsq*2] + vinserti32x8 m0, [srcq+nsq*1], 1 + vpermb m1, m10, m0 ; 01 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m2, m11, m0 ; 12 vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 1, 2, 3, 1 - vpmovuswb ym0, m0 - mova [dstq+dsq*0], xm0 - vextracti128 [dstq+dsq*1], ym0, 1 + vpermb m3, m10, m0 ; 23 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m4, m11, m0 ; 34 +.v_w32_loop: + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddubsw m12, m1, m7 + mova m1, m3 + pmaddubsw m13, m2, m7 + mova m2, m4 + pmaddubsw m14, m3, m8 + vpermb m3, m10, m0 ; 45 + vinserti32x8 m0, [srcq+ssq*0], 0 + pmaddubsw m15, m4, m8 + vpermb m4, m11, m0 ; 56 + paddw m12, m14 + pmaddubsw m14, m3, 
m9 + paddw m13, m15 + pmaddubsw m15, m4, m9 + paddw m12, m14 + paddw m13, m15 + pmulhrsw m12, m6 + pmulhrsw m13, m6 + packuswb m12, m13 + vpermq m12, m5, m12 + mova [dstq+dsq*0], ym12 + vextracti32x8 [dstq+dsq*1], m12, 1 lea dstq, [dstq+dsq*2] sub hd, 2 - jg .h_w16_loop + jg .v_w32_loop RET -.h_w32: - movu ym0, [srcq+ssq*0+8*0] - vinserti32x8 m0, [srcq+ssq*1+8*0], 1 - movu ym1, [srcq+ssq*0+8*1] - vinserti32x8 m1, [srcq+ssq*1+8*1], 1 +.v_w64: +.v_w128: + lea r6d, [hq+wq*4-256] +.v_loop0: + movu m2, [srcq+nsq*2] + movu m4, [srcq+nsq*1] + lea r4, [srcq+ssq*2] + movu m11, [srcq+ssq*0] + movu m13, [srcq+ssq*1] + mov r7, dstq + movu m0, [r4 +ssq*0] + punpcklbw m1, m2, m4 ; 01l + punpckhbw m2, m4 ; 01h + punpcklbw m3, m4, m11 ; 12l + punpckhbw m4, m11 ; 12h + punpcklbw m10, m11, m13 ; 23l + punpckhbw m11, m13 ; 23h + punpcklbw m12, m13, m0 ; 34l + punpckhbw m13, m0 ; 34h +.v_loop: + movu m5, [r4+ssq*1] + pmaddubsw m14, m1, m7 ; a0l + mova m1, m10 + pmaddubsw m10, m8 ; a1l + lea r4, [r4+ssq*2] + pmaddubsw m15, m2, m7 ; a0h + mova m2, m11 + pmaddubsw m11, m8 ; a1h + paddw m14, m10 + punpcklbw m10, m0, m5 ; 45l + paddw m15, m11 + punpckhbw m11, m0, m5 ; 45h + pmaddubsw m0, m10, m9 ; a2l + paddw m14, m0 + pmaddubsw m0, m11, m9 ; a2h + paddw m15, m0 + movu m0, [r4+ssq*0] + pmulhrsw m14, m6 + pmulhrsw m15, m6 + packuswb m14, m15 + pmaddubsw m15, m3, m7 ; b0l + mova m3, m12 + pmaddubsw m12, m8 ; b1l + mova [r7+dsq*0], m14 + pmaddubsw m14, m4, m7 ; b0h + mova m4, m13 + pmaddubsw m13, m8 ; b1h + paddw m15, m12 + punpcklbw m12, m5, m0 ; 56l + paddw m14, m13 + punpckhbw m13, m5, m0 ; 56h + pmaddubsw m5, m12, m9 ; b2l + paddw m15, m5 + pmaddubsw m5, m13, m9 ; b2h + paddw m14, m5 + pmulhrsw m15, m6 + pmulhrsw m14, m6 + packuswb m15, m14 + mova [r7+dsq*1], m15 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .v_loop + add srcq, 64 + add dstq, 64 + movzx hd, r6b + sub r6d, 256 + jg .v_loop0 + RET +.h: + test myd, 0xf00 + jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2 +.hv: + vpbroadcastd 
m9, [pd_34] + mova xm10, [spel_hv_end] + pxor xm0, xm0 + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq ym1, [base+subpel_filters+1+myq*8] + mov nsq, ssq + punpcklbw ym0, ym1 + neg nsq + psraw ym0, 2 ; << 6 + pshufd ym11, ym0, q0000 + pshufd ym12, ym0, q1111 + pshufd ym13, ym0, q2222 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 ym5, [subpel_h_shuf4] + movq xmm0, [srcq+nsq*2] + movhps xmm0, [srcq+nsq*1] + movq xmm2, [srcq+ssq*0] + movhps xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 4, 3, 2 - packuswb m0, m1 - mova [dstq+dsq*0], ym0 - vextracti32x8 [dstq+dsq*1], m0, 1 + vpbroadcastq ymm1, [srcq+ssq*0] + vpblendd ymm0, ymm1, 0x30 + pshufb xmm2, xm5 ; 2 3 + pshufb ymm0, ym5 ; 0 1 4 + mova xmm1, xm9 + vpdpbusd xmm1, xmm2, xm7 + mova ymm2, ym9 + vpdpbusd ymm2, ymm0, ym7 + packssdw ymm2, ymm1 + psraw ymm2, 2 + vextracti128 xmm0, ymm2, 1 + vzeroupper + palignr xmm0, xmm2, 4 + punpcklwd xmm1, xmm2, xmm0 ; 01 12 + punpckhwd xmm2, xmm0 ; 23 34 +.hv_w2_loop: + movq xmm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm3, [srcq+ssq*0] + pmaddwd xmm4, xmm1, xm11 ; a0 b0 + mova xmm1, xmm2 + vpdpwssd xmm4, xmm2, xm12 ; a1 b1 + pshufb xmm3, xm5 + mova xmm2, xm9 + vpdpbusd xmm2, xmm3, xm7 + packssdw xmm3, xmm2, xmm2 + psraw xmm3, 2 + palignr xmm2, xmm3, xmm0, 12 + mova xmm0, xmm3 + punpcklwd xmm2, xmm3 ; 45 56 + vpdpwssd xmm4, xmm2, xm13 ; a2 b2 + packuswb xmm4, xmm4 + pshufb xmm4, xm10 + pextrw [dstq+dsq*0], xmm4, 0 + pextrw [dstq+dsq*1], xmm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 - jg .h_w32 + jg .hv_w2_loop RET -.h_w64: - movu m0, [srcq+8*0] - movu m1, [srcq+8*1] - add srcq, ssq - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 4, 3, 2 - packuswb m0, m1 - mova [dstq], m0 - add dstq, dsq - dec hd - jg .h_w64 +.hv_w4: + movq xm2, [srcq+nsq*2] + vpbroadcastq ym1, [srcq+nsq*1] + vinserti32x4 ym2, [srcq+ssq*0], 1 + vinserti32x4 m1, 
[srcq+ssq*1], 2 ; _ 1 3 + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m5, [subpel_h_shufA] + vinserti32x4 m2, [srcq+ssq*0], 2 ; 0 2 4 + pshufb m1, m5 + mova m0, m9 + pshufb m2, m5 + mova m3, m9 + vpdpbusd m0, m1, m7 + mova ym1, [spel_hv_perm4a] + vpdpbusd m3, m2, m7 + mova ym2, [spel_hv_perm4b] + mov r6d, 0x5555 + mova ym6, [spel_hv_perm4d] + packssdw m0, m3 + kmovw k1, r6d + psraw m0, 2 ; _ 0 1 2 3 4 5 6 + vpermb ym1, ym1, ym0 ; 01 12 + vpermb m2, m2, m0 ; 23 34 +.hv_w4_loop: + movq xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym3, [srcq+ssq*0], 1 + pmaddwd ym4, ym1, ym11 ; a0 b0 + mova ym1, ym2 + pshufb ym3, ym5 + mova ym0, ym9 + vpdpbusd ym0, ym3, ym7 + vpdpwssd ym4, ym2, ym12 ; a1 b1 + vpsraw ym2{k1}, ym0, 2 ; 5 6 + vpermb ym2, ym6, ym2 ; 45 56 + vpdpwssd ym4, ym2, ym13 ; a2 b2 + packuswb ym4, ym4 + vpermb ym4, ym10, ym4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop RET -.h_w128: - movu m0, [srcq+8*0] - movu m2, [srcq+8*1] - movu m1, [srcq+8*8] - movu m3, [srcq+8*9] - add srcq, ssq - PUT_8TAP_H 0, 4, 11, 12 - PUT_8TAP_H 2, 12, 11, 4 - PUT_8TAP_H 1, 4, 11, 12 - PUT_8TAP_H 3, 12, 11, 4 - packuswb m0, m2 - packuswb m1, m3 - mova [dstq+64*0], m0 - mova [dstq+64*1], m1 - add dstq, dsq - dec hd - jg .h_w128 +.hv_w8: + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m11, [base+subpel_filters+mxq*8+0] + vpbroadcastd m12, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m1, [base+subpel_filters+1+myq*8] + mov nsq, ssq + punpcklbw m0, m1 + neg nsq + psraw m0, 2 ; << 6 + pshufd m13, m0, q0000 + pshufd m14, m0, q1111 + pshufd m15, m0, q2222 + cmp wd, 8 + jne .hv_w16 + movu xm0, [srcq+nsq*2] + vinserti32x4 ym0, [srcq+nsq*1], 1 + vbroadcasti32x4 m1, [subpel_h_shufA] + vinserti32x4 m0, [srcq+ssq*0], 2 + vbroadcasti32x4 m4, [subpel_h_shufB] + vinserti32x4 m0, [srcq+ssq*1], 3 + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m7, [subpel_h_shufC] + vbroadcasti32x4 
ym5, [srcq+ssq*0] + vbroadcasti32x8 m6, [subpel_h_shufA] + pshufb m1, m0, m1 ; 0 1 2 3 0123 + mova m2, m9 + vpdpbusd m2, m1, m11 + pshufb m4, m0, m4 ; 0 1 2 3 4567 + mova m1, m9 + vpdpbusd m1, m4, m11 + pshufb m0, m7 ; 0 1 2 3 89ab + pshufb ym7, ym5, ym6 ; 4 0123 4567 + mova ym3, ym9 + vpdpbusd ym3, ym7, ym11 + vbroadcasti32x8 m7, [subpel_h_shufB] + vpdpbusd m2, m4, m12 + mova m4, [spel_hv_perm8a] + pshufb ym5, ym7 ; 4 4567 89ab + vpdpbusd m1, m0, m12 + vpaddd m0, m4, [pb_32] {1to16} + vpdpbusd ym3, ym5, ym12 + mova m5, [spel_hv_perm8b] + mov r6, 0x55555555ff00 + packssdw m2, m1 + vpmovsdw xm3, ym3 + kmovq k1, r6 + psraw m2, 2 ; 0 1 2 3 + psraw xm3, 2 ; 4 + vpermb m1, m4, m2 ; 01 12 + kshiftrq k2, k1, 16 + vpermt2b m2, m0, m3 ; 23 34 +.hv_w8_loop: + vbroadcasti32x4 ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m3{k1}, [srcq+ssq*0] + pmaddwd m0, m1, m13 ; a0 b0 + pshufb m1, m3, m6 ; 5 6 0123 4567 + mova m4, m9 + vpdpbusd m4, m1, m11 + pshufb m3, m7 ; 5 6 4567 89ab + vpdpwssd m0, m2, m14 ; a1 b1 + mova m1, m2 + vpdpbusd m4, m3, m12 + psraw m2{k2}, m4, 2 ; 53 64 + vpermb m2, m5, m2 ; 45 56 + vpdpwssd m0, m2, m15 ; a2 b2 + packuswb m0, m0 + vpermb m0, m10, m0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + movu m19, [spel_hv_perm16a] + vpbroadcastd m7, [pb_4] + lea r6d, [wq*2-32] + mova m6, [spel_hv_perm16b] + paddb m20, m7, m19 + lea r6d, [hq+r6*8] + paddb m21, m7, m20 + mova ym10, [spel_hv_end16] + paddb m7, m6 +.hv_w16_loop0: + movu ym16, [srcq+nsq*2] + vinserti32x8 m16, [srcq+nsq*1], 1 + lea r4, [srcq+ssq*2] + movu ym17, [srcq+ssq*0] + vinserti32x8 m17, [srcq+ssq*1], 1 + mov r7, dstq + movu ym18, [r4 +ssq*0] + vpermb m2, m19, m16 ; 0 1 0123 89ab + mova m1, m9 + vpermb m3, m21, m16 ; 0 1 89ab ghij + vpdpbusd m1, m2, m11 + mova m2, m9 + vpermb m4, m19, m17 ; 2 3 0123 89ab + vpdpbusd m2, m3, m12 + mova m3, m9 + vpermb m5, m21, m17 ; 2 3 89ab ghij + vpdpbusd m3, m4, m11 + mova 
m4, m9 + vpermb m0, m6, m18 ; 4 0145 2367 89cd abef + vpdpbusd m4, m5, m12 + mova m5, m9 + vpermb m16, m20, m16 ; 0 1 4567 cdef + vpdpbusd m5, m0, m11 + vpermb m17, m20, m17 ; 2 3 4567 cdef + vpdpbusd m1, m16, m12 + vpermb m18, m7, m18 ; 4 4589 67ab cdgh efij + vpdpbusd m2, m16, m11 + vpdpbusd m3, m17, m12 + vpdpbusd m4, m17, m11 + vpdpbusd m5, m18, m12 + packssdw m1, m2 ; 01 + packssdw m3, m4 ; 23 + REPX {psraw x, 2}, m1, m3, m5 + vpshrdd m2, m1, m3, 16 ; 12 + vpshrdd m4, m3, m5, 16 ; 34 +.hv_w16_loop: + movu ym18, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti32x8 m18, [r4+ssq*0], 1 + pmaddwd m16, m1, m13 ; a0 + vpermb m1, m19, m18 ; 5 6 0123 89ab + pmaddwd m17, m2, m13 ; b0 + vpermb m2, m20, m18 ; 5 6 4567 cdef + mova m0, m9 + vpdpbusd m0, m1, m11 + vpermb m18, m21, m18 + mova m1, m9 + vpdpbusd m1, m2, m11 + vpdpwssd m16, m3, m14 ; a1 + vpdpwssd m17, m4, m14 ; b1 + vpdpbusd m0, m2, m12 + mova m2, m4 + vpdpbusd m1, m18, m12 + packssdw m0, m1 + mova m1, m3 + psraw m4, m0, 2 ; 5 6 + vpshrdd m3, m2, m4, 16 ; 4 5 + vpdpwssd m17, m4, m15 ; b2 + vpdpwssd m16, m3, m15 ; a2 + packuswb m16, m17 + vpermb m16, m10, m16 + mova [r7+dsq*0], xm16 + vextracti128 [r7+dsq*1], ym16, 1 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w16_loop0 + vzeroupper RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx512icl] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd tzcnt r6d, wd + lea myq, [base+subpel_filters+myq*8] movzx r6d, word [r8+r6*2+table_offset(put, 
_8tap_v)] vpbroadcastd m7, [pw_512] - lea myq, [base+subpel_filters+myq*8] vpbroadcastw m8, [myq+0] - vpbroadcastw m9, [myq+2] - vpbroadcastw m10, [myq+4] - vpbroadcastw m11, [myq+6] add r6, r8 + vpbroadcastw m9, [myq+2] lea ss3q, [ssq*3] + vpbroadcastw m10, [myq+4] sub srcq, ss3q + vpbroadcastw m11, [myq+6] jmp r6 .v_w2: movd xmm2, [srcq+ssq*0] @@ -1802,7 +2200,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vzeroupper RET .v_w16: - mova m12, [spel_v_perm16] + mova m12, [spel_v_perm16a] vbroadcasti32x4 m1, [srcq+ssq*0] vbroadcasti32x4 ym4, [srcq+ssq*1] mov r6d, 0x0f @@ -1990,7 +2388,146 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .v_loop0 vzeroupper RET +.h: + test myd, 0xf00 + jnz .hv +.h2: + vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) + cmp wd, 4 + jl .h_w2 + vbroadcasti128 m6, [subpel_h_shufA] + je .h_w4 + tzcnt wd, wd + vbroadcasti128 m7, [subpel_h_shufB] + vbroadcasti128 m8, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [base+mxq*8+subpel_filters+0] + vpbroadcastd m10, [base+mxq*8+subpel_filters+4] + add wq, r8 + jmp wq +.h_w2: + movzx mxd, mxb + dec srcq + mova xmm4, [subpel_h_shuf4] + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w2_loop: + movq xmm0, [srcq+ssq*0] + movhps xmm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xmm4 + mova xmm1, xm5 + vpdpbusd xmm1, xmm0, xmm3 + packssdw xmm0, xmm1, xmm1 + psraw xmm0, 6 + packuswb xmm0, xm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w4_loop: + movq xmm0, [srcq+ssq*0] + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xm6 + pshufb xmm1, xm6 + mova xmm2, xm5 + vpdpbusd xmm2, xmm0, xmm3 + mova xmm0, xm5 + vpdpbusd xmm0, xmm1, xmm3 + packssdw xmm0, xmm2, xmm0 + psraw xmm0, 6 + packuswb xmm0, xmm0 + 
movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mova m6, [spel_h_perm16] + vpbroadcastd m8, [pb_4] + paddb m7, m8, m6 + paddb m8, m7 +.h_w16_loop: + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3, 1 + vpmovuswb ym0, m0 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + movu ym0, [srcq+ssq*0+8*0] + vinserti32x8 m0, [srcq+ssq*1+8*0], 1 + movu ym1, [srcq+ssq*0+8*1] + vinserti32x8 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + movu m0, [srcq+8*0] + movu m2, [srcq+8*1] + movu m1, [srcq+8*8] + movu m3, [srcq+8*9] + add srcq, ssq + PUT_8TAP_H 0, 4, 11, 12 + PUT_8TAP_H 2, 12, 11, 4 + PUT_8TAP_H 1, 4, 11, 12 + PUT_8TAP_H 3, 12, 11, 4 + packuswb m0, m2 + packuswb m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w128 + RET .hv: + vpbroadcastd m9, [pd_34] + pxor xm0, xm0 cmp wd, 4 jg .hv_w8 movzx mxd, mxb @@ -2000,12 +2537,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - vpbroadcastd m8, [pd_2] - vpbroadcastq ym0, [base+subpel_filters+myq*8] + vpbroadcastq ym1, [base+subpel_filters+myq*8] lea ss3q, [ssq*3] - vpbroadcastd ym9, 
[pd_32768] mov r6, srcq - punpcklbw ym0, ym8, ym0 + punpcklbw ym0, ym1 sub r6, ss3q psraw ym0, 2 ; << 6 mova xm14, [spel_hv_end] @@ -2029,9 +2564,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 pshufb ymm2, ym6 pshufb ymm0, ym6 - mova ymm1, ym8 + mova ymm1, ym9 vpdpbusd ymm1, ymm2, ym7 - mova ymm2, ym8 + mova ymm2, ym9 vpdpbusd ymm2, ymm0, ym7 packssdw ymm2, ymm1, ymm2 psraw ymm2, 2 @@ -2045,14 +2580,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm4, [srcq+ssq*0] - mova xmm5, xm9 - vpdpwssd xmm5, xmm1, xm10 ; a0 b0 + pmaddwd xmm5, xmm1, xm10 ; a0 b0 mova xmm1, xmm2 vpdpwssd xmm5, xmm2, xm11 ; a1 b1 pshufb xmm4, xm6 mova xmm2, xmm3 vpdpwssd xmm5, xmm3, xm12 ; a2 b2 - mova xmm3, xm8 + mova xmm3, xm9 vpdpbusd xmm3, xmm4, xm7 packssdw xmm4, xmm3, xmm3 psraw xmm4, 2 @@ -2081,9 +2615,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 pshufb m2, m6 pshufb m1, m6 - mova m0, m8 + mova m0, m9 vpdpbusd m0, m2, m7 - mova m4, m8 + mova m4, m9 vpdpbusd m4, m1, m7 mova ym1, [spel_hv_perm4a] mova ym2, [spel_hv_perm4b] @@ -2100,11 +2634,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 - mova ym5, ym9 - vpdpwssd ym5, ym1, ym10 ; a0 b0 + pmaddwd ym5, ym1, ym10 ; a0 b0 mova ym1, ym2 pshufb ym4, ym6 - mova ym0, ym8 + mova ym0, ym9 vpdpbusd ym0, ym4, ym7 vpdpwssd ym5, ym2, ym11 ; a1 b1 mova ym2, ym3 @@ -2129,10 +2662,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - vpbroadcastd m8, [pd_2] - vpbroadcastq m0, [base+subpel_filters+myq*8] - vpbroadcastd m9, [pd_32768] - punpcklbw m0, m8, m0 + vpbroadcastq m1, [base+subpel_filters+myq*8] + punpcklbw m0, m1 lea ss3q, [ssq*3] psraw m0, 2 ; << 6 pshufd m12, m0, q0000 
@@ -2153,31 +2684,31 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vbroadcasti32x4 m4, [subpel_h_shufA] vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ vbroadcasti32x4 m7, [subpel_h_shufB] - vbroadcasti32x4 m17, [subpel_h_shufC] + vbroadcasti32x4 m8, [subpel_h_shufC] pshufb m1, m6, m4 ; 0 1 2 3 0123 - mova m2, m8 + mova m2, m9 vpdpbusd m2, m1, m10 pshufb m5, m6, m7 ; 0 1 2 3 4567 - mova m1, m8 + mova m1, m9 vpdpbusd m1, m5, m10 pshufb m4, m0, m4 ; 4 5 6 _ 0123 - mova m3, m8 + mova m3, m9 vpdpbusd m3, m4, m10 pshufb m7, m0, m7 ; 4 5 6 _ 4567 - mova m4, m8 + mova m4, m9 vpdpbusd m4, m7, m10 - pshufb m6, m17 + pshufb m6, m8 vpdpbusd m2, m5, m11 vpdpbusd m1, m6, m11 - pshufb m6, m0, m17 + pshufb m6, m0, m8 vpdpbusd m3, m7, m11 vpdpbusd m4, m6, m11 mova m5, [spel_hv_perm8a] - mova m0, [spel_hv_perm8b] + vpaddd m0, m5, [pb_32] {1to16} mov r6, 0x55555555ff00 packssdw m2, m1 packssdw m3, m4 - mova m18, [spel_hv_perm8c] + mova m8, [spel_hv_perm8b] psraw m2, 2 ; 0 1 2 3 psraw m3, 2 ; 4 5 6 _ vpermb m1, m5, m2 ; 01 12 @@ -2192,10 +2723,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vbroadcasti32x4 ym4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti32x4 m4{k1}, [srcq+ssq*0] - mova m0, m9 - vpdpwssd m0, m1, m12 ; a0 b0 + pmaddwd m0, m1, m12 ; a0 b0 pshufb m1, m4, m6 ; 7 8 0123 4567 - mova m5, m8 + mova m5, m9 vpdpbusd m5, m1, m10 pshufb m4, m7 ; 7 8 4567 89ab vpdpwssd m0, m2, m13 ; a1 b1 @@ -2204,7 +2734,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m2, m3 vpdpwssd m0, m3, m14 ; a2 b2 psraw m3{k2}, m5, 2 ; 75 86 - vpermb m3, m18, m3 ; 67 78 + vpermb m3, m8, m3 ; 67 78 vpdpwssd m0, m3, m15 ; a3 b3 packuswb m0, m0 vpermb zmm1, m16, m0 @@ -2216,111 +2746,652 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vzeroupper RET .hv_w16: - movu m7, [spel_hv_perm16a] + WIN64_SPILL_XMM 23 + movu m22, [spel_hv_perm16a] sub srcq, ss3q - mova m20, [spel_hv_perm16b] + vpbroadcastd m8, 
[pb_4] lea r6d, [wq*2-32] - mova m21, [spel_hv_perm16c] - mov r4, srcq - mov r7, dstq + mova m7, [spel_hv_perm16b] + paddb m20, m8, m22 mova ym16, [spel_hv_end16] + paddb m21, m8, m20 lea r6d, [hq+r6*8] + paddb m8, m7 .hv_w16_loop0: movu ym17, [srcq+ssq*0] vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 + lea r4, [srcq+ss3q] movu ym18, [srcq+ssq*2] - add srcq, ss3q - vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3 - movu ym19, [srcq+ssq*1] - vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5 - add srcq, ss3q - vpermb m2, m7, m17 ; 0 1 0123 89ab - vpermb m0, m20, m17 ; 0 1 4567 cdef - vpermb m4, m7, m18 ; 2 3 0123 89ab - mova m1, m8 + vinserti32x8 m18, [r4 +ssq*0], 1 ; 2 3 + mov r7, dstq + movu ym19, [r4 +ssq*1] + vinserti32x8 m19, [r4 +ssq*2], 1 ; 4 5 + add r4, ss3q + vpermb m2, m22, m17 ; 0 1 0123 89ab + mova m1, m9 + vpermb m3, m21, m17 ; 0 1 89ab ghij vpdpbusd m1, m2, m10 - vpermb m5, m20, m18 ; 2 3 4567 cdef - mova m2, m8 - vpdpbusd m2, m0, m10 - vpermb m17, m21, m17 ; 0 1 89ab ghij - mova m3, m8 + mova m2, m9 + vpermb m4, m22, m18 ; 2 3 0123 89ab + vpdpbusd m2, m3, m11 + mova m3, m9 + vpermb m5, m21, m18 ; 2 3 89ab ghij vpdpbusd m3, m4, m10 - vpermb m6, m7, m19 ; 4 5 0123 89ab - mova m4, m8 - vpdpbusd m4, m5, m10 - vpermb m18, m21, m18 ; 2 3 89ab ghij - vpdpbusd m1, m0, m11 - movu ym0, [srcq+ssq*0] ; 6 - vpdpbusd m2, m17, m11 - vpermb m17, m20, m19 ; 4 5 4567 cdef - vpdpbusd m3, m5, m11 - mova m5, m8 + mova m4, m9 + vpermb m6, m22, m19 ; 4 5 0123 89ab + vpdpbusd m4, m5, m11 + mova m5, m9 + vpermb m17, m20, m17 ; 0 1 4567 cdef vpdpbusd m5, m6, m10 - mova m6, m8 - vpdpbusd m6, m17, m10 - vpdpbusd m4, m18, m11 - mova m18, [spel_hv_perm16d] - vpermb m18, m18, m0 ; 6 0145 2367 89cd abef - vpdpbusd m5, m17, m11 - vpermb m19, m21, m19 ; 4 5 89ab ghij - mova m17, m8 - vpdpbusd m17, m18, m10 - mova m18, [spel_hv_perm16e] - vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij - packssdw m1, m2 ; 01 - vpdpbusd m6, m19, m11 - packssdw m3, m4 ; 23 - vpdpbusd m17, m0, m11 - psraw m1, 2 - packssdw m5, m6 ; 45 - 
psraw m3, 2 + mova m6, m9 + vpermb m0, m21, m19 ; 4 5 89ab ghij + vpdpbusd m1, m17, m11 + vpdpbusd m2, m17, m10 + movu ym17, [r4+ssq*0] ; 6 + vpermb m18, m20, m18 ; 2 3 4567 cdef + vpdpbusd m6, m0, m11 + vpermb m0, m7, m17 ; 6 0145 2367 89cd abef + vpdpbusd m3, m18, m11 + vpermb m19, m20, m19 ; 4 5 4567 cdef + vpdpbusd m4, m18, m10 + mova m18, m9 + vpermb m17, m8, m17 ; 6 4589 67ab cdgh efij + vpdpbusd m18, m0, m10 + packssdw m1, m2 + vpdpbusd m5, m19, m11 + vpdpbusd m6, m19, m10 + packssdw m3, m4 + vpdpbusd m18, m17, m11 + psraw m1, 2 ; 01 + psraw m3, 2 ; 23 + packssdw m5, m6 vpshrdd m2, m1, m3, 16 ; 12 - psraw m5, 2 + psraw m5, 2 ; 45 vpshrdd m4, m3, m5, 16 ; 34 - psraw m17, 2 - vpshrdd m6, m5, m17, 16 ; 56 + psraw m18, 2 + vpshrdd m6, m5, m18, 16 ; 56 .hv_w16_loop: - movu ym18, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vinserti32x8 m18, [srcq+ssq*0], 1 + movu ym19, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti32x8 m19, [r4+ssq*0], 1 + pmaddwd m17, m1, m12 ; a0 + vpermb m1, m22, m19 ; 7 8 0123 89ab + pmaddwd m18, m2, m12 ; b0 mova m0, m9 - vpdpwssd m0, m1, m12 ; a0 - vpermb m1, m7, m18 ; 7 8 0123 89ab - mova m17, m9 - vpdpwssd m17, m2, m12 ; b0 - vpermb m2, m20, m18 ; 7 8 4567 cdef - mova m19, m8 - vpdpbusd m19, m1, m10 - vpermb m18, m21, m18 - mova m1, m8 - vpdpbusd m1, m2, m10 - vpdpwssd m0, m3, m13 ; a1 - vpdpwssd m17, m4, m13 ; b1 - vpdpbusd m19, m2, m11 + vpermb m2, m21, m19 ; 7 8 89ab ghij + vpdpbusd m0, m1, m10 + mova m1, m9 + vpermb m19, m20, m19 ; 7 8 4567 cdef + vpdpbusd m1, m2, m11 mova m2, m4 - vpdpbusd m1, m18, m11 + vpdpwssd m17, m3, m13 ; a1 + vpdpwssd m18, m4, m13 ; b1 mova m4, m6 - vpdpwssd m0, m5, m14 ; a2 - vpdpwssd m17, m6, m14 ; b2 - packssdw m19, m1 + vpdpbusd m0, m19, m11 + vpdpbusd m1, m19, m10 + vpdpwssd m17, m5, m14 ; a2 + vpdpwssd m18, m6, m14 ; b2 + packssdw m0, m1 mova m1, m3 + psraw m6, m0, 2 ; 78 mova m3, m5 - psraw m6, m19, 2 ; 7 8 - vpshrdd m5, m4, m6, 16 ; 6 7 - vpdpwssd m17, m6, m15 ; b3 - vpdpwssd m0, m5, m15 ; a3 - packuswb m0, m17 
- vpermb zmm1, m16, m0 - mova [dstq+dsq*0], xmm1 - vextracti128 [dstq+dsq*1], ymm1, 1 - lea dstq, [dstq+dsq*2] + vpshrdd m5, m4, m6, 16 ; 67 + vpdpwssd m18, m6, m15 ; b3 + vpdpwssd m17, m5, m15 ; a3 + packuswb m17, m18 + vpermb m17, m16, m17 + mova [r7+dsq*0], xm17 + vextracti128 [r7+dsq*1], ym17, 1 + lea r7, [r7+dsq*2] sub hd, 2 jg .hv_w16_loop - add r4, 16 - add r7, 16 + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w16_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3 +%define base r7-prep_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 6tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 6tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [r7+wq*2+table_offset(prep,)] + add wq, r7 + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + tzcnt r5d, wd + lea myq, [base+subpel_filters+1+myq*8] + movzx r5d, word [r7+r5*2+table_offset(prep, _6tap_v)] + vpbroadcastd m7, [pw_8192] + sub srcq, ssq + vpbroadcastw m8, [myq+0] + add r5, r7 + vpbroadcastw m9, [myq+2] + lea ss3q, [ssq*3] + vpbroadcastw m10, [myq+4] + sub srcq, ssq + jmp r5 +.v_w4: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + vpbroadcastd ymm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd ymm3, [srcq+ssq*0] + vpbroadcastd ymm0, [srcq+ssq*1] + vbroadcasti128 ymm5, [deint_shuf4] + vpblendd ymm1, ymm2, 0xeb + punpcklqdq ymm3, 
ymm0 + vpblendd ymm1, ymm3, 0x60 ; 0 1 2 _ 2 3 4 _ + pshufb ymm1, ymm5 ; 01 12 23 34 +.v_w4_loop: + pinsrd xmm0, [srcq+ssq*2], 1 + vpbroadcastd ymm2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpbroadcastd ymm3, [srcq+ssq*0] + vpblendd ymm2, ymm0, 0xeb + vpbroadcastd ymm0, [srcq+ssq*1] + punpcklqdq ymm3, ymm0 + vpblendd ymm2, ymm3, 0x60 ; 4 5 6 _ 6 7 8 _ + pshufb ymm2, ymm5 ; 45 56 67 78 + pmaddubsw ymm3, ymm1, ym8 ; a0 b0 c0 d0 + vperm2i128 ymm1, ymm2, 0x21 ; 23 34 45 56 + pmaddubsw ymm4, ymm2, ym10 ; a2 b2 c2 d2 + pmaddubsw ymm1, ym9 ; a1 b1 c1 d1 + paddw ymm3, ymm4 + paddw ymm3, ymm1 + pmulhrsw ymm3, ym7 + mova ymm1, ymm2 + mova [tmpq], ymm3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + mova m6, [spel_v_perm8] + movq xm1, [srcq+ssq*0] + mov r6d, 0x3e + movq xm2, [srcq+ssq*1] + kmovb k1, r6d + vpbroadcastq ym3, [srcq+ssq*2] + add srcq, ss3q + vpunpcklqdq ym2, [srcq+ssq*0] {1to4} + vpunpcklqdq m1{k1}, m3, [srcq+ssq*1] {1to8} + movq xm0, [srcq+ssq*1] + kshiftlb k2, k1, 2 + shufpd m1, m2, 0x18 ; 0 1 2 3 4 + vpermb m1, m6, m1 ; 01 12 23 34 +.v_w8_loop: + vpbroadcastq ym3, [srcq+ss3q ] + vpunpcklqdq ym0{k1}, ym3, [srcq+ssq*2] {1to4} + lea srcq, [srcq+ssq*4] + vpbroadcastq m3, [srcq+ssq*1] + vpunpcklqdq m0{k2}, m3, [srcq+ssq*0] {1to8} + pmaddubsw m4, m1, m8 ; a0 b0 c0 d0 + vpermb m2, m6, m0 ; 45 56 67 78 + mova xm0, xm3 + vshufi32x4 m1, m2, q1032 ; 23 34 45 56 + pmaddubsw m3, m2, m10 ; a3 b3 c3 d3 + pmaddubsw m5, m1, m9 ; a2 b2 c2 d2 + mova m1, m2 + paddw m4, m3 + paddw m4, m5 + pmulhrsw m4, m7 + mova [tmpq], m4 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mova m11, [spel_v_perm16b] + vbroadcasti32x4 m1, [srcq+ssq*0] + mov r6d, 0x0f + vbroadcasti32x4 ym3, [srcq+ssq*1] + vbroadcasti32x4 m2, [srcq+ssq*2] + kmovb k1, r6d + add srcq, ss3q + vbroadcasti32x4 ym4, [srcq+ssq*0] + vbroadcasti32x4 m0, [srcq+ssq*1] + vshufpd m1{k1}, m3, m2, 0xcc + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m1, m11, m1 ; 01 12 + vpermb m2, m11, m2 ; 23 34 
+.v_w16_loop: + pmaddubsw m3, m1, m8 ; a0 b0 + pmaddubsw m5, m2, m9 ; a1 b1 + vbroadcasti32x4 ym6, [srcq+ssq*2] + pmaddubsw m4, m2, m8 ; c0 d0 + vbroadcasti32x4 m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vshufpd m0{k1}, m6, m2, 0xcc + vbroadcasti32x4 ym6, [srcq+ssq*0] + vpermb m1, m11, m0 ; 45 56 + vbroadcasti32x4 m0, [srcq+ssq*1] + vshufpd m2{k1}, m6, m0, 0xcc + pmaddubsw m6, m1, m9 ; c1 d1 + vpermb m2, m11, m2 ; 67 78 + paddw m3, m5 + pmaddubsw m5, m1, m10 ; a2 b2 + paddw m4, m6 + pmaddubsw m6, m2, m10 ; c2 d2 + paddw m3, m5 + paddw m4, m6 + pmulhrsw m3, m7 + pmulhrsw m4, m7 + mova [tmpq+ 0], m3 + mova [tmpq+64], m4 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + movshdup m6, [bilin_v_perm64] + movu ym16, [srcq+ssq*0] + movu ym17, [srcq+ssq*1] + movu ym18, [srcq+ssq*2] + add srcq, ss3q + movu ym19, [srcq+ssq*0] + add srcq, ssq + movu ym20, [srcq+ssq*0] + vpermt2q m16, m6, m18 ; 0 2 + vpermt2q m17, m6, m19 ; 1 3 + vpermt2q m18, m6, m20 ; 2 4 + punpcklbw m0, m16, m17 ; 01 + punpcklbw m1, m17, m18 ; 12 + punpckhbw m2, m16, m17 ; 23 + punpckhbw m3, m17, m18 ; 34 +.v_w32_loop: + movu ym16, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu ym17, [srcq+ssq*0] + pmaddubsw m4, m0, m8 ; a0 + mova m0, m2 + pmaddubsw m2, m9 ; a1 + vpermt2q m16, m6, m17 ; 5 6 + pmaddubsw m5, m1, m8 ; b0 + mova m1, m3 + pmaddubsw m3, m9 ; b1 + shufpd m18, m16, 0x55 ; 4 5 + paddw m4, m2 + punpcklbw m2, m18, m16 ; 45 + paddw m5, m3 + punpckhbw m3, m18, m16 ; 56 + mova m18, m16 + pmaddubsw m16, m2, m10 ; a2 + pmaddubsw m17, m3, m10 ; b2 + paddw m4, m16 + paddw m5, m17 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + mova [tmpq+ 0], m4 + mova [tmpq+64], m5 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + mova m6, [bilin_v_perm64] + add wd, wd + lea r6d, [hq+wq] +.v_loop0: + vpermq m12, m6, [srcq+ssq*0] + vpermq m13, m6, [srcq+ssq*1] + lea r5, [srcq+ssq*2] + vpermq m14, m6, [r5 +ssq*0] + vpermq m15, m6, [r5 +ssq*1] + lea r5, [r5+ssq*2] + vpermq m16, m6, [r5 
+ssq*0] + mov r7, tmpq + punpcklbw m0, m12, m13 ; 01 + punpckhbw m12, m13 + punpcklbw m1, m13, m14 ; 12 + punpckhbw m13, m14 + punpcklbw m2, m14, m15 ; 23 + punpckhbw m14, m15 + punpcklbw m3, m15, m16 ; 34 + punpckhbw m15, m16 +.v_loop: + pmaddubsw m17, m0, m8 ; a0 + vpermq m5, m6, [r5+ssq*1] + pmaddubsw m18, m12, m8 + mova m0, m2 + pmaddubsw m2, m9 ; a1 + mova m12, m14 + pmaddubsw m14, m9 + lea r5, [r5+ssq*2] + pmaddubsw m19, m1, m8 ; b0 + pmaddubsw m20, m13, m8 + mova m1, m3 + pmaddubsw m3, m9 ; b1 + mova m13, m15 + pmaddubsw m15, m9 + paddw m17, m2 + punpcklbw m2, m16, m5 ; 67 + paddw m18, m14 + punpckhbw m14, m16, m5 + vpermq m16, m6, [r5+ssq*0] + paddw m19, m3 + pmaddubsw m3, m2, m10 ; a3 + paddw m20, m15 + pmaddubsw m15, m14, m10 + paddw m17, m3 + punpcklbw m3, m5, m16 ; 78 + pmaddubsw m4, m3, m10 ; b3 + paddw m18, m15 + punpckhbw m15, m5, m16 + pmaddubsw m5, m15, m10 + paddw m19, m4 + paddw m20, m5 + REPX {pmulhrsw x, m7}, m17, m18, m19, m20 + mova [r7+wq*0+ 0], m17 + mova [r7+wq*0+64], m18 + mova [r7+wq*1+ 0], m19 + mova [r7+wq*1+64], m20 + lea r7, [r7+wq*2] + sub hd, 2 + jg .v_loop + add srcq, 64 + add tmpq, 128 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_loop0 + vzeroupper + RET +.h: + test myd, 0xf00 + jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2 +.hv: + vpbroadcastd m8, [pd_2] + vpbroadcastd m9, [pd_32] + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m11, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m3, [base+subpel_filters+1+myq*8] + vbroadcasti128 m10, [subpel_h_shufA] + lea r6, [ssq*2+1] + mov r3d, 0x30 + sub srcq, r6 + kmovb k1, r3d + vpbroadcastq ym2, [srcq+ssq*0] + lea ss3q, [ssq*3] + vpbroadcastq m1, [srcq+ssq*1] + kaddb k2, k1, k1 + vpbroadcastq m2{k1}, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1{k2}, [srcq+ssq*0] ; _ _ 1 3 + punpcklbw m3, m3 + vpbroadcastq m2{k2}, [srcq+ssq*1] ; _ 0 2 4 + psraw m3, 8 ; sign-extend + mova m6, [spel_hv_perm4a] + kshiftrb k1, k1, 2 + 
movu m7, [spel_hv_perm4b] + pshufb m1, m10 + mova m0, m8 + vpdpbusd m0, m1, m11 + pshufb m2, m10 + mova m1, m8 + vpdpbusd m1, m2, m11 + pshufd m12, m3, q0000 + pshufd m13, m3, q1111 + pshufd m14, m3, q2222 + packssdw m0, m1 ; _ _ _ 0 1 2 3 4 + psraw m0, 2 + vpermb m1, m7, m0 ; 01 12 23 34 +.hv_w4_loop: + movq xm3, [srcq+ssq*2] + movq xm4, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpbroadcastq ym3{k1}, [srcq+ssq*0] ; 5 7 + vpbroadcastq ym4{k1}, [srcq+ssq*1] ; 6 8 + pshufb ym3, ym10 + mova ym2, ym8 + vpdpbusd ym2, ym3, ym11 + pshufb ym4, ym10 + mova ym3, ym8 + vpdpbusd ym3, ym4, ym11 + mova m4, m9 + vpdpwssd m4, m1, m12 ; a0 b0 c0 d0 + packssdw ym2, ym3 ; 5 6 7 8 + psraw ym2, 2 + vshufi32x4 m0, m2, q1032 ; _ 2 3 4 5 6 7 8 + vpermb m2, m6, m0 ; 23 34 45 56 + vpermb m1, m7, m0 ; 45 56 67 78 + vpdpwssd m4, m2, m13 ; a1 b1 c1 d1 + vpdpwssd m4, m1, m14 ; a2 b2 c2 d2 + psrad m4, 6 + vpmovdw [tmpq], m4 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastd m10, [base+subpel_filters+mxq*8+0] + vpbroadcastd m11, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [base+subpel_filters+1+myq*8] + lea r6, [ssq*2+3] + punpcklbw m0, m0 + sub srcq, r6 + psraw m0, 8 ; sign-extend + lea ss3q, [ssq*3] + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + cmp wd, 8 + jg .hv_w16 + movu xm16, [srcq+ssq*0] + vbroadcasti32x4 m19, [subpel_h_shufA] + vinserti128 ym16, [srcq+ssq*1], 1 + vbroadcasti32x4 m21, [subpel_h_shufC] + vinserti32x4 m16, [srcq+ssq*2], 2 + add srcq, ss3q + vinserti32x4 m16, [srcq+ssq*0], 3 + movu xm17, [srcq+ssq*1] + vbroadcasti32x4 m20, [subpel_h_shufB] + pshufb m3, m16, m19 ; 0 1 2 3 0123 + mova m2, m8 + pshufb m0, m16, m21 ; 0 1 2 3 89ab + vpdpbusd m2, m3, m10 + mova m3, m8 + pshufb xm1, xm17, xm19 ; 3 4 5 6 0123 + vpdpbusd m3, m0, m11 + mova xm0, xm8 + pshufb xm18, xm17, xm21 ; 3 4 5 6 89ab + vpdpbusd xm0, xm1, xm10 + mova xm1, xm8 + pshufb m16, m20 ; 0 1 2 3 
4567 + vpdpbusd xm1, xm18, xm11 + pshufb xm17, xm20 ; 3 4 5 6 4567 + vpdpbusd m2, m16, m11 + vpdpbusd m3, m16, m10 + vpdpbusd xm0, xm17, xm11 + vpdpbusd xm1, xm17, xm10 + packssdw m2, m3 + packssdw xm0, xm1 + psraw m2, 2 ; 0 1 2 3 + psraw xm0, 2 ; 4 + valignq m0, m2, 2 ; 1 2 3 4 + punpcklwd m1, m2, m0 ; 01 12 23 34 + punpckhwd m2, m0 +.hv_w8_loop: + movu xm16, [srcq+ssq*2] + vinserti128 ym16, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + vinserti32x4 m16, [srcq+ssq*0], 2 + vinserti32x4 m16, [srcq+ssq*1], 3 + pshufb m6, m16, m19 ; 5 6 7 8 0123 + mova m5, m8 + pshufb m3, m16, m21 ; 5 6 7 8 89ab + vpdpbusd m5, m6, m10 + mova m6, m8 + pshufb m16, m20 ; 5 6 7 8 4567 + vpdpbusd m6, m3, m11 + mova m3, m9 + vpdpwssd m3, m1, m12 ; a0 b0 c0 d0 + mova m4, m9 + vpdpwssd m4, m2, m12 + vpdpbusd m5, m16, m11 + vpdpbusd m6, m16, m10 + mova m16, m1 + packssdw m5, m6 + mova m6, m2 + psraw m5, 2 ; 5 6 7 8 + valignq m2, m5, m0, 6 ; 4 5 6 7 + mova m0, m5 + punpcklwd m1, m2, m5 ; 45 56 67 78 + punpckhwd m2, m5 + vpdpwssd m3, m1, m14 ; a2 b2 c2 d2 + vpdpwssd m4, m2, m14 + vshufi32x4 m16, m1, q1032 ; 23 34 45 56 + vshufi32x4 m6, m2, q1032 + vpdpwssd m3, m16, m13 ; a1 b1 c1 d1 + vpdpwssd m4, m6, m13 + psrad m3, 6 + psrad m4, 6 + packssdw m3, m4 + mova [tmpq], m3 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + vzeroupper + RET +.hv_w16: + mova m16, [spel_h_perm16] + vpbroadcastd m18, [pb_4] + add wd, wd + paddb m17, m18, m16 + lea r6d, [hq+wq*8-256] + paddb m18, m17 +.hv_w16_loop0: + movu ym19, [srcq+ssq*0] + vinserti32x8 m19, [srcq+ssq*1], 1 + lea r5, [srcq+ssq*2] + movu ym20, [r5 +ssq*0] + vinserti32x8 m20, [r5 +ssq*1], 1 + lea r5, [r5 +ssq*2] + movu ym21, [r5 +ssq*0] + mov r7, tmpq + vpermb m3, m16, m19 ; 0 1 0123 89ab + mova m2, m8 + vpermb m4, m18, m19 ; 0 1 89ab ghij + vpdpbusd m2, m3, m10 + mova m3, m8 + vpermb m5, m16, m20 ; 2 3 0123 89ab + vpdpbusd m3, m4, m11 + mova m4, m8 + vpermb m0, m18, m20 ; 2 3 89ab ghij + vpdpbusd m4, m5, m10 + mova m5, m8 + vpermb ym1, ym16, ym21 ; 4 0123 89ab 
+ vpdpbusd m5, m0, m11 + mova ym0, ym8 + vpermb ym6, ym18, ym21 ; 4 89ab ghij + vpdpbusd ym0, ym1, ym10 + mova ym1, ym8 + vpermb m19, m17, m19 ; 0 1 4567 cdef + vpdpbusd ym1, ym6, ym11 + vpermb m20, m17, m20 ; 2 3 4567 cdef + vpdpbusd m2, m19, m11 + vpdpbusd m3, m19, m10 + vpermb ym21, ym17, ym21 ; 4 4567 cdef + vpdpbusd m4, m20, m11 + vpdpbusd m5, m20, m10 + vpdpbusd ym0, ym21, ym11 + vpdpbusd ym1, ym21, ym10 + packssdw m2, m3 ; 0 1 + packssdw m4, m5 ; 2 3 + packssdw ym0, ym1 ; 4 + REPX {psraw x, 2}, m2, m4, ym0 + vshufi32x4 m3, m2, m4, q1032 ; 1 2 + vshufi32x4 m0, m4, m0, q1032 ; 3 4 + punpcklwd m1, m2, m3 ; 01 12 + punpckhwd m2, m3 + punpcklwd m3, m4, m0 ; 23 34 + punpckhwd m4, m0 +.hv_w16_loop: + movu ym19, [r5+ssq*1] + lea r5, [r5+ssq*2] + vinserti32x8 m19, [r5+ssq*0], 1 + vpermb m6, m16, m19 ; 5 6 0123 89ab + mova m5, m8 + vpermb m20, m18, m19 ; 5 6 89ab ghij + vpdpbusd m5, m6, m10 + mova m6, m8 + vpermb m19, m17, m19 ; 5 6 4567 cdef + vpdpbusd m6, m20, m11 + mova m20, m9 + vpdpwssd m20, m1, m12 ; a0 b0 + mova m21, m9 + vpdpwssd m21, m2, m12 + vpdpbusd m5, m19, m11 + vpdpbusd m6, m19, m10 + vpdpwssd m20, m3, m13 ; a1 b1 + vpdpwssd m21, m4, m13 + packssdw m5, m6 + mova m1, m3 + psraw m5, 2 ; 5 6 + mova m2, m4 + vshufi32x4 m4, m0, m5, q1032 ; 4 5 + mova m0, m5 + punpcklwd m3, m4, m0 ; 45 56 + punpckhwd m4, m0 + vpdpwssd m20, m3, m14 ; a2 b2 + vpdpwssd m21, m4, m14 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [r7+wq*0], ym20 + vextracti32x8 [r7+wq*1], m20, 1 + lea r7, [r7+wq*2] + sub hd, 2 + jg .hv_w16_loop + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 vzeroupper @@ -2353,183 +3424,38 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova [tmpq+64*1], m1 %endmacro -%if WIN64 -DECLARE_REG_TMP 6, 4 -%else -DECLARE_REG_TMP 6, 7 -%endif - -%define PREP_8TAP_FN FN prep_8tap, - +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc +PREP_8TAP_FN regular_sharp, REGULAR, 
SHARP, prep_8tap_8bpc PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN regular, REGULAR, REGULAR -cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 +cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] - movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 - jnz .v - tzcnt wd, wd - movzx wd, word [r7+wq*2+table_offset(prep,)] - add wq, r7 - lea r6, [strideq*3] -%if WIN64 - pop r7 -%endif - jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m4, [pd_2] - WIN64_SPILL_XMM 10 - cmp wd, 4 - je .h_w4 - tzcnt wd, wd - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0] - vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4] - add wq, r7 - jmp wq -.h_w4: - movzx mxd, mxb - vbroadcasti128 ym5, [subpel_h_shufA] - mov r3d, 0x4 - dec srcq - vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2] - kmovb k1, r3d - lea stride3q, [strideq*3] -.h_w4_loop: - movq xm2, [srcq+strideq*0] - movq xm3, [srcq+strideq*1] - vpbroadcastq ym2{k1}, [srcq+strideq*2] - vpbroadcastq ym3{k1}, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - pshufb ym2, ym5 - pshufb ym3, ym5 - mova ym0, ym4 - vpdpbusd ym0, ym2, ym6 - mova ym1, ym4 - vpdpbusd ym1, ym3, ym6 - packssdw ym0, ym1 - psraw ym0, 2 - mova [tmpq], ym0 - add tmpq, 32 - sub hd, 4 - jg .h_w4_loop - RET -.h_w8: - vbroadcasti128 m5, [subpel_h_shufA] - vbroadcasti128 m6, [subpel_h_shufB] - vbroadcasti128 m7, [subpel_h_shufC] - lea stride3q, [strideq*3] 
-.h_w8_loop: - movu xmm3, [srcq+strideq*0] - vinserti128 ym3, ymm3, [srcq+strideq*1], 1 - vinserti128 m3, [srcq+strideq*2], 2 - vinserti128 m3, [srcq+stride3q ], 3 - lea srcq, [srcq+strideq*4] - pshufb m1, m3, m5 - pshufb m2, m3, m6 - mova m0, m4 - vpdpbusd m0, m1, m8 - mova m1, m4 - vpdpbusd m1, m2, m8 - pshufb m3, m7 - vpdpbusd m0, m2, m9 - vpdpbusd m1, m3, m9 - packssdw m0, m1 - psraw m0, 2 - mova [tmpq], m0 - add tmpq, 64 - sub hd, 4 - jg .h_w8_loop - RET -.h_w16: - mova m5, [spel_h_perm16a] - mova m6, [spel_h_perm16b] - mova m7, [spel_h_perm16c] - lea stride3q, [strideq*3] -.h_w16_loop: - movu ym0, [srcq+strideq*0] - movu ym1, [srcq+strideq*2] - vinserti32x8 m0, [srcq+strideq*1], 1 - vinserti32x8 m1, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - PREP_8TAP_H - add tmpq, 64*2 - sub hd, 4 - jg .h_w16_loop - RET -.h_w32: - mova m5, [spel_h_perm32a] - mova m6, [spel_h_perm32b] - mova m7, [spel_h_perm32c] -.h_w32_loop: - movu m0, [srcq+strideq*0] - movu m1, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - PREP_8TAP_H - add tmpq, 64*2 - sub hd, 2 - jg .h_w32_loop - RET -.h_w64: - xor r6d, r6d - jmp .h_start -.h_w128: - mov r6, -64*1 -.h_start: - mova m5, [spel_h_perm32a] - mova m6, [spel_h_perm32b] - mova m7, [spel_h_perm32c] - sub srcq, r6 - mov r5, r6 -.h_loop: - movu m0, [srcq+r6+32*0] - movu m1, [srcq+r6+32*1] - PREP_8TAP_H - add tmpq, 64*2 - add r6, 64 - jle .h_loop - add srcq, strideq - mov r6, r5 - dec hd - jg .h_loop - RET + jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep .v: movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. shr myd, 16 ; Note that the code is 8-tap only, having - tzcnt wd, wd cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 cmove myd, mxd ; had a negligible effect on performance. - ; TODO: Would a 6-tap code path be worth it? 
- lea myq, [r7+myq*8+subpel_filters-prep_avx512icl] - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] - add wq, r7 - lea stride3q, [strideq*3] - sub srcq, stride3q + tzcnt r5d, wd + lea myq, [base+subpel_filters+myq*8] + movzx r5d, word [r7+r5*2+table_offset(prep, _8tap_v)] vpbroadcastd m7, [pw_8192] vpbroadcastw m8, [myq+0] + add r5, r7 vpbroadcastw m9, [myq+2] + lea stride3q, [strideq*3] vpbroadcastw m10, [myq+4] + sub srcq, stride3q vpbroadcastw m11, [myq+6] - jmp wq + jmp r5 .v_w4: movd xmm0, [srcq+strideq*0] vpbroadcastd ymm1, [srcq+strideq*2] @@ -2576,172 +3502,146 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vzeroupper RET .v_w8: - mov r3d, 0xf044 - kmovw k1, r3d - kshiftrw k2, k1, 8 - movq xm0, [srcq+strideq*0] - vpbroadcastq ym1, [srcq+strideq*1] - vpbroadcastq m2, [srcq+strideq*2] - vpbroadcastq m3, [srcq+stride3q ] + mova m6, [spel_v_perm8] + movq xm1, [srcq+strideq*0] + mov r6d, 0x3e + movq xm2, [srcq+strideq*1] + vpbroadcastq ym3, [srcq+strideq*2] + kmovb k1, r6d + vpbroadcastq ym4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - vpbroadcastq m4, [srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - vpbroadcastq m6, [srcq+strideq*2] - vmovdqa64 ym0{k1}, ym1 - vmovdqa64 ym1{k1}, ym2 - vmovdqa64 m2{k1}, m3 - vmovdqa64 m3{k1}, m4 - vmovdqa64 m4{k1}, m5 - vmovdqa64 m5{k1}, m6 - punpcklbw ym0, ym1 ; 01 12 __ __ - punpcklbw m2, m3 ; 23 34 23 34 - punpcklbw m4, m5 ; 45 56 45 56 - vmovdqa64 m0{k2}, m2 ; 01 12 23 34 - vmovdqa64 m2{k2}, m4 ; 23 34 45 56 + vpunpcklqdq m1{k1}, m3, [srcq+strideq*0] {1to8} + vpunpcklqdq m2{k1}, m4, [srcq+strideq*1] {1to8} + movq xm0, [srcq+strideq*2] + kshiftlb k2, k1, 2 + shufpd m1, m2, 0x30 ; 0 1 2 3 4 5 + vshufi32x4 m2, m1, m0, q0021 ; 2 3 4 5 6 _ + vpermb m1, m6, m1 ; 01 12 23 34 + vpermb m2, m6, m2 ; 23 34 45 56 .v_w8_loop: - vpbroadcastq m1, [srcq+stride3q ] + vpbroadcastq ym3, [srcq+strideq*4] + vpunpcklqdq ym0{k1}, ym3, [srcq+stride3q] {1to4} lea srcq, [srcq+strideq*4] - vpbroadcastq m3, 
[srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - pmaddubsw m14, m0, m8 - pmaddubsw m15, m2, m9 - vpblendmq m0{k1}, m6, m1 - vpblendmq m2{k1}, m1, m3 - vpbroadcastq m6, [srcq+strideq*2] - paddw m14, m15 - punpcklbw m2, m0, m2 ; 67 78 67 78 - vpblendmq m12{k1}, m3, m5 - vpblendmq m13{k1}, m5, m6 - vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 - punpcklbw m4, m12, m13 ; 89 9a 89 9a - vmovdqa64 m2{k2}, m4 ; 67 78 89 9a - pmaddubsw m12, m0, m10 - pmaddubsw m13, m2, m11 - paddw m14, m12 - paddw m14, m13 - pmulhrsw m14, m7 - mova [tmpq], m14 + vpbroadcastq m3, [srcq+strideq*2] + vpunpcklqdq m0{k2}, m3, [srcq+strideq*1] {1to8} + pmaddubsw m4, m1, m8 ; a0 b0 c0 d0 + mova m1, m2 + pmaddubsw m5, m2, m9 ; a1 b1 c1 d1 + vpermb m2, m6, m0 ; 67 78 89 9a + mova xm0, xm3 + vshufi32x4 m1, m2, q1032 ; 45 56 67 78 + pmaddubsw m3, m2, m11 ; a3 b3 c3 d3 + paddw m4, m5 + pmaddubsw m5, m1, m10 ; a2 b2 c2 d2 + paddw m4, m3 + paddw m4, m5 + pmulhrsw m4, m7 + mova [tmpq], m4 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: - mov r3d, 0xf0 - kmovb k1, r3d - vbroadcasti128 m0, [srcq+strideq*0] - vbroadcasti128 m1, [srcq+strideq*1] - vbroadcasti128 m2, [srcq+strideq*2] - vbroadcasti128 m3, [srcq+stride3q ] + mova m12, [spel_v_perm16b] + vbroadcasti32x4 m1, [srcq+strideq*0] + mov r6d, 0x0f + vbroadcasti32x4 ym4, [srcq+strideq*1] + vbroadcasti32x4 m2, [srcq+strideq*2] + kmovb k1, r6d + vbroadcasti32x4 ym5, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - vbroadcasti128 m4, [srcq+strideq*0] - vbroadcasti128 m5, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*2] - vmovdqa64 m0{k1}, m1 - vmovdqa64 m1{k1}, m2 - vmovdqa64 m2{k1}, m3 - vmovdqa64 m3{k1}, m4 - vmovdqa64 m4{k1}, m5 - vmovdqa64 m5{k1}, m6 - shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b - shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b - shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- - shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- - punpckhbw m2, m0, m1 ; 23a 23b 34a 34b - punpcklbw m0, m1 ; 01a 01b 12a 12b - punpcklbw m4, m5 ; 45a 45b 56a 56b 
+ vbroadcasti32x4 m3, [srcq+strideq*0] + vbroadcasti32x4 ym6, [srcq+strideq*1] + vbroadcasti32x4 m0, [srcq+strideq*2] + vshufpd m1{k1}, m4, m2, 0xcc + vshufpd m2{k1}, m5, m3, 0xcc + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m1, m12, m1 ; 01 12 + vpermb m2, m12, m2 ; 23 34 + vpermb m3, m12, m3 ; 45 56 .v_w16_loop: - vbroadcasti128 m3, [srcq+stride3q ] + pmaddubsw m4, m1, m8 ; a0 b0 + mova m1, m3 + pmaddubsw m13, m2, m9 ; a1 b1 + vbroadcasti32x4 ym6, [srcq+stride3q ] + pmaddubsw m5, m2, m8 ; c0 d0 lea srcq, [srcq+strideq*4] - vbroadcasti128 m5, [srcq+strideq*0] - vpblendmq m1{k1}, m6, m3 - vmovdqa64 m3{k1}, m5 - pmaddubsw m12, m0, m8 - pmaddubsw m13, m2, m8 - pmaddubsw m14, m2, m9 - pmaddubsw m15, m4, m9 - pmaddubsw m0, m4, m10 - vbroadcasti128 m2, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*2] - paddw m12, m14 - paddw m13, m15 - paddw m12, m0 - vmovdqa64 m5{k1}, m2 - vmovdqa64 m2{k1}, m6 - mova m0, m4 - shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b - shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab - punpcklbw m2, m1, m3 ; 67a 67b 78a 78b - punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab - pmaddubsw m14, m2, m10 - pmaddubsw m15, m2, m11 - paddw m13, m14 - paddw m12, m15 - pmaddubsw m14, m4, m11 - paddw m13, m14 - pmulhrsw m12, m7 - pmulhrsw m13, m7 - mova [tmpq+ 0], m12 - mova [tmpq+64], m13 + pmaddubsw m14, m3, m9 ; c1 d1 + vbroadcasti32x4 m3, [srcq+strideq*0] + vshufpd m0{k1}, m6, m3, 0xcc + vbroadcasti32x4 ym6, [srcq+strideq*1] + vpermb m2, m12, m0 ; 67 78 + vbroadcasti32x4 m0, [srcq+strideq*2] + vshufpd m3{k1}, m6, m0, 0xcc + paddw m4, m13 + pmaddubsw m13, m1, m10 ; a2 b2 + vpermb m3, m12, m3 ; 89 9a + paddw m5, m14 + pmaddubsw m14, m2, m10 ; c2 d2 + pmaddubsw m15, m2, m11 ; a3 b3 + pmaddubsw m6, m3, m11 ; c3 d3 + paddw m4, m13 + paddw m5, m14 + paddw m4, m15 + paddw m5, m6 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + mova [tmpq+ 0], m4 + mova [tmpq+64], m5 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: - mova m18, [bilin_v_perm64] - movu ym0, [srcq+strideq*0] - movu ym1, 
[srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym2, [srcq+strideq*0] - movu ym3, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym4, [srcq+strideq*0] - movu ym5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym6, [srcq+strideq*0] - vpermq m0, m18, m0 - vpermq m1, m18, m1 - vpermq m2, m18, m2 - vpermq m3, m18, m3 - vpermq m4, m18, m4 - vpermq m5, m18, m5 - vpermq m6, m18, m6 - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - punpcklbw m3, m4 - punpcklbw m4, m5 - punpcklbw m5, m6 + movshdup m21, [bilin_v_perm64] + movu ym16, [srcq+strideq*0] + movu ym17, [srcq+strideq*1] + movu ym18, [srcq+strideq*2] + add srcq, stride3q + movu ym19, [srcq+strideq*0] + vpermt2q m16, m21, m19 ; 0 3 + movu ym20, [srcq+strideq*1] + vpermt2q m17, m21, m20 ; 1 4 + movu ym20, [srcq+strideq*2] + add srcq, stride3q + vpermt2q m18, m21, m20 ; 2 5 + movu ym20, [srcq+strideq*0] + vpermt2q m19, m21, m20 ; 3 6 + punpcklbw m0, m16, m17 ; 01 + punpcklbw m1, m17, m18 ; 12 + punpcklbw m2, m18, m19 ; 23 + punpckhbw m3, m16, m17 ; 34 + punpckhbw m4, m17, m18 ; 45 + punpckhbw m5, m18, m19 ; 56 .v_w32_loop: - movu ym12, [srcq+strideq*1] + movu ym16, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movu ym13, [srcq+strideq*0] + movu ym17, [srcq+strideq*0] pmaddubsw m14, m0, m8 - pmaddubsw m16, m2, m9 - pmaddubsw m15, m1, m8 - pmaddubsw m17, m3, m9 mova m0, m2 + pmaddubsw m15, m1, m8 mova m1, m3 - vpermq m12, m18, m12 - vpermq m13, m18, m13 - paddw m14, m16 - paddw m15, m17 - pmaddubsw m16, m4, m10 - pmaddubsw m17, m5, m10 - punpcklbw m6, m12 - punpcklbw m12, m13 + pmaddubsw m2, m9 + vpermt2q m16, m21, m17 ; 7 8 + pmaddubsw m3, m9 + pmaddubsw m12, m4, m10 + pmaddubsw m13, m5, m10 + shufpd m19, m16, 0x55 ; 6 7 + paddw m14, m2 mova m2, m4 + punpcklbw m4, m19, m16 ; 67 + paddw m15, m3 mova m3, m5 - paddw m14, m16 - paddw m15, m17 - pmaddubsw m16, m6, m11 - pmaddubsw m17, m12, m11 - mova m4, m6 - mova m5, m12 - paddw m14, m16 - paddw m15, m17 + punpckhbw m5, m19, m16 ; 78 + paddw m14, 
m12 + paddw m15, m13 + pmaddubsw m12, m4, m11 + pmaddubsw m13, m5, m11 + mova m19, m16 + paddw m14, m12 + paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 - mova m6, m13 mova [tmpq+ 0], m14 mova [tmpq+64], m15 add tmpq, 64*2 @@ -2750,154 +3650,241 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vzeroupper RET .v_w64: - mov wd, 64 - jmp .v_start .v_w128: - mov wd, 128 -.v_start: - WIN64_SPILL_XMM 27 - mova m26, [bilin_v_perm64] - lea r6d, [hq+wq*2] - mov r5, srcq - mov r7, tmpq + WIN64_SPILL_XMM 24 + mova m23, [bilin_v_perm64] + add wd, wd + lea r6d, [hq+wq] .v_loop0: - vpermq m0, m26, [srcq+strideq*0] - vpermq m1, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m2, m26, [srcq+strideq*0] - vpermq m3, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m4, m26, [srcq+strideq*0] - vpermq m5, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m6, m26, [srcq+strideq*0] - punpckhbw m12, m0, m1 - punpcklbw m0, m1 - punpckhbw m13, m1, m2 - punpcklbw m1, m2 - punpckhbw m14, m2, m3 - punpcklbw m2, m3 - punpckhbw m15, m3, m4 - punpcklbw m3, m4 - punpckhbw m16, m4, m5 - punpcklbw m4, m5 - punpckhbw m17, m5, m6 - punpcklbw m5, m6 + vpermq m12, m23, [srcq+strideq*0] + vpermq m13, m23, [srcq+strideq*1] + lea r5, [srcq+strideq*2] + vpermq m14, m23, [r5 +strideq*0] + vpermq m15, m23, [r5 +strideq*1] + lea r5, [r5+strideq*2] + vpermq m16, m23, [r5 +strideq*0] + vpermq m17, m23, [r5 +strideq*1] + lea r5, [r5+strideq*2] + vpermq m18, m23, [r5 +strideq*0] + mov r7, tmpq + punpcklbw m0, m12, m13 ; 01 + punpckhbw m12, m13 + punpcklbw m1, m13, m14 ; 12 + punpckhbw m13, m14 + punpcklbw m2, m14, m15 ; 23 + punpckhbw m14, m15 + punpcklbw m3, m15, m16 ; 34 + punpckhbw m15, m16 + punpcklbw m4, m16, m17 ; 45 + punpckhbw m16, m17 + punpcklbw m5, m17, m18 ; 56 + punpckhbw m17, m18 .v_loop: - vpermq m18, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m19, m26, [srcq+strideq*0] - pmaddubsw m20, m0, m8 - pmaddubsw m21, m12, m8 - 
pmaddubsw m22, m1, m8 - pmaddubsw m23, m13, m8 + pmaddubsw m19, m0, m8 ; a0 + vpermq m6, m23, [r5+strideq*1] + pmaddubsw m20, m12, m8 mova m0, m2 + pmaddubsw m2, m9 ; a1 mova m12, m14 + pmaddubsw m14, m9 + lea r5, [r5+strideq*2] + pmaddubsw m21, m1, m8 ; b0 + pmaddubsw m22, m13, m8 mova m1, m3 + pmaddubsw m3, m9 ; b1 mova m13, m15 - pmaddubsw m2, m9 - pmaddubsw m14, m9 - pmaddubsw m3, m9 pmaddubsw m15, m9 - punpckhbw m24, m6, m18 - punpcklbw m6, m18 - paddw m20, m2 - paddw m21, m14 - paddw m22, m3 - paddw m23, m15 + paddw m19, m2 mova m2, m4 + pmaddubsw m4, m10 ; a2 + paddw m20, m14 mova m14, m16 + pmaddubsw m16, m10 + paddw m21, m3 mova m3, m5 + pmaddubsw m5, m10 ; b2 + paddw m22, m15 mova m15, m17 - pmaddubsw m4, m10 - pmaddubsw m16, m10 - pmaddubsw m5, m10 pmaddubsw m17, m10 - punpckhbw m25, m18, m19 - punpcklbw m18, m19 - paddw m20, m4 - paddw m21, m16 - paddw m22, m5 - paddw m23, m17 - mova m4, m6 - mova m16, m24 - mova m5, m18 - mova m17, m25 - pmaddubsw m6, m11 - pmaddubsw m24, m11 - pmaddubsw m18, m11 - pmaddubsw m25, m11 - paddw m20, m6 - paddw m21, m24 - paddw m22, m18 - paddw m23, m25 - pmulhrsw m20, m7 - pmulhrsw m21, m7 - pmulhrsw m22, m7 - pmulhrsw m23, m7 - mova m6, m19 - mova [tmpq+wq*0+ 0], m20 - mova [tmpq+wq*0+64], m21 - mova [tmpq+wq*2+ 0], m22 - mova [tmpq+wq*2+64], m23 - lea tmpq, [tmpq+wq*4] + paddw m19, m4 + punpcklbw m4, m18, m6 ; 67 + paddw m20, m16 + punpckhbw m16, m18, m6 + vpermq m18, m23, [r5+strideq*0] + paddw m21, m5 + pmaddubsw m5, m4, m11 ; a3 + paddw m22, m17 + pmaddubsw m17, m16, m11 + paddw m19, m5 + punpcklbw m5, m6, m18 ; 78 + paddw m20, m17 + punpckhbw m17, m6, m18 + pmaddubsw m6, m5, m11 ; b3 + paddw m21, m6 + pmaddubsw m6, m17, m11 + paddw m22, m6 + REPX {pmulhrsw x, m7}, m19, m20, m21, m22 + mova [r7+wq*0+ 0], m19 + mova [r7+wq*0+64], m20 + mova [r7+wq*1+ 0], m21 + mova [r7+wq*1+64], m22 + lea r7, [r7+wq*2] sub hd, 2 jg .v_loop - add r5, 64 - add r7, 128 + add srcq, 64 + add tmpq, 128 movzx hd, r6b - mov srcq, r5 - mov 
tmpq, r7 sub r6d, 1<<8 jg .v_loop0 RET -.hv: - WIN64_SPILL_XMM 16 +.h: + RESET_STACK_STATE + test myd, 0xf00 + jnz .hv +.h2: + vpbroadcastd m4, [pd_2] cmp wd, 4 - je .hv_w4 + je .h_w4 + tzcnt wd, wd shr mxd, 16 sub srcq, 3 - vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0] - vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4] - movzx mxd, myb - shr myd, 16 - cmp hd, 4 - cmove myd, mxd - tzcnt wd, wd - vpbroadcastd m8, [pd_2] - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] - vpbroadcastd m9, [pd_32] + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [base+subpel_filters+mxq*8+0] + vpbroadcastd m9, [base+subpel_filters+mxq*8+4] add wq, r7 - vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] - lea stride3q, [strideq*3] - sub srcq, stride3q - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - pshufd m12, m0, q0000 - pshufd m13, m0, q1111 - pshufd m14, m0, q2222 - pshufd m15, m0, q3333 jmp wq -.hv_w4: +.h_w4: + movzx mxd, mxb + vbroadcasti128 ym5, [subpel_h_shufA] + mov r3d, 0x4 + dec srcq + vpbroadcastd ym6, [base+subpel_filters+mxq*8+2] + kmovb k1, r3d + lea stride3q, [strideq*3] +.h_w4_loop: + movq xm2, [srcq+strideq*0] + movq xm3, [srcq+strideq*1] + vpbroadcastq ym2{k1}, [srcq+strideq*2] + vpbroadcastq ym3{k1}, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pshufb ym2, ym5 + pshufb ym3, ym5 + mova ym0, ym4 + vpdpbusd ym0, ym2, ym6 + mova ym1, ym4 + vpdpbusd ym1, ym3, ym6 + packssdw ym0, ym1 + psraw ym0, 2 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + vbroadcasti128 m5, [subpel_h_shufA] + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + lea stride3q, [strideq*3] +.h_w8_loop: + movu xmm3, [srcq+strideq*0] + vinserti128 ym3, ymm3, [srcq+strideq*1], 1 + vinserti128 m3, [srcq+strideq*2], 2 + vinserti128 m3, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pshufb m1, m3, m5 + pshufb m2, m3, m6 + mova m0, m4 + vpdpbusd m0, m1, m8 + mova m1, m4 + vpdpbusd m1, 
m2, m8 + pshufb m3, m7 + vpdpbusd m0, m2, m9 + vpdpbusd m1, m3, m9 + packssdw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + mova m5, [spel_h_perm16] + vpbroadcastd m7, [pb_4] + lea stride3q, [strideq*3] + paddb m6, m7, m5 + paddb m7, m6 +.h_w16_loop: + movu ym0, [srcq+strideq*0] + movu ym1, [srcq+strideq*2] + vinserti32x8 m0, [srcq+strideq*1], 1 + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + PREP_8TAP_H + add tmpq, 64*2 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: + mova m5, [spel_h_perm32] + vpbroadcastd m7, [pb_4] + paddb m6, m7, m5 + paddb m7, m6 +.h_w32_loop: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + add tmpq, 64*2 + sub hd, 2 + jg .h_w32_loop + RET +.h_w64: + xor r6d, r6d + jmp .h_start +.h_w128: + mov r6, -64*1 +.h_start: + mova m5, [spel_h_perm32] + vpbroadcastd m7, [pb_4] + sub srcq, r6 + paddb m6, m7, m5 + paddb m7, m6 +.h_loop0: + mov r5, r6 +.h_loop: + movu m0, [srcq+r5+32*0] + movu m1, [srcq+r5+32*1] + PREP_8TAP_H + add tmpq, 64*2 + add r5, 64 + jle .h_loop + add srcq, strideq + dec hd + jg .h_loop0 + RET +.hv: + RESET_STACK_STATE + vpbroadcastd m8, [pd_2] + vpbroadcastd m9, [pd_32] + cmp wd, 4 + jg .hv_w8 movzx mxd, mxb dec srcq - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + vpbroadcastd m11, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd - vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + vpbroadcastq m0, [base+subpel_filters+myq*8] lea stride3q, [strideq*3] sub srcq, stride3q mov r3d, 0x04 kmovb k1, r3d kshiftlb k2, k1, 2 kshiftlb k3, k1, 4 - vpbroadcastd m10, [pd_2] - vbroadcasti128 m16, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufA] punpcklbw m0, m0 psraw m0, 8 ; sign-extend - vpbroadcastd m11, [pd_32] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 @@ -2910,263 +3897,265 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
h, mx, my, stride3 vpbroadcastq m3{k2}, [srcq+strideq*0] vpbroadcastq m2{k3}, [srcq+strideq*1] vpbroadcastq m3{k3}, [srcq+strideq*2] - mova m17, [spel_hv_perm4a] - movu m18, [spel_hv_perm4b] - mova m0, m10 - mova m1, m10 - pshufb m2, m16 - pshufb m3, m16 - vpdpbusd m0, m2, m8 - vpdpbusd m1, m3, m8 + mova m6, [spel_hv_perm4a] + movu m7, [spel_hv_perm4b] + mova m0, m8 + mova m1, m8 + pshufb m2, m10 + pshufb m3, m10 + vpdpbusd m0, m2, m11 + vpdpbusd m1, m3, m11 packssdw m0, m1 ; _ 0 1 2 3 4 5 6 psraw m0, 2 - vpermb m1, m17, m0 ; 01 12 23 34 - vpermb m2, m18, m0 ; 23 34 45 56 + vpermb m1, m6, m0 ; 01 12 23 34 + vpermb m2, m7, m0 ; 23 34 45 56 .hv_w4_loop: movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] movq xm4, [srcq+strideq*0] vpbroadcastq ym3{k1}, [srcq+strideq*1] vpbroadcastq ym4{k1}, [srcq+strideq*2] - mova ym5, ym10 - mova ym6, ym10 - pshufb ym3, ym16 - pshufb ym4, ym16 - vpdpbusd ym5, ym3, ym8 - vpdpbusd ym6, ym4, ym8 - mova m7, m11 - packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ - psraw ym5, 2 - valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a - vpdpwssd m7, m1, m12 - vpdpwssd m7, m2, m13 - vpermb m1, m17, m0 ; 45 56 67 78 - vpermb m2, m18, m0 ; 67 78 89 9a - vpdpwssd m7, m1, m14 - vpdpwssd m7, m2, m15 - psrad m7, 6 - vpmovdw [tmpq], m7 + mova m5, m9 + pshufb ym3, ym10 + vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 + mova ym1, ym8 + pshufb ym4, ym10 + vpdpbusd ym1, ym3, ym11 + mova ym3, ym8 + vpdpbusd ym3, ym4, ym11 + vpdpwssd m5, m2, m13 ; a1 b1 c1 d1 + packssdw ym1, ym3 ; 7 8 9 a + psraw ym1, 2 + vshufi32x4 m0, m1, q1032 ; _ 4 5 6 7 8 9 a + vpermb m1, m6, m0 ; 45 56 67 78 + vpermb m2, m7, m0 ; 67 78 89 9a + vpdpwssd m5, m1, m14 ; a2 b2 c2 d2 + vpdpwssd m5, m2, m15 ; a3 b3 c3 d3 + psrad m5, 6 + vpmovdw [tmpq], m5 add tmpq, 32 sub hd, 4 jg .hv_w4_loop - vzeroupper RET .hv_w8: - WIN64_SPILL_XMM 24 - vbroadcasti128 m16, [subpel_h_shufA] - vbroadcasti128 m17, [subpel_h_shufB] - vbroadcasti128 m18, [subpel_h_shufC] - vinserti128 ym0, [srcq+strideq*0], 1 - vinserti128 m0, 
[srcq+strideq*1], 2 - vinserti128 m0, [srcq+strideq*2], 3 - movu xm1, [srcq+stride3q ] + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [base+subpel_filters+mxq*8+0] + vpbroadcastd m11, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [base+subpel_filters+myq*8] + lea stride3q, [strideq*3] + sub srcq, stride3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + cmp wd, 8 + jg .hv_w16 + vbroadcasti32x4 m17, [srcq+stride3q ] + vinserti32x4 m16, m17, [srcq+strideq*0], 0 + vbroadcasti32x4 m19, [subpel_h_shufA] + vinserti32x4 m16, [srcq+strideq*1], 1 + vbroadcasti32x4 m21, [subpel_h_shufC] + vinserti32x4 m16, [srcq+strideq*2], 2 lea srcq, [srcq+strideq*4] - vinserti128 ym1, [srcq+strideq*0], 1 - vinserti128 m1, [srcq+strideq*1], 2 - vinserti128 m1, [srcq+strideq*2], 3 + vinserti128 ym17, [srcq+strideq*0], 1 + vbroadcasti32x4 m20, [subpel_h_shufB] + vinserti32x4 m17, [srcq+strideq*1], 2 + vinserti32x4 m17, [srcq+strideq*2], 3 + pshufb m3, m16, m19 ; 0 1 2 3 0123 mova m2, m8 - mova m4, m8 + pshufb m0, m16, m21 ; 0 1 2 3 89ab + vpdpbusd m2, m3, m10 mova m3, m8 - mova m5, m8 - pshufb m20, m0, m16 - pshufb m21, m0, m17 - pshufb m22, m0, m18 - pshufb m23, m1, m16 - pshufb m6, m1, m17 - pshufb m7, m1, m18 - vpdpbusd m2, m20, m10 - vpdpbusd m4, m21, m10 - vpdpbusd m2, m21, m11 - vpdpbusd m4, m22, m11 - vpdpbusd m3, m23, m10 - vpdpbusd m5, m6, m10 - vpdpbusd m3, m6, m11 - vpdpbusd m5, m7, m11 - packssdw m2, m4 - packssdw m3, m5 - psraw m2, 2 ; _ 0 1 2 - psraw m3, 2 ; 3 4 5 6 - valignq m0, m3, m2, 2 ; 0 1 2 3 - valignq m1, m3, m2, 4 ; 1 2 3 4 - valignq m2, m3, m2, 6 ; 2 3 4 5 - punpcklwd m4, m0, m1 ; 01a 12a 23a 34a - punpckhwd m5, m0, m1 ; 01b 12b 23b 34b - punpcklwd m6, m2, m3 ; 23a 34a 45a 56a - punpckhwd m7, m2, m3 ; 23b 34b 45b 56b + pshufb m1, m17, m19 ; 3 4 5 6 0123 + vpdpbusd m3, m0, m11 + mova m0, m8 + pshufb m4, m17, m21 
; 3 4 5 6 89ab + vpdpbusd m0, m1, m10 + mova m1, m8 + pshufb m16, m20 ; 0 1 2 3 4567 + vpdpbusd m1, m4, m11 + pshufb m17, m20 ; 3 4 5 6 4567 + vpdpbusd m2, m16, m11 + vpdpbusd m3, m16, m10 + vpdpbusd m0, m17, m11 + vpdpbusd m1, m17, m10 + packssdw m2, m3 + packssdw m0, m1 + psraw m2, 2 ; 0 1 2 3 + psraw m0, 2 ; 3 4 5 6 + vshufi32x4 m4, m2, m0, q2132 ; 2 3 4 5 + vshufi32x4 m5, m2, m0, q1021 ; 1 2 3 4 + punpcklwd m3, m4, m0 ; 23 34 45 56 + punpckhwd m4, m0 + punpcklwd m1, m2, m5 ; 01 12 23 34 + punpckhwd m2, m5 .hv_w8_loop: - movu xm19, [srcq+stride3q ] + movu xm18, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - vinserti128 ym19, [srcq+strideq*0], 1 - vinserti128 m19, [srcq+strideq*1], 2 - vinserti128 m19, [srcq+strideq*2], 3 - mova m20, m9 - mova m21, m9 - mova m22, m8 - mova m23, m8 - vpdpwssd m20, m4, m12 - vpdpwssd m21, m5, m12 - vpdpwssd m20, m6, m13 - vpdpwssd m21, m7, m13 - pshufb m0, m19, m16 - pshufb m1, m19, m17 - pshufb m2, m19, m18 - vpdpbusd m22, m0, m10 - vpdpbusd m23, m1, m10 - vpdpbusd m22, m1, m11 - vpdpbusd m23, m2, m11 - packssdw m22, m23 - psraw m22, 2 ; 7 8 9 A - valignq m0, m22, m3, 2 ; 4 5 6 7 - valignq m1, m22, m3, 4 ; 5 6 7 8 - valignq m2, m22, m3, 6 ; 6 7 8 9 - mova m3, m22 - punpcklwd m4, m0, m1 ; 45a 56a 67a 78a - punpckhwd m5, m0, m1 ; 45b 56b 67b 78b - punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa - punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab - vpdpwssd m20, m4, m14 - vpdpwssd m21, m5, m14 - vpdpwssd m20, m6, m15 - vpdpwssd m21, m7, m15 - psrad m20, 6 - psrad m21, 6 - packssdw m20, m21 - mova [tmpq], m20 + vinserti128 ym18, [srcq+strideq*0], 1 + vinserti32x4 m18, [srcq+strideq*1], 2 + vinserti32x4 m18, [srcq+strideq*2], 3 + pshufb m17, m18, m19 ; 7 8 9 a 0123 + mova m16, m8 + pshufb m5, m18, m21 ; 7 8 9 a 89ab + vpdpbusd m16, m17, m10 + mova m17, m8 + pshufb m18, m20 ; 7 8 9 a 4567 + vpdpbusd m17, m5, m11 + mova m5, m9 + vpdpwssd m5, m3, m13 ; a1 b1 c1 d1 + mova m6, m9 + vpdpwssd m6, m4, m13 + vpdpbusd m16, m18, m11 + vpdpbusd m17, m18, m10 + 
vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 + mova m1, m3 + vpdpwssd m6, m2, m12 + mova m2, m4 + packssdw m16, m17 + psraw m16, 2 ; 7 8 9 a + valignq m4, m16, m0, 6 ; 6 7 8 9 + mova m0, m16 + punpcklwd m3, m4, m16 ; 67 78 89 9a + punpckhwd m4, m16 + vpdpwssd m5, m3, m15 ; a3 b3 c3 d3 + vpdpwssd m6, m4, m15 + vshufi32x4 m1, m3, q1032 ; 45 56 67 78 + vshufi32x4 m2, m4, q1032 + vpdpwssd m5, m1, m14 ; a2 b2 c2 d2 + vpdpwssd m6, m2, m14 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + mova [tmpq], m5 add tmpq, 64 sub hd, 4 jg .hv_w8_loop + vzeroupper RET .hv_w16: - mov wd, 16*2 - jmp .hv_start -.hv_w32: - mov wd, 32*2 - jmp .hv_start -.hv_w64: - mov wd, 64*2 - jmp .hv_start -.hv_w128: - mov wd, 128*2 -.hv_start: - WIN64_SPILL_XMM 31 - mova m16, [spel_h_perm16a] - mova m17, [spel_h_perm16b] - mova m18, [spel_h_perm16c] + WIN64_SPILL_XMM 23 + mova m16, [spel_h_perm16] + vpbroadcastd m18, [pb_4] + add wd, wd + paddb m17, m18, m16 lea r6d, [hq+wq*8-256] - mov r5, srcq + paddb m18, m17 +.hv_w16_loop0: + movu ym19, [srcq+strideq*0] + vinserti32x8 m19, [srcq+strideq*1], 1 + lea r5, [srcq+strideq*2] + movu ym20, [r5 +strideq*0] + vinserti32x8 m20, [r5 +strideq*1], 1 + lea r5, [r5 +strideq*2] + movu ym21, [r5 +strideq*0] + vinserti32x8 m21, [r5 +strideq*1], 1 + lea r5, [r5 +strideq*2] + movu ym22, [r5 +strideq*0] mov r7, tmpq -.hv_loop0: - movu ym0, [srcq+strideq*0] - vinserti32x8 m0, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym1, [srcq+strideq*0] - vinserti32x8 m1, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym2, [srcq+strideq*0] - vinserti32x8 m2, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym3, [srcq+strideq*0] + vpermb m3, m16, m19 ; 0 1 0123 89ab + mova m2, m8 + vpermb m4, m18, m19 ; 0 1 89ab ghij + vpdpbusd m2, m3, m10 + mova m3, m8 + vpermb m5, m16, m20 ; 2 3 0123 89ab + vpdpbusd m3, m4, m11 mova m4, m8 + vpermb m6, m18, m20 ; 2 3 89ab ghij + vpdpbusd m4, m5, m10 mova m5, m8 + vpermb m7, m16, m21 ; 4 5 0123 89ab + vpdpbusd m5, m6, m11 mova m6, 
m8 + vpermb m0, m18, m21 ; 4 5 89ab ghij + vpdpbusd m6, m7, m10 mova m7, m8 - vpermb m19, m16, m0 - vpermb m20, m17, m0 - vpermb m21, m18, m0 - vpermb m22, m16, m1 - vpermb m23, m17, m1 - vpermb m24, m18, m1 - vpermb m25, m16, m2 - vpermb m26, m17, m2 - vpermb m27, m18, m2 - vpermb ym28, ym16, ym3 - vpermb ym29, ym17, ym3 - vpermb ym30, ym18, ym3 - mova m0, m8 - mova m1, m8 - mova ym2, ym8 - mova ym3, ym8 - vpdpbusd m4, m19, m10 - vpdpbusd m5, m20, m10 - vpdpbusd m6, m22, m10 - vpdpbusd m7, m23, m10 - vpdpbusd m0, m25, m10 - vpdpbusd m1, m26, m10 - vpdpbusd ym2, ym28, ym10 - vpdpbusd ym3, ym29, ym10 + vpermb ym1, ym16, ym22 ; 6 0123 89ab + vpdpbusd m7, m0, m11 + mova ym0, ym8 + vpermb m19, m17, m19 ; 0 1 4567 cdef + vpdpbusd ym0, ym1, ym10 + vpermb ym1, ym18, ym22 ; 6 89ab ghij + vpdpbusd m2, m19, m11 + vpdpbusd m3, m19, m10 + mova ym19, ym8 + vpermb m20, m17, m20 ; 2 3 4567 cdef + vpdpbusd ym19, ym1, ym11 + vpermb m21, m17, m21 ; 4 5 4567 cdef vpdpbusd m4, m20, m11 - vpdpbusd m5, m21, m11 - vpdpbusd m6, m23, m11 - vpdpbusd m7, m24, m11 - vpdpbusd m0, m26, m11 - vpdpbusd m1, m27, m11 - vpdpbusd ym2, ym29, ym11 - vpdpbusd ym3, ym30, ym11 - packssdw m4, m5 - packssdw m6, m7 - packssdw m0, m1 - packssdw ym2, ym3 - psraw m4, 2 ; 0a 0b 1a 1b - psraw m6, 2 ; 2a 2b 3a 3b - psraw m0, 2 ; 4a 4b 5a 5b - psraw ym2, 2 ; 6a 6b __ __ - vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b - vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b - vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b - punpcklwd m2, m4, m5 ; 01a 01c 12a 12c - punpckhwd m3, m4, m5 ; 01b 01d 12b 12d - punpcklwd m4, m6, m7 ; 23a 23c 34a 34c - punpckhwd m5, m6, m7 ; 23b 23d 34b 34d - punpcklwd m6, m0, m1 ; 45a 45c 56a 56c - punpckhwd m7, m0, m1 ; 45b 45d 56b 56d -.hv_loop: - movu ym19, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti32x8 m19, [srcq+strideq*0], 1 + vpdpbusd m5, m20, m10 + vpermb ym22, ym17, ym22 ; 6 4567 cdef + vpdpbusd m6, m21, m11 + vpdpbusd m7, m21, m10 + packssdw m2, m3 ; 0 1 + vpdpbusd ym0, ym22, ym11 + 
packssdw m4, m5 ; 2 3 + vpdpbusd ym19, ym22, ym10 + packssdw m6, m7 ; 4 5 + packssdw ym0, ym19 ; 6 + REPX {psraw x, 2}, m2, m4, m6, ym0 + vshufi32x4 m3, m2, m4, q1032 ; 1 2 + vshufi32x4 m5, m4, m6, q1032 ; 3 4 + vshufi32x4 m0, m6, m0, q1032 ; 5 6 + punpcklwd m1, m2, m3 ; 01 12 + punpckhwd m2, m3 + punpcklwd m3, m4, m5 ; 23 34 + punpckhwd m4, m5 + punpcklwd m5, m6, m0 ; 45 56 + punpckhwd m6, m0 +.hv_w16_loop: + movu ym19, [r5+strideq*1] + lea r5, [r5+strideq*2] + vinserti32x8 m19, [r5+strideq*0], 1 mova m20, m9 + vpdpwssd m20, m1, m12 ; a0 + vpermb m1, m16, m19 mova m21, m9 + vpdpwssd m21, m2, m12 ; b0 + vpermb m2, m17, m19 mova m22, m8 - mova m23, m8 - vpdpwssd m20, m2, m12 - vpdpwssd m21, m3, m12 - vpdpwssd m20, m4, m13 - vpdpwssd m21, m5, m13 - vpermb m24, m16, m19 - vpermb m25, m17, m19 - vpermb m26, m18, m19 - vpdpbusd m22, m24, m10 - vpdpbusd m23, m25, m10 - vpdpbusd m22, m25, m11 - vpdpbusd m23, m26, m11 - packssdw m22, m23 - psraw m22, 2 ; 7a 7b 8a 8b - vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b + vpdpbusd m22, m1, m10 + mova m1, m8 + vpermb m19, m18, m19 + vpdpbusd m1, m2, m10 + vpdpwssd m20, m3, m13 ; a1 + vpdpwssd m21, m4, m13 ; b1 + vpdpbusd m22, m2, m11 mova m2, m4 - mova m3, m5 - mova m1, m22 + vpdpbusd m1, m19, m11 mova m4, m6 - mova m5, m7 - punpcklwd m6, m0, m1 ; 67a 67c 78a 78c - punpckhwd m7, m0, m1 ; 67b 67d 78b 78d - vpdpwssd m20, m4, m14 - vpdpwssd m21, m5, m14 - vpdpwssd m20, m6, m15 - vpdpwssd m21, m7, m15 + vpdpwssd m20, m5, m14 ; a2 + vpdpwssd m21, m6, m14 ; b2 + packssdw m22, m1 + mova m1, m3 + psraw m22, 2 ; 7 8 + mova m3, m5 + vshufi32x4 m6, m0, m22, q1032 ; 6 7 + mova m0, m22 + punpcklwd m5, m6, m0 ; 67 78 + punpckhwd m6, m0 + vpdpwssd m20, m5, m15 ; a3 + vpdpwssd m21, m6, m15 ; b3 psrad m20, 6 psrad m21, 6 packssdw m20, m21 - mova [tmpq+wq*0], ym20 - vextracti32x8 [tmpq+wq*1], m20, 1 - lea tmpq, [tmpq+wq*2] + mova [r7+wq*0], ym20 + vextracti32x8 [r7+wq*1], m20, 1 + lea r7, [r7+wq*2] sub hd, 2 - jg .hv_loop - add r5, 16 - add r7, 32 + 
jg .hv_w16_loop + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 - jg .hv_loop0 + jg .hv_w16_loop0 RET cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts |