Diffstat (limited to 'third_party/dav1d/src/x86/mc_avx512.asm'):

 third_party/dav1d/src/x86/mc_avx512.asm | 2953 ++++++++++++++++++++----------
 1 file changed, 1971 insertions(+), 982 deletions(-)
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
index f9043f1ad3..50e670ec25 100644
--- a/third_party/dav1d/src/x86/mc_avx512.asm
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -89,55 +89,47 @@ wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
-bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
- db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
- db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
- db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
-bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
- db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
- db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
- db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
-bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
- db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
- db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
- db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
-bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
- db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
- db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
- db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
-bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
- db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
- db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
- db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
-bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
-spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+bilin_h_perm16: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+ db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
+ db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
+ db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
+bilin_h_perm32: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+ db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
+ db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
+ db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
+bilin_v_perm8: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
+ db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
+ db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39
+ db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71
+bilin_v_perm16: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
+ db 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
+ db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71
+ db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79
+bilin_v_perm32: db 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71
+ db 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79
+ db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
+ db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95
+bilin_v_perm64: dd 0, 0, 4, 8, 1, 1, 5, 9, 2, 2, 6, 10, 3, 3, 7, 11
+spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
-spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
- db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
- db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
- db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
-spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
- db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
- db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
- db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
-spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+spel_h_perm32: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
-spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
- db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
- db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
- db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
-spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
- db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
- db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
- db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
-spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+spel_v_perm8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 8, 16, 9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+ db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39
+spel_v_perm16a: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+spel_v_perm16b: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+ db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
+ db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
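
The reordered bilin_h_perm16/bilin_h_perm32 tables above now index each source byte before its right-hand neighbour, i.e. pairs (i, i+1) in ascending order rather than the old (i+1, i), matching the swapped multiplier bytes in the new bilinear coefficient packing further down. A quick way to sanity-check such permute tables is to regenerate them; a minimal C sketch (helper names are mine, not part of dav1d):

#include <stdio.h>

/* Regenerates the 64-byte bilin_h_perm16/bilin_h_perm32 index tables
 * shown above: each byte pair selects source bytes (i, i+1). For perm16
 * the two 16-pixel rows start at offsets 0 and 32 of the register; for
 * perm32 the 32 pixels are consecutive from offset 0. */
static void bilin_h_perm(int row_len, unsigned char out[64])
{
    int n = 0;
    for (int base = 0; n < 64; base += row_len == 16 ? 32 : 64)
        for (int i = 0; i < row_len; i++) {
            out[n++] = (unsigned char)(base + i);
            out[n++] = (unsigned char)(base + i + 1);
        }
}

int main(void)
{
    unsigned char t[64];
    bilin_h_perm(16, t); /* prints the bilin_h_perm16 bytes */
    for (int n = 0; n < 64; n++)
        printf("%3d%c", t[n], (n & 15) == 15 ? '\n' : ' ');
    return 0;
}
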
@@ -154,34 +146,20 @@ spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
-spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55
- db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63
- db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71
- db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79
-spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
+spel_hv_perm8b: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45
db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
-spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
- db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36
db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38
-spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
+ db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
- db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52
- db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54
-spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40
- db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42
- db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48
- db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50
-spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
+spel_hv_perm16b:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10
db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
-spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12
- db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14
- db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20
- db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22
+spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
+ db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
@@ -189,15 +167,14 @@ subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
-bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_v_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
resize_permC: dd 0, 4, 8, 12
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
wm_420_perm64: dq 0xfedcba9876543210
@@ -205,6 +182,8 @@ wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
pb_8x0_8x8: times 8 db 0
times 8 db 8
+pb_4: times 4 db 4
+pb_32: times 4 db 32
pb_127: times 4 db 127
pw_m128: times 2 dw -128
pw_m256: times 2 dw -256
@@ -216,7 +195,6 @@ pd_32: dd 32
pd_34: dd 34
pd_63: dd 63
pd_512: dd 512
-pd_32768: dd 32768
%define pb_m64 (wm_sign+4)
%define pb_64 (wm_sign+8)
@@ -289,8 +267,10 @@ BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
@@ -401,9 +381,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
- imul mxyd, 0xff01
- vbroadcasti128 m4, [bilin_h_shuf8]
- add mxyd, 16 << 8
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_perm16]
+ add mxyd, 16
vpbroadcastw m5, mxyd
mov mxyd, r7m ; my
test mxyd, mxyd
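
The comment above gives the identity being computed: ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4. Since mxy*255 + 16 = mxy*256 + (16 - mxy), the new packing puts (16 - mx) in the low byte and mx in the high byte of the broadcast word, which pmaddubsw pairs with the ascending (src[x], src[x+1]) byte order of the new permute tables; the old mxy*0xff01 + (16 << 8) form produced the same two bytes in the opposite order for the descending shuffle. The (+8) >> 4 rounding is what pmulhrsw against pw_2048 performs, since pmulhrsw(a, b) = (a*b + 0x4000) >> 15. A scalar model of one output pixel (my paraphrase of the math, not dav1d code):

#include <stdint.h>

/* Model of the bilinear horizontal path: pack (16-mx, mx) into one word
 * as mx*255 + 16, emulate pmaddubsw, then pmulhrsw by pw_2048.
 * mx is the 4-bit subpel fraction, 1..15 (the .h path is only taken
 * when it is nonzero). */
static uint8_t bilin_h(const uint8_t *src, int x, int mx)
{
    uint16_t coef = (uint16_t)(mx * 255 + 16);
    int lo = (int8_t)(coef & 0xff);        /* = 16 - mx, multiplies src[x]   */
    int hi = (int8_t)(coef >> 8);          /* = mx,      multiplies src[x+1] */
    int sum = src[x] * lo + src[x + 1] * hi;        /* pmaddubsw */
    return (uint8_t)((sum * 2048 + 0x4000) >> 15);  /* = (sum + 8) >> 4 */
}

For mx = 8 this reduces to the expected (src[x] + src[x+1] + 1) >> 1 average. The prep variant further down applies the same repacking but, per its own comment, omits the +8 rounding and keeps the 16-scaled intermediate.
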
@@ -526,9 +506,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
RET
.v:
movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
- imul mxyd, 0xff01
+ imul mxyd, 255
vpbroadcastd m5, [pw_2048]
- add mxyd, 16 << 8
+ add mxyd, 16
add wq, r7
vpbroadcastw m4, mxyd
jmp wq
@@ -539,7 +519,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
lea srcq, [srcq+ssq*2]
pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
pshuflw xmm1, xmm1, q2301 ; 1 0
- punpcklbw xmm1, xmm0, xmm1
+ punpcklbw xmm1, xmm0
pmaddubsw xmm1, xm4
pmulhrsw xmm1, xm5
packuswb xmm1, xmm1
@@ -552,11 +532,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w4:
movd xmm0, [srcq+ssq*0]
.v_w4_loop:
- vpbroadcastd xmm1, [srcq+ssq*1]
+ vpbroadcastd xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1
+ vpblendd xmm1, xmm2, xmm0, 0x01 ; 0 1
vpbroadcastd xmm0, [srcq+ssq*0]
- vpblendd xmm1, xmm0, 0x02 ; 1 2
+ vpblendd xmm2, xmm0, 0x02 ; 1 2
punpcklbw xmm1, xmm2
pmaddubsw xmm1, xm4
pmulhrsw xmm1, xm5
@@ -570,11 +550,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w8:
movq xmm0, [srcq+ssq*0]
.v_w8_loop:
- movq xmm3, [srcq+ssq*1]
+ movq xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw xmm1, xmm3, xmm0
+ punpcklbw xmm1, xmm0, xmm2
movq xmm0, [srcq+ssq*0]
- punpcklbw xmm2, xmm0, xmm3
+ punpcklbw xmm2, xmm0
pmaddubsw xmm1, xm4
pmaddubsw xmm2, xm4
pmulhrsw xmm1, xm5
@@ -589,11 +569,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w16:
movu xmm0, [srcq+ssq*0]
.v_w16_loop:
- vbroadcasti128 ymm2, [srcq+ssq*1]
+ vbroadcasti128 ymm3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1
+ vpblendd ymm2, ymm3, ymm0, 0x0f ; 0 1
vbroadcasti128 ymm0, [srcq+ssq*0]
- vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2
+ vpblendd ymm3, ymm0, 0xf0 ; 1 2
punpcklbw ymm1, ymm2, ymm3
punpckhbw ymm2, ymm3
pmaddubsw ymm1, ym4
@@ -612,11 +592,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
movu ym0, [srcq+ssq*0]
kxnorb k1, k1, k1
.v_w32_loop:
- vbroadcasti32x8 m2, [srcq+ssq*1]
+ vbroadcasti32x8 m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendmd m3{k1}, m2, m0 ; 0 1
+ vpblendmd m2{k1}, m3, m0 ; 0 1
vbroadcasti32x8 m0, [srcq+ssq*0]
- vpblendmd m2{k1}, m0, m2 ; 1 2
+ vpblendmd m3{k1}, m0, m3 ; 1 2
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m1, m4
@@ -635,18 +615,18 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w64_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- punpckhbw m6, m3, m0
+ punpcklbw m1, m0, m3
+ punpckhbw m6, m0, m3
movu m0, [srcq+ssq*0]
pmaddubsw m1, m4
pmaddubsw m6, m4
- punpcklbw m2, m0, m3
- punpckhbw m7, m0, m3
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
pmaddubsw m2, m4
- pmaddubsw m7, m4
- REPX {pmulhrsw x, m5}, m1, m6, m2, m7
+ pmaddubsw m3, m4
+ REPX {pmulhrsw x, m5}, m1, m6, m2, m3
packuswb m1, m6
- packuswb m2, m7
+ packuswb m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
@@ -660,13 +640,13 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
add srcq, ssq
movu m2, [srcq+64*0]
movu m3, [srcq+64*1]
- punpcklbw m6, m2, m0
+ punpcklbw m6, m0, m2
pmaddubsw m6, m4
- punpckhbw m0, m2, m0
+ punpckhbw m0, m2
pmaddubsw m0, m4
- punpcklbw m7, m3, m1
+ punpcklbw m7, m1, m3
pmaddubsw m7, m4
- punpckhbw m1, m3, m1
+ punpckhbw m1, m3
pmaddubsw m1, m4
REPX {pmulhrsw x, m5}, m6, m0, m7, m1
packuswb m6, m0
@@ -1005,8 +985,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
+ imul mxyd, 255
+ add mxyd, 16
vpbroadcastw m5, mxyd
mov mxyd, r6m ; my
test mxyd, mxyd
@@ -1032,7 +1012,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
jg .h_w4_loop
RET
.h_w8:
- vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m4, [bilin_h_perm16]
.h_w8_loop:
movu xmm0, [srcq+strideq*0]
vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
@@ -1127,8 +1107,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.v:
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
+ imul mxyd, 255
+ add mxyd, 16
add wq, t2
lea stride3q, [strideq*3]
vpbroadcastw m6, mxyd
@@ -1218,11 +1198,11 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.v_w64_loop:
vpermq m1, m5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- punpcklbw m4, m1, m0
- punpckhbw m2, m1, m0
+ punpcklbw m4, m0, m1
+ punpckhbw m2, m0, m1
vpermq m0, m5, [srcq+strideq*0]
- punpcklbw m3, m0, m1
- punpckhbw m1, m0, m1
+ punpcklbw m3, m1, m0
+ punpckhbw m1, m0
pmaddubsw m4, m6
pmaddubsw m2, m6
pmaddubsw m3, m6
@@ -1243,28 +1223,28 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
vpermq m2, m5, [srcq+strideq*1+ 0]
vpermq m3, m5, [srcq+strideq*1+64]
lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
- punpckhbw m0, m2, m0
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
pmaddubsw m4, m6
pmaddubsw m0, m6
mova [tmpq+64*0], m4
mova [tmpq+64*1], m0
- punpcklbw m4, m3, m1
- punpckhbw m1, m3, m1
+ punpcklbw m4, m1, m3
+ punpckhbw m1, m3
pmaddubsw m4, m6
pmaddubsw m1, m6
mova [tmpq+64*2], m4
mova [tmpq+64*3], m1
vpermq m0, m5, [srcq+strideq*0+ 0]
vpermq m1, m5, [srcq+strideq*0+64]
- punpcklbw m4, m0, m2
- punpckhbw m2, m0, m2
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
pmaddubsw m4, m6
pmaddubsw m2, m6
mova [tmpq+64*4], m4
mova [tmpq+64*5], m2
- punpcklbw m4, m1, m3
- punpckhbw m3, m1, m3
+ punpcklbw m4, m3, m1
+ punpckhbw m3, m1
pmaddubsw m4, m6
pmaddubsw m3, m6
mova [tmpq+64*6], m4
@@ -1308,7 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
jg .hv_w4_loop
RET
.hv_w8:
- vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m4, [bilin_h_perm16]
vbroadcasti32x4 m0, [srcq+strideq*0]
pshufb m0, m4
pmaddubsw m0, m5
@@ -1448,7 +1428,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
-%macro FN 4 ; fn, type, type_h, type_v
+%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
mov t0d, FILTER_%3
%ifidn %3, %4
@@ -1456,8 +1436,8 @@ cglobal %1_%2_8bpc
%else
mov t1d, FILTER_%4
%endif
-%ifnidn %2, regular ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%if %0 == 5 ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
@@ -1489,24 +1469,22 @@ DECLARE_REG_TMP 4, 5
DECLARE_REG_TMP 7, 8
%endif
+; Due to the use of vpdpbusd (which does 4 pixels per instruction) in
+; the horizontal filter, 6-tap is only used for the vertical filter.
%define PUT_8TAP_FN FN put_8tap,
-
-PUT_8TAP_FN sharp, SHARP, SHARP
-PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_6tap_8bpc
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_6tap_8bpc
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
-cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
%define base r8-put_avx512icl
imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
+ add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx512icl]
movsxd wq, wm
movifnidn hd, hm
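
The new comment above is the key design note for this patch: vpdpbusd accumulates a dword dot product of four unsigned source bytes with four signed filter bytes, so the horizontal filters always consume taps in groups of four and an 8-tap kernel costs exactly as much as a 6-tap one there. Only the vertical filters, built from pmaddubsw on two-tap pairs, get cheaper with 6 taps, which is why the 6-tap entry points share the 8-tap horizontal code via the .h2/.put labels. A scalar model of one vpdpbusd lane (illustrative only):

#include <stdint.h>

/* One dword lane of vpdpbusd: acc += dot(u8 src[0..3], s8 coef[0..3]).
 * Two such dot products per pixel cover all 8 horizontal taps, so
 * zeroing the two outer taps of a 6-tap kernel saves nothing here. */
static int32_t vpdpbusd_lane(int32_t acc, const uint8_t src[4],
                             const int8_t coef[4])
{
    for (int i = 0; i < 4; i++)
        acc += src[i] * coef[i];
    return acc;
}
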
@@ -1514,6 +1492,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jnz .h
test myd, 0xf00
jnz .v
+.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
@@ -1523,158 +1502,577 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
pop r8
%endif
jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
- WIN64_SPILL_XMM 11
- cmp wd, 4
- jl .h_w2
- vbroadcasti128 m6, [subpel_h_shufA]
- je .h_w4
- tzcnt wd, wd
- vbroadcasti128 m7, [subpel_h_shufB]
- vbroadcasti128 m8, [subpel_h_shufC]
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
- vpbroadcastd m9, [base+mxq*8+subpel_filters+0]
- vpbroadcastd m10, [base+mxq*8+subpel_filters+4]
- add wq, r8
- jmp wq
-.h_w2:
- movzx mxd, mxb
- dec srcq
- mova xmm4, [subpel_h_shuf4]
- vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
-.h_w2_loop:
- movq xmm0, [srcq+ssq*0]
- movhps xmm0, [srcq+ssq*1]
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
+ vpbroadcastd m6, [pw_512]
+ lea myq, [base+subpel_filters+1+myq*8]
+ vpbroadcastw m7, [myq+0]
+ add r6, r8
+ vpbroadcastw m8, [myq+2]
+ mov nsq, ssq
+ vpbroadcastw m9, [myq+4]
+ neg nsq
+ jmp r6
+.v_w2:
+ movd xmm2, [srcq+nsq*2]
+ pinsrw xmm2, [srcq+nsq*1], 2
+ pinsrw xmm2, [srcq+ssq*0], 4
+ pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
- pshufb xmm0, xmm4
- mova xmm1, xm5
- vpdpbusd xmm1, xmm0, xmm3
- packssdw xmm0, xmm1, xmm1
- psraw xmm0, 6
- packuswb xmm0, xm0
- pextrw [dstq+dsq*0], xmm0, 0
- pextrw [dstq+dsq*1], xmm0, 1
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm1, xmm2, xmm3 ; 01 12
+ punpckhbw xmm2, xmm3 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm3, xmm1, xm7 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm8 ; a1 b1
+ paddw xmm3, xmm2
+ vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 5 6
+ punpcklbw xmm2, xmm4 ; 45 56
+ pmaddubsw xmm4, xmm2, xm9 ; a2 b2
+ paddw xmm3, xmm4
+ pmulhrsw xmm3, xm6
+ packuswb xmm3, xmm3
+ pextrw [dstq+dsq*0], xmm3, 0
+ pextrw [dstq+dsq*1], xmm3, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w2_loop
+ jg .v_w2_loop
RET
-.h_w4:
- movzx mxd, mxb
- dec srcq
- vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
-.h_w4_loop:
- movq xmm0, [srcq+ssq*0]
- movq xmm1, [srcq+ssq*1]
+.v_w4:
+ movd xmm2, [srcq+nsq*2]
+ pinsrd xmm2, [srcq+nsq*1], 1
+ pinsrd xmm2, [srcq+ssq*0], 2
+ pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
- pshufb xmm0, xm6
- pshufb xmm1, xm6
- mova xmm2, xm5
- vpdpbusd xmm2, xmm0, xmm3
- mova xmm0, xm5
- vpdpbusd xmm0, xmm1, xmm3
- packssdw xmm0, xmm2, xmm0
- psraw xmm0, 6
- packuswb xmm0, xmm0
- movd [dstq+dsq*0], xmm0
- pextrd [dstq+dsq*1], xmm0, 1
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm1, xmm2, xmm3 ; 01 12
+ punpckhbw xmm2, xmm3 ; 23 34
+.v_w4_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm3, xmm1, xm7 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm8 ; a1 b1
+ paddw xmm3, xmm2
+ vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 5 6
+ punpcklbw xmm2, xmm4 ; 45 56
+ pmaddubsw xmm4, xmm2, xm9 ; a2 b2
+ paddw xmm3, xmm4
+ pmulhrsw xmm3, xm6
+ packuswb xmm3, xmm3
+ movd [dstq+dsq*0], xmm3
+ pextrd [dstq+dsq*1], xmm3, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w4_loop
+ jg .v_w4_loop
RET
-.h_w8:
- movu xm0, [srcq+ssq*0]
- vinserti32x4 ym0, [srcq+ssq*1], 1
+.v_w8:
+ movq xmm1, [srcq+nsq*2]
+ vpbroadcastq ymm3, [srcq+nsq*1]
+ vpbroadcastq ymm2, [srcq+ssq*0]
+ vpbroadcastq ymm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
- vpmovuswb xm0, ym0
- movq [dstq+dsq*0], xm0
- movhps [dstq+dsq*1], xm0
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm1, ymm3, 0x30
+ vpblendd ymm3, ymm2, 0x30
+ punpcklbw ymm1, ymm3 ; 01 12
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm2, ymm4 ; 23 34
+.v_w8_loop:
+ vpbroadcastq ymm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw ymm3, ymm1, ym7 ; a0 b0
+ mova ymm1, ymm2
+ pmaddubsw ymm2, ym8 ; a1 b1
+ paddw ymm3, ymm2
+ vpblendd ymm2, ymm0, ymm4, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm2, ymm4 ; 45 56
+ pmaddubsw ymm4, ymm2, ym9 ; a2 b2
+ paddw ymm3, ymm4
+ pmulhrsw ymm3, ym6
+ vextracti128 xmm4, ymm3, 1
+ packuswb xmm3, xmm4
+ movq [dstq+dsq*0], xmm3
+ movhps [dstq+dsq*1], xmm3
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w8
+ jg .v_w8_loop
+ vzeroupper
RET
-.h_w16:
- mova m6, [spel_h_perm16a]
- mova m7, [spel_h_perm16b]
- mova m8, [spel_h_perm16c]
-.h_w16_loop:
- movu ym0, [srcq+ssq*0]
+.v_w16:
+ mova m5, [spel_v_perm16a]
+ vbroadcasti32x4 m1, [srcq+nsq*2]
+ vbroadcasti32x4 ym3, [srcq+nsq*1]
+ mov r6d, 0x0f
+ vbroadcasti32x4 m2, [srcq+ssq*0]
+ kmovb k1, r6d
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m1{k1}, m3, m2, 0xcc
+ vshufpd m2{k1}, m4, m0, 0xcc
+ vpermb m1, m5, m1 ; 01 12
+ vpermb m2, m5, m2 ; 23 34
+.v_w16_loop:
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m3, m1, m7 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m8 ; a1 b1
+ paddw m3, m2
+ mova m2, m0
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m2{k1}, m4, m0, 0xcc
+ vpermb m2, m5, m2 ; 45 56
+ pmaddubsw m4, m2, m9 ; a2 b2
+ paddw m3, m4
+ pmulhrsw m3, m6
+ vextracti32x8 ym4, m3, 1
+ packuswb ym3, ym4
+ mova [dstq+dsq*0], xm3
+ vextracti32x4 [dstq+dsq*1], ym3, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m10, [spel_v_perm32]
+ pmovzxbq m5, [pb_02461357]
+ vpshrdw m11, m10, m10, 8
+ movu ym0, [srcq+nsq*2]
+ vinserti32x8 m0, [srcq+nsq*1], 1
+ vpermb m1, m10, m0 ; 01
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m2, m11, m0 ; 12
vinserti32x8 m0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 0, 1, 2, 3, 1
- vpmovuswb ym0, m0
- mova [dstq+dsq*0], xm0
- vextracti128 [dstq+dsq*1], ym0, 1
+ vpermb m3, m10, m0 ; 23
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m4, m11, m0 ; 34
+.v_w32_loop:
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m12, m1, m7
+ mova m1, m3
+ pmaddubsw m13, m2, m7
+ mova m2, m4
+ pmaddubsw m14, m3, m8
+ vpermb m3, m10, m0 ; 45
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ pmaddubsw m15, m4, m8
+ vpermb m4, m11, m0 ; 56
+ paddw m12, m14
+ pmaddubsw m14, m3, m9
+ paddw m13, m15
+ pmaddubsw m15, m4, m9
+ paddw m12, m14
+ paddw m13, m15
+ pmulhrsw m12, m6
+ pmulhrsw m13, m6
+ packuswb m12, m13
+ vpermq m12, m5, m12
+ mova [dstq+dsq*0], ym12
+ vextracti32x8 [dstq+dsq*1], m12, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w16_loop
+ jg .v_w32_loop
RET
-.h_w32:
- movu ym0, [srcq+ssq*0+8*0]
- vinserti32x8 m0, [srcq+ssq*1+8*0], 1
- movu ym1, [srcq+ssq*0+8*1]
- vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+.v_w64:
+.v_w128:
+ lea r6d, [hq+wq*4-256]
+.v_loop0:
+ movu m2, [srcq+nsq*2]
+ movu m4, [srcq+nsq*1]
+ lea r4, [srcq+ssq*2]
+ movu m11, [srcq+ssq*0]
+ movu m13, [srcq+ssq*1]
+ mov r7, dstq
+ movu m0, [r4 +ssq*0]
+ punpcklbw m1, m2, m4 ; 01l
+ punpckhbw m2, m4 ; 01h
+ punpcklbw m3, m4, m11 ; 12l
+ punpckhbw m4, m11 ; 12h
+ punpcklbw m10, m11, m13 ; 23l
+ punpckhbw m11, m13 ; 23h
+ punpcklbw m12, m13, m0 ; 34l
+ punpckhbw m13, m0 ; 34h
+.v_loop:
+ movu m5, [r4+ssq*1]
+ pmaddubsw m14, m1, m7 ; a0l
+ mova m1, m10
+ pmaddubsw m10, m8 ; a1l
+ lea r4, [r4+ssq*2]
+ pmaddubsw m15, m2, m7 ; a0h
+ mova m2, m11
+ pmaddubsw m11, m8 ; a1h
+ paddw m14, m10
+ punpcklbw m10, m0, m5 ; 45l
+ paddw m15, m11
+ punpckhbw m11, m0, m5 ; 45h
+ pmaddubsw m0, m10, m9 ; a2l
+ paddw m14, m0
+ pmaddubsw m0, m11, m9 ; a2h
+ paddw m15, m0
+ movu m0, [r4+ssq*0]
+ pmulhrsw m14, m6
+ pmulhrsw m15, m6
+ packuswb m14, m15
+ pmaddubsw m15, m3, m7 ; b0l
+ mova m3, m12
+ pmaddubsw m12, m8 ; b1l
+ mova [r7+dsq*0], m14
+ pmaddubsw m14, m4, m7 ; b0h
+ mova m4, m13
+ pmaddubsw m13, m8 ; b1h
+ paddw m15, m12
+ punpcklbw m12, m5, m0 ; 56l
+ paddw m14, m13
+ punpckhbw m13, m5, m0 ; 56h
+ pmaddubsw m5, m12, m9 ; b2l
+ paddw m15, m5
+ pmaddubsw m5, m13, m9 ; b2h
+ paddw m14, m5
+ pmulhrsw m15, m6
+ pmulhrsw m14, m6
+ packuswb m15, m14
+ mova [r7+dsq*1], m15
+ lea r7, [r7+dsq*2]
+ sub hd, 2
+ jg .v_loop
+ add srcq, 64
+ add dstq, 64
+ movzx hd, r6b
+ sub r6d, 256
+ jg .v_loop0
+ RET
+.h:
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2
+.hv:
+ vpbroadcastd m9, [pd_34]
+ mova xm10, [spel_hv_end]
+ pxor xm0, xm0
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq ym1, [base+subpel_filters+1+myq*8]
+ mov nsq, ssq
+ punpcklbw ym0, ym1
+ neg nsq
+ psraw ym0, 2 ; << 6
+ pshufd ym11, ym0, q0000
+ pshufd ym12, ym0, q1111
+ pshufd ym13, ym0, q2222
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 ym5, [subpel_h_shuf4]
+ movq xmm0, [srcq+nsq*2]
+ movhps xmm0, [srcq+nsq*1]
+ movq xmm2, [srcq+ssq*0]
+ movhps xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 0, 2, 3, 4
- PUT_8TAP_H 1, 4, 3, 2
- packuswb m0, m1
- mova [dstq+dsq*0], ym0
- vextracti32x8 [dstq+dsq*1], m0, 1
+ vpbroadcastq ymm1, [srcq+ssq*0]
+ vpblendd ymm0, ymm1, 0x30
+ pshufb xmm2, xm5 ; 2 3
+ pshufb ymm0, ym5 ; 0 1 4
+ mova xmm1, xm9
+ vpdpbusd xmm1, xmm2, xm7
+ mova ymm2, ym9
+ vpdpbusd ymm2, ymm0, ym7
+ packssdw ymm2, ymm1
+ psraw ymm2, 2
+ vextracti128 xmm0, ymm2, 1
+ vzeroupper
+ palignr xmm0, xmm2, 4
+ punpcklwd xmm1, xmm2, xmm0 ; 01 12
+ punpckhwd xmm2, xmm0 ; 23 34
+.hv_w2_loop:
+ movq xmm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm3, [srcq+ssq*0]
+ pmaddwd xmm4, xmm1, xm11 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm4, xmm2, xm12 ; a1 b1
+ pshufb xmm3, xm5
+ mova xmm2, xm9
+ vpdpbusd xmm2, xmm3, xm7
+ packssdw xmm3, xmm2, xmm2
+ psraw xmm3, 2
+ palignr xmm2, xmm3, xmm0, 12
+ mova xmm0, xmm3
+ punpcklwd xmm2, xmm3 ; 45 56
+ vpdpwssd xmm4, xmm2, xm13 ; a2 b2
+ packuswb xmm4, xmm4
+ pshufb xmm4, xm10
+ pextrw [dstq+dsq*0], xmm4, 0
+ pextrw [dstq+dsq*1], xmm4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
- jg .h_w32
+ jg .hv_w2_loop
RET
-.h_w64:
- movu m0, [srcq+8*0]
- movu m1, [srcq+8*1]
- add srcq, ssq
- PUT_8TAP_H 0, 2, 3, 4
- PUT_8TAP_H 1, 4, 3, 2
- packuswb m0, m1
- mova [dstq], m0
- add dstq, dsq
- dec hd
- jg .h_w64
+.hv_w4:
+ movq xm2, [srcq+nsq*2]
+ vpbroadcastq ym1, [srcq+nsq*1]
+ vinserti32x4 ym2, [srcq+ssq*0], 1
+ vinserti32x4 m1, [srcq+ssq*1], 2 ; _ 1 3
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m5, [subpel_h_shufA]
+ vinserti32x4 m2, [srcq+ssq*0], 2 ; 0 2 4
+ pshufb m1, m5
+ mova m0, m9
+ pshufb m2, m5
+ mova m3, m9
+ vpdpbusd m0, m1, m7
+ mova ym1, [spel_hv_perm4a]
+ vpdpbusd m3, m2, m7
+ mova ym2, [spel_hv_perm4b]
+ mov r6d, 0x5555
+ mova ym6, [spel_hv_perm4d]
+ packssdw m0, m3
+ kmovw k1, r6d
+ psraw m0, 2 ; _ 0 1 2 3 4 5 6
+ vpermb ym1, ym1, ym0 ; 01 12
+ vpermb m2, m2, m0 ; 23 34
+.hv_w4_loop:
+ movq xm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym3, [srcq+ssq*0], 1
+ pmaddwd ym4, ym1, ym11 ; a0 b0
+ mova ym1, ym2
+ pshufb ym3, ym5
+ mova ym0, ym9
+ vpdpbusd ym0, ym3, ym7
+ vpdpwssd ym4, ym2, ym12 ; a1 b1
+ vpsraw ym2{k1}, ym0, 2 ; 5 6
+ vpermb ym2, ym6, ym2 ; 45 56
+ vpdpwssd ym4, ym2, ym13 ; a2 b2
+ packuswb ym4, ym4
+ vpermb ym4, ym10, ym4
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
RET
-.h_w128:
- movu m0, [srcq+8*0]
- movu m2, [srcq+8*1]
- movu m1, [srcq+8*8]
- movu m3, [srcq+8*9]
- add srcq, ssq
- PUT_8TAP_H 0, 4, 11, 12
- PUT_8TAP_H 2, 12, 11, 4
- PUT_8TAP_H 1, 4, 11, 12
- PUT_8TAP_H 3, 12, 11, 4
- packuswb m0, m2
- packuswb m1, m3
- mova [dstq+64*0], m0
- mova [dstq+64*1], m1
- add dstq, dsq
- dec hd
- jg .h_w128
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m12, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+1+myq*8]
+ mov nsq, ssq
+ punpcklbw m0, m1
+ neg nsq
+ psraw m0, 2 ; << 6
+ pshufd m13, m0, q0000
+ pshufd m14, m0, q1111
+ pshufd m15, m0, q2222
+ cmp wd, 8
+ jne .hv_w16
+ movu xm0, [srcq+nsq*2]
+ vinserti32x4 ym0, [srcq+nsq*1], 1
+ vbroadcasti32x4 m1, [subpel_h_shufA]
+ vinserti32x4 m0, [srcq+ssq*0], 2
+ vbroadcasti32x4 m4, [subpel_h_shufB]
+ vinserti32x4 m0, [srcq+ssq*1], 3
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m7, [subpel_h_shufC]
+ vbroadcasti32x4 ym5, [srcq+ssq*0]
+ vbroadcasti32x8 m6, [subpel_h_shufA]
+ pshufb m1, m0, m1 ; 0 1 2 3 0123
+ mova m2, m9
+ vpdpbusd m2, m1, m11
+ pshufb m4, m0, m4 ; 0 1 2 3 4567
+ mova m1, m9
+ vpdpbusd m1, m4, m11
+ pshufb m0, m7 ; 0 1 2 3 89ab
+ pshufb ym7, ym5, ym6 ; 4 0123 4567
+ mova ym3, ym9
+ vpdpbusd ym3, ym7, ym11
+ vbroadcasti32x8 m7, [subpel_h_shufB]
+ vpdpbusd m2, m4, m12
+ mova m4, [spel_hv_perm8a]
+ pshufb ym5, ym7 ; 4 4567 89ab
+ vpdpbusd m1, m0, m12
+ vpaddd m0, m4, [pb_32] {1to16}
+ vpdpbusd ym3, ym5, ym12
+ mova m5, [spel_hv_perm8b]
+ mov r6, 0x55555555ff00
+ packssdw m2, m1
+ vpmovsdw xm3, ym3
+ kmovq k1, r6
+ psraw m2, 2 ; 0 1 2 3
+ psraw xm3, 2 ; 4
+ vpermb m1, m4, m2 ; 01 12
+ kshiftrq k2, k1, 16
+ vpermt2b m2, m0, m3 ; 23 34
+.hv_w8_loop:
+ vbroadcasti32x4 ym3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m3{k1}, [srcq+ssq*0]
+ pmaddwd m0, m1, m13 ; a0 b0
+ pshufb m1, m3, m6 ; 5 6 0123 4567
+ mova m4, m9
+ vpdpbusd m4, m1, m11
+ pshufb m3, m7 ; 5 6 4567 89ab
+ vpdpwssd m0, m2, m14 ; a1 b1
+ mova m1, m2
+ vpdpbusd m4, m3, m12
+ psraw m2{k2}, m4, 2 ; 53 64
+ vpermb m2, m5, m2 ; 45 56
+ vpdpwssd m0, m2, m15 ; a2 b2
+ packuswb m0, m0
+ vpermb m0, m10, m0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu m19, [spel_hv_perm16a]
+ vpbroadcastd m7, [pb_4]
+ lea r6d, [wq*2-32]
+ mova m6, [spel_hv_perm16b]
+ paddb m20, m7, m19
+ lea r6d, [hq+r6*8]
+ paddb m21, m7, m20
+ mova ym10, [spel_hv_end16]
+ paddb m7, m6
+.hv_w16_loop0:
+ movu ym16, [srcq+nsq*2]
+ vinserti32x8 m16, [srcq+nsq*1], 1
+ lea r4, [srcq+ssq*2]
+ movu ym17, [srcq+ssq*0]
+ vinserti32x8 m17, [srcq+ssq*1], 1
+ mov r7, dstq
+ movu ym18, [r4 +ssq*0]
+ vpermb m2, m19, m16 ; 0 1 0123 89ab
+ mova m1, m9
+ vpermb m3, m21, m16 ; 0 1 89ab ghij
+ vpdpbusd m1, m2, m11
+ mova m2, m9
+ vpermb m4, m19, m17 ; 2 3 0123 89ab
+ vpdpbusd m2, m3, m12
+ mova m3, m9
+ vpermb m5, m21, m17 ; 2 3 89ab ghij
+ vpdpbusd m3, m4, m11
+ mova m4, m9
+ vpermb m0, m6, m18 ; 4 0145 2367 89cd abef
+ vpdpbusd m4, m5, m12
+ mova m5, m9
+ vpermb m16, m20, m16 ; 0 1 4567 cdef
+ vpdpbusd m5, m0, m11
+ vpermb m17, m20, m17 ; 2 3 4567 cdef
+ vpdpbusd m1, m16, m12
+ vpermb m18, m7, m18 ; 4 4589 67ab cdgh efij
+ vpdpbusd m2, m16, m11
+ vpdpbusd m3, m17, m12
+ vpdpbusd m4, m17, m11
+ vpdpbusd m5, m18, m12
+ packssdw m1, m2 ; 01
+ packssdw m3, m4 ; 23
+ REPX {psraw x, 2}, m1, m3, m5
+ vpshrdd m2, m1, m3, 16 ; 12
+ vpshrdd m4, m3, m5, 16 ; 34
+.hv_w16_loop:
+ movu ym18, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ vinserti32x8 m18, [r4+ssq*0], 1
+ pmaddwd m16, m1, m13 ; a0
+ vpermb m1, m19, m18 ; 5 6 0123 89ab
+ pmaddwd m17, m2, m13 ; b0
+ vpermb m2, m20, m18 ; 5 6 4567 cdef
+ mova m0, m9
+ vpdpbusd m0, m1, m11
+ vpermb m18, m21, m18
+ mova m1, m9
+ vpdpbusd m1, m2, m11
+ vpdpwssd m16, m3, m14 ; a1
+ vpdpwssd m17, m4, m14 ; b1
+ vpdpbusd m0, m2, m12
+ mova m2, m4
+ vpdpbusd m1, m18, m12
+ packssdw m0, m1
+ mova m1, m3
+ psraw m4, m0, 2 ; 5 6
+ vpshrdd m3, m2, m4, 16 ; 4 5
+ vpdpwssd m17, m4, m15 ; b2
+ vpdpwssd m16, m3, m15 ; a2
+ packuswb m16, m17
+ vpermb m16, m10, m16
+ mova [r7+dsq*0], xm16
+ vextracti128 [r7+dsq*1], ym16, 1
+ lea r7, [r7+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add srcq, 16
+ add dstq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ vzeroupper
RET
+
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc
+PUT_8TAP_FN sharp, SHARP, SHARP
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
tzcnt r6d, wd
+ lea myq, [base+subpel_filters+myq*8]
movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
vpbroadcastd m7, [pw_512]
- lea myq, [base+subpel_filters+myq*8]
vpbroadcastw m8, [myq+0]
- vpbroadcastw m9, [myq+2]
- vpbroadcastw m10, [myq+4]
- vpbroadcastw m11, [myq+6]
add r6, r8
+ vpbroadcastw m9, [myq+2]
lea ss3q, [ssq*3]
+ vpbroadcastw m10, [myq+4]
sub srcq, ss3q
+ vpbroadcastw m11, [myq+6]
jmp r6
.v_w2:
movd xmm2, [srcq+ssq*0]
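
Both vertical paths above broadcast the filter as words holding two adjacent taps (three words for 6-tap, four for 8-tap; the 6-tap kernel is read at subpel_filters+1 to skip the zero outer taps) and sum pmaddubsw pair products. dav1d's 8bpc filter tables appear to store the taps at half scale (summing to 64), so pmulhrsw against pw_512, i.e. (a*512 + 0x4000) >> 15 = (a + 32) >> 6, rounds the accumulated sum back to pixel range. A scalar model of the 6-tap column filter, a sketch under those assumptions rather than dav1d code:

#include <stdint.h>
#include <stddef.h>

/* Model of put_6tap's vertical loop: 6 taps applied to rows
 * src[-2*stride]..src[3*stride], rounded with pmulhrsw(pw_512),
 * then clipped like packuswb. Assumes taps sum to 64 (half of the
 * spec's 128-sum kernels, which have all-even coefficients). */
static uint8_t put_6tap_v(const uint8_t *src, ptrdiff_t stride,
                          const int8_t f[6])
{
    int sum = 0;
    for (int i = 0; i < 6; i++)
        sum += src[(i - 2) * stride] * f[i];   /* pmaddubsw pair sums */
    sum = (sum + 32) >> 6;                     /* pmulhrsw by pw_512  */
    return sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
}
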
@@ -1802,7 +2200,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vzeroupper
RET
.v_w16:
- mova m12, [spel_v_perm16]
+ mova m12, [spel_v_perm16a]
vbroadcasti32x4 m1, [srcq+ssq*0]
vbroadcasti32x4 ym4, [srcq+ssq*1]
mov r6d, 0x0f
@@ -1990,7 +2388,146 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .v_loop0
vzeroupper
RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+.h2:
+ vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [base+mxq*8+subpel_filters+0]
+ vpbroadcastd m10, [base+mxq*8+subpel_filters+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xmm4, [subpel_h_shuf4]
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w2_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ mova xmm1, xm5
+ vpdpbusd xmm1, xmm0, xmm3
+ packssdw xmm0, xmm1, xmm1
+ psraw xmm0, 6
+ packuswb xmm0, xm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm6
+ pshufb xmm1, xm6
+ mova xmm2, xm5
+ vpdpbusd xmm2, xmm0, xmm3
+ mova xmm0, xm5
+ vpdpbusd xmm0, xmm1, xmm3
+ packssdw xmm0, xmm2, xmm0
+ psraw xmm0, 6
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m6, [spel_h_perm16]
+ vpbroadcastd m8, [pb_4]
+ paddb m7, m8, m6
+ paddb m8, m7
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3, 1
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ add srcq, ssq
+ PUT_8TAP_H 0, 4, 11, 12
+ PUT_8TAP_H 2, 12, 11, 4
+ PUT_8TAP_H 1, 4, 11, 12
+ PUT_8TAP_H 3, 12, 11, 4
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
.hv:
+ vpbroadcastd m9, [pd_34]
+ pxor xm0, xm0
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
@@ -2000,12 +2537,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- vpbroadcastd m8, [pd_2]
- vpbroadcastq ym0, [base+subpel_filters+myq*8]
+ vpbroadcastq ym1, [base+subpel_filters+myq*8]
lea ss3q, [ssq*3]
- vpbroadcastd ym9, [pd_32768]
mov r6, srcq
- punpcklbw ym0, ym8, ym0
+ punpcklbw ym0, ym1
sub r6, ss3q
psraw ym0, 2 ; << 6
mova xm14, [spel_hv_end]
@@ -2029,9 +2564,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5
pshufb ymm2, ym6
pshufb ymm0, ym6
- mova ymm1, ym8
+ mova ymm1, ym9
vpdpbusd ymm1, ymm2, ym7
- mova ymm2, ym8
+ mova ymm2, ym9
vpdpbusd ymm2, ymm0, ym7
packssdw ymm2, ymm1, ymm2
psraw ymm2, 2
@@ -2045,14 +2580,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
movq xmm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps xmm4, [srcq+ssq*0]
- mova xmm5, xm9
- vpdpwssd xmm5, xmm1, xm10 ; a0 b0
+ pmaddwd xmm5, xmm1, xm10 ; a0 b0
mova xmm1, xmm2
vpdpwssd xmm5, xmm2, xm11 ; a1 b1
pshufb xmm4, xm6
mova xmm2, xmm3
vpdpwssd xmm5, xmm3, xm12 ; a2 b2
- mova xmm3, xm8
+ mova xmm3, xm9
vpdpbusd xmm3, xmm4, xm7
packssdw xmm4, xmm3, xmm3
psraw xmm4, 2
@@ -2081,9 +2615,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6
pshufb m2, m6
pshufb m1, m6
- mova m0, m8
+ mova m0, m9
vpdpbusd m0, m2, m7
- mova m4, m8
+ mova m4, m9
vpdpbusd m4, m1, m7
mova ym1, [spel_hv_perm4a]
mova ym2, [spel_hv_perm4b]
@@ -2100,11 +2634,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
movq xmm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1
- mova ym5, ym9
- vpdpwssd ym5, ym1, ym10 ; a0 b0
+ pmaddwd ym5, ym1, ym10 ; a0 b0
mova ym1, ym2
pshufb ym4, ym6
- mova ym0, ym8
+ mova ym0, ym9
vpdpbusd ym0, ym4, ym7
vpdpwssd ym5, ym2, ym11 ; a1 b1
mova ym2, ym3
@@ -2129,10 +2662,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- vpbroadcastd m8, [pd_2]
- vpbroadcastq m0, [base+subpel_filters+myq*8]
- vpbroadcastd m9, [pd_32768]
- punpcklbw m0, m8, m0
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ punpcklbw m0, m1
lea ss3q, [ssq*3]
psraw m0, 2 ; << 6
pshufd m12, m0, q0000
@@ -2153,31 +2684,31 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vbroadcasti32x4 m4, [subpel_h_shufA]
vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
vbroadcasti32x4 m7, [subpel_h_shufB]
- vbroadcasti32x4 m17, [subpel_h_shufC]
+ vbroadcasti32x4 m8, [subpel_h_shufC]
pshufb m1, m6, m4 ; 0 1 2 3 0123
- mova m2, m8
+ mova m2, m9
vpdpbusd m2, m1, m10
pshufb m5, m6, m7 ; 0 1 2 3 4567
- mova m1, m8
+ mova m1, m9
vpdpbusd m1, m5, m10
pshufb m4, m0, m4 ; 4 5 6 _ 0123
- mova m3, m8
+ mova m3, m9
vpdpbusd m3, m4, m10
pshufb m7, m0, m7 ; 4 5 6 _ 4567
- mova m4, m8
+ mova m4, m9
vpdpbusd m4, m7, m10
- pshufb m6, m17
+ pshufb m6, m8
vpdpbusd m2, m5, m11
vpdpbusd m1, m6, m11
- pshufb m6, m0, m17
+ pshufb m6, m0, m8
vpdpbusd m3, m7, m11
vpdpbusd m4, m6, m11
mova m5, [spel_hv_perm8a]
- mova m0, [spel_hv_perm8b]
+ vpaddd m0, m5, [pb_32] {1to16}
mov r6, 0x55555555ff00
packssdw m2, m1
packssdw m3, m4
- mova m18, [spel_hv_perm8c]
+ mova m8, [spel_hv_perm8b]
psraw m2, 2 ; 0 1 2 3
psraw m3, 2 ; 4 5 6 _
vpermb m1, m5, m2 ; 01 12
@@ -2192,10 +2723,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vbroadcasti32x4 ym4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vbroadcasti32x4 m4{k1}, [srcq+ssq*0]
- mova m0, m9
- vpdpwssd m0, m1, m12 ; a0 b0
+ pmaddwd m0, m1, m12 ; a0 b0
pshufb m1, m4, m6 ; 7 8 0123 4567
- mova m5, m8
+ mova m5, m9
vpdpbusd m5, m1, m10
pshufb m4, m7 ; 7 8 4567 89ab
vpdpwssd m0, m2, m13 ; a1 b1
@@ -2204,7 +2734,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
mova m2, m3
vpdpwssd m0, m3, m14 ; a2 b2
psraw m3{k2}, m5, 2 ; 75 86
- vpermb m3, m18, m3 ; 67 78
+ vpermb m3, m8, m3 ; 67 78
vpdpwssd m0, m3, m15 ; a3 b3
packuswb m0, m0
vpermb zmm1, m16, m0
@@ -2216,111 +2746,652 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vzeroupper
RET
.hv_w16:
- movu m7, [spel_hv_perm16a]
+ WIN64_SPILL_XMM 23
+ movu m22, [spel_hv_perm16a]
sub srcq, ss3q
- mova m20, [spel_hv_perm16b]
+ vpbroadcastd m8, [pb_4]
lea r6d, [wq*2-32]
- mova m21, [spel_hv_perm16c]
- mov r4, srcq
- mov r7, dstq
+ mova m7, [spel_hv_perm16b]
+ paddb m20, m8, m22
mova ym16, [spel_hv_end16]
+ paddb m21, m8, m20
lea r6d, [hq+r6*8]
+ paddb m8, m7
.hv_w16_loop0:
movu ym17, [srcq+ssq*0]
vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1
+ lea r4, [srcq+ss3q]
movu ym18, [srcq+ssq*2]
- add srcq, ss3q
- vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3
- movu ym19, [srcq+ssq*1]
- vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5
- add srcq, ss3q
- vpermb m2, m7, m17 ; 0 1 0123 89ab
- vpermb m0, m20, m17 ; 0 1 4567 cdef
- vpermb m4, m7, m18 ; 2 3 0123 89ab
- mova m1, m8
+ vinserti32x8 m18, [r4 +ssq*0], 1 ; 2 3
+ mov r7, dstq
+ movu ym19, [r4 +ssq*1]
+ vinserti32x8 m19, [r4 +ssq*2], 1 ; 4 5
+ add r4, ss3q
+ vpermb m2, m22, m17 ; 0 1 0123 89ab
+ mova m1, m9
+ vpermb m3, m21, m17 ; 0 1 89ab ghij
vpdpbusd m1, m2, m10
- vpermb m5, m20, m18 ; 2 3 4567 cdef
- mova m2, m8
- vpdpbusd m2, m0, m10
- vpermb m17, m21, m17 ; 0 1 89ab ghij
- mova m3, m8
+ mova m2, m9
+ vpermb m4, m22, m18 ; 2 3 0123 89ab
+ vpdpbusd m2, m3, m11
+ mova m3, m9
+ vpermb m5, m21, m18 ; 2 3 89ab ghij
vpdpbusd m3, m4, m10
- vpermb m6, m7, m19 ; 4 5 0123 89ab
- mova m4, m8
- vpdpbusd m4, m5, m10
- vpermb m18, m21, m18 ; 2 3 89ab ghij
- vpdpbusd m1, m0, m11
- movu ym0, [srcq+ssq*0] ; 6
- vpdpbusd m2, m17, m11
- vpermb m17, m20, m19 ; 4 5 4567 cdef
- vpdpbusd m3, m5, m11
- mova m5, m8
+ mova m4, m9
+ vpermb m6, m22, m19 ; 4 5 0123 89ab
+ vpdpbusd m4, m5, m11
+ mova m5, m9
+ vpermb m17, m20, m17 ; 0 1 4567 cdef
vpdpbusd m5, m6, m10
- mova m6, m8
- vpdpbusd m6, m17, m10
- vpdpbusd m4, m18, m11
- mova m18, [spel_hv_perm16d]
- vpermb m18, m18, m0 ; 6 0145 2367 89cd abef
- vpdpbusd m5, m17, m11
- vpermb m19, m21, m19 ; 4 5 89ab ghij
- mova m17, m8
- vpdpbusd m17, m18, m10
- mova m18, [spel_hv_perm16e]
- vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij
- packssdw m1, m2 ; 01
- vpdpbusd m6, m19, m11
- packssdw m3, m4 ; 23
- vpdpbusd m17, m0, m11
- psraw m1, 2
- packssdw m5, m6 ; 45
- psraw m3, 2
+ mova m6, m9
+ vpermb m0, m21, m19 ; 4 5 89ab ghij
+ vpdpbusd m1, m17, m11
+ vpdpbusd m2, m17, m10
+ movu ym17, [r4+ssq*0] ; 6
+ vpermb m18, m20, m18 ; 2 3 4567 cdef
+ vpdpbusd m6, m0, m11
+ vpermb m0, m7, m17 ; 6 0145 2367 89cd abef
+ vpdpbusd m3, m18, m11
+ vpermb m19, m20, m19 ; 4 5 4567 cdef
+ vpdpbusd m4, m18, m10
+ mova m18, m9
+ vpermb m17, m8, m17 ; 6 4589 67ab cdgh efij
+ vpdpbusd m18, m0, m10
+ packssdw m1, m2
+ vpdpbusd m5, m19, m11
+ vpdpbusd m6, m19, m10
+ packssdw m3, m4
+ vpdpbusd m18, m17, m11
+ psraw m1, 2 ; 01
+ psraw m3, 2 ; 23
+ packssdw m5, m6
vpshrdd m2, m1, m3, 16 ; 12
- psraw m5, 2
+ psraw m5, 2 ; 45
vpshrdd m4, m3, m5, 16 ; 34
- psraw m17, 2
- vpshrdd m6, m5, m17, 16 ; 56
+ psraw m18, 2
+ vpshrdd m6, m5, m18, 16 ; 56
.hv_w16_loop:
- movu ym18, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vinserti32x8 m18, [srcq+ssq*0], 1
+ movu ym19, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ vinserti32x8 m19, [r4+ssq*0], 1
+ pmaddwd m17, m1, m12 ; a0
+ vpermb m1, m22, m19 ; 7 8 0123 89ab
+ pmaddwd m18, m2, m12 ; b0
mova m0, m9
- vpdpwssd m0, m1, m12 ; a0
- vpermb m1, m7, m18 ; 7 8 0123 89ab
- mova m17, m9
- vpdpwssd m17, m2, m12 ; b0
- vpermb m2, m20, m18 ; 7 8 4567 cdef
- mova m19, m8
- vpdpbusd m19, m1, m10
- vpermb m18, m21, m18
- mova m1, m8
- vpdpbusd m1, m2, m10
- vpdpwssd m0, m3, m13 ; a1
- vpdpwssd m17, m4, m13 ; b1
- vpdpbusd m19, m2, m11
+ vpermb m2, m21, m19 ; 7 8 89ab ghij
+ vpdpbusd m0, m1, m10
+ mova m1, m9
+ vpermb m19, m20, m19 ; 7 8 4567 cdef
+ vpdpbusd m1, m2, m11
mova m2, m4
- vpdpbusd m1, m18, m11
+ vpdpwssd m17, m3, m13 ; a1
+ vpdpwssd m18, m4, m13 ; b1
mova m4, m6
- vpdpwssd m0, m5, m14 ; a2
- vpdpwssd m17, m6, m14 ; b2
- packssdw m19, m1
+ vpdpbusd m0, m19, m11
+ vpdpbusd m1, m19, m10
+ vpdpwssd m17, m5, m14 ; a2
+ vpdpwssd m18, m6, m14 ; b2
+ packssdw m0, m1
mova m1, m3
+ psraw m6, m0, 2 ; 78
mova m3, m5
- psraw m6, m19, 2 ; 7 8
- vpshrdd m5, m4, m6, 16 ; 6 7
- vpdpwssd m17, m6, m15 ; b3
- vpdpwssd m0, m5, m15 ; a3
- packuswb m0, m17
- vpermb zmm1, m16, m0
- mova [dstq+dsq*0], xmm1
- vextracti128 [dstq+dsq*1], ymm1, 1
- lea dstq, [dstq+dsq*2]
+ vpshrdd m5, m4, m6, 16 ; 67
+ vpdpwssd m18, m6, m15 ; b3
+ vpdpwssd m17, m5, m15 ; a3
+ packuswb m17, m18
+ vpermb m17, m16, m17
+ mova [r7+dsq*0], xm17
+ vextracti128 [r7+dsq*1], ym17, 1
+ lea r7, [r7+dsq*2]
sub hd, 2
jg .hv_w16_loop
- add r4, 16
- add r7, 16
+ add srcq, 16
+ add dstq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_6tap_8bpc
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_6tap_8bpc
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3
+%define base r7-prep_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 6tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+.prep:
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [ssq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ tzcnt r5d, wd
+ lea myq, [base+subpel_filters+1+myq*8]
+ movzx r5d, word [r7+r5*2+table_offset(prep, _6tap_v)]
+ vpbroadcastd m7, [pw_8192]
+ sub srcq, ssq
+ vpbroadcastw m8, [myq+0]
+ add r5, r7
+ vpbroadcastw m9, [myq+2]
+ lea ss3q, [ssq*3]
+ vpbroadcastw m10, [myq+4]
+ sub srcq, ssq
+ jmp r5
+.v_w4:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ vpbroadcastd ymm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd ymm3, [srcq+ssq*0]
+ vpbroadcastd ymm0, [srcq+ssq*1]
+ vbroadcasti128 ymm5, [deint_shuf4]
+ vpblendd ymm1, ymm2, 0xeb
+ punpcklqdq ymm3, ymm0
+ vpblendd ymm1, ymm3, 0x60 ; 0 1 2 _ 2 3 4 _
+ pshufb ymm1, ymm5 ; 01 12 23 34
+.v_w4_loop:
+ pinsrd xmm0, [srcq+ssq*2], 1
+ vpbroadcastd ymm2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastd ymm3, [srcq+ssq*0]
+ vpblendd ymm2, ymm0, 0xeb
+ vpbroadcastd ymm0, [srcq+ssq*1]
+ punpcklqdq ymm3, ymm0
+ vpblendd ymm2, ymm3, 0x60 ; 4 5 6 _ 6 7 8 _
+ pshufb ymm2, ymm5 ; 45 56 67 78
+ pmaddubsw ymm3, ymm1, ym8 ; a0 b0 c0 d0
+ vperm2i128 ymm1, ymm2, 0x21 ; 23 34 45 56
+ pmaddubsw ymm4, ymm2, ym10 ; a2 b2 c2 d2
+ pmaddubsw ymm1, ym9 ; a1 b1 c1 d1
+ paddw ymm3, ymm4
+ paddw ymm3, ymm1
+ pmulhrsw ymm3, ym7
+ mova ymm1, ymm2
+ mova [tmpq], ymm3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ mova m6, [spel_v_perm8]
+ movq xm1, [srcq+ssq*0]
+ mov r6d, 0x3e
+ movq xm2, [srcq+ssq*1]
+ kmovb k1, r6d
+ vpbroadcastq ym3, [srcq+ssq*2]
+ add srcq, ss3q
+ vpunpcklqdq ym2, [srcq+ssq*0] {1to4}
+ vpunpcklqdq m1{k1}, m3, [srcq+ssq*1] {1to8}
+ movq xm0, [srcq+ssq*1]
+ kshiftlb k2, k1, 2
+ shufpd m1, m2, 0x18 ; 0 1 2 3 4
+ vpermb m1, m6, m1 ; 01 12 23 34
+.v_w8_loop:
+ vpbroadcastq ym3, [srcq+ss3q ]
+ vpunpcklqdq ym0{k1}, ym3, [srcq+ssq*2] {1to4}
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpunpcklqdq m0{k2}, m3, [srcq+ssq*0] {1to8}
+ pmaddubsw m4, m1, m8 ; a0 b0 c0 d0
+ vpermb m2, m6, m0 ; 45 56 67 78
+ mova xm0, xm3
+ vshufi32x4 m1, m2, q1032 ; 23 34 45 56
+ pmaddubsw m3, m2, m10 ; a2 b2 c2 d2
+ pmaddubsw m5, m1, m9 ; a1 b1 c1 d1
+ mova m1, m2
+ paddw m4, m3
+ paddw m4, m5
+ pmulhrsw m4, m7
+ mova [tmpq], m4
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mova m11, [spel_v_perm16b]
+ vbroadcasti32x4 m1, [srcq+ssq*0]
+ mov r6d, 0x0f
+ vbroadcasti32x4 ym3, [srcq+ssq*1]
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ kmovb k1, r6d
+ add srcq, ss3q
+ vbroadcasti32x4 ym4, [srcq+ssq*0]
+ vbroadcasti32x4 m0, [srcq+ssq*1]
+ vshufpd m1{k1}, m3, m2, 0xcc
+ vshufpd m2{k1}, m4, m0, 0xcc
+ vpermb m1, m11, m1 ; 01 12
+ vpermb m2, m11, m2 ; 23 34
+.v_w16_loop:
+ pmaddubsw m3, m1, m8 ; a0 b0
+ pmaddubsw m5, m2, m9 ; a1 b1
+ vbroadcasti32x4 ym6, [srcq+ssq*2]
+ pmaddubsw m4, m2, m8 ; c0 d0
+ vbroadcasti32x4 m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vshufpd m0{k1}, m6, m2, 0xcc
+ vbroadcasti32x4 ym6, [srcq+ssq*0]
+ vpermb m1, m11, m0 ; 45 56
+ vbroadcasti32x4 m0, [srcq+ssq*1]
+ vshufpd m2{k1}, m6, m0, 0xcc
+ pmaddubsw m6, m1, m9 ; c1 d1
+ vpermb m2, m11, m2 ; 67 78
+ paddw m3, m5
+ pmaddubsw m5, m1, m10 ; a2 b2
+ paddw m4, m6
+ pmaddubsw m6, m2, m10 ; c2 d2
+ paddw m3, m5
+ paddw m4, m6
+ pmulhrsw m3, m7
+ pmulhrsw m4, m7
+ mova [tmpq+ 0], m3
+ mova [tmpq+64], m4
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ movshdup m6, [bilin_v_perm64]
+ movu ym16, [srcq+ssq*0]
+ movu ym17, [srcq+ssq*1]
+ movu ym18, [srcq+ssq*2]
+ add srcq, ss3q
+ movu ym19, [srcq+ssq*0]
+ add srcq, ssq
+ movu ym20, [srcq+ssq*0]
+ vpermt2q m16, m6, m18 ; 0 2
+ vpermt2q m17, m6, m19 ; 1 3
+ vpermt2q m18, m6, m20 ; 2 4
+ punpcklbw m0, m16, m17 ; 01
+ punpcklbw m1, m17, m18 ; 12
+ punpckhbw m2, m16, m17 ; 23
+ punpckhbw m3, m17, m18 ; 34
+.v_w32_loop:
+ movu ym16, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movu ym17, [srcq+ssq*0]
+ pmaddubsw m4, m0, m8 ; a0
+ mova m0, m2
+ pmaddubsw m2, m9 ; a1
+ vpermt2q m16, m6, m17 ; 5 6
+ pmaddubsw m5, m1, m8 ; b0
+ mova m1, m3
+ pmaddubsw m3, m9 ; b1
+ shufpd m18, m16, 0x55 ; 4 5
+ paddw m4, m2
+ punpcklbw m2, m18, m16 ; 45
+ paddw m5, m3
+ punpckhbw m3, m18, m16 ; 56
+ mova m18, m16
+ pmaddubsw m16, m2, m10 ; a2
+ pmaddubsw m17, m3, m10 ; b2
+ paddw m4, m16
+ paddw m5, m17
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ mova [tmpq+ 0], m4
+ mova [tmpq+64], m5
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+.v_w128:
+ mova m6, [bilin_v_perm64]
+ add wd, wd
+ lea r6d, [hq+wq]
+.v_loop0:
+ vpermq m12, m6, [srcq+ssq*0]
+ vpermq m13, m6, [srcq+ssq*1]
+ lea r5, [srcq+ssq*2]
+ vpermq m14, m6, [r5 +ssq*0]
+ vpermq m15, m6, [r5 +ssq*1]
+ lea r5, [r5+ssq*2]
+ vpermq m16, m6, [r5 +ssq*0]
+ mov r7, tmpq
+ punpcklbw m0, m12, m13 ; 01
+ punpckhbw m12, m13
+ punpcklbw m1, m13, m14 ; 12
+ punpckhbw m13, m14
+ punpcklbw m2, m14, m15 ; 23
+ punpckhbw m14, m15
+ punpcklbw m3, m15, m16 ; 34
+ punpckhbw m15, m16
+.v_loop:
+ pmaddubsw m17, m0, m8 ; a0
+ vpermq m5, m6, [r5+ssq*1]
+ pmaddubsw m18, m12, m8
+ mova m0, m2
+ pmaddubsw m2, m9 ; a1
+ mova m12, m14
+ pmaddubsw m14, m9
+ lea r5, [r5+ssq*2]
+ pmaddubsw m19, m1, m8 ; b0
+ pmaddubsw m20, m13, m8
+ mova m1, m3
+ pmaddubsw m3, m9 ; b1
+ mova m13, m15
+ pmaddubsw m15, m9
+ paddw m17, m2
+ punpcklbw m2, m16, m5 ; 45
+ paddw m18, m14
+ punpckhbw m14, m16, m5
+ vpermq m16, m6, [r5+ssq*0]
+ paddw m19, m3
+ pmaddubsw m3, m2, m10 ; a2
+ paddw m20, m15
+ pmaddubsw m15, m14, m10
+ paddw m17, m3
+ punpcklbw m3, m5, m16 ; 56
+ pmaddubsw m4, m3, m10 ; b2
+ paddw m18, m15
+ punpckhbw m15, m5, m16
+ pmaddubsw m5, m15, m10
+ paddw m19, m4
+ paddw m20, m5
+ REPX {pmulhrsw x, m7}, m17, m18, m19, m20
+ mova [r7+wq*0+ 0], m17
+ mova [r7+wq*0+64], m18
+ mova [r7+wq*1+ 0], m19
+ mova [r7+wq*1+64], m20
+ lea r7, [r7+wq*2]
+ sub hd, 2
+ jg .v_loop
+ add srcq, 64
+ add tmpq, 128
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .v_loop0
+ vzeroupper
+ RET
+.h:
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2
+.hv:
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastd m9, [pd_32]
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m3, [base+subpel_filters+1+myq*8]
+ vbroadcasti128 m10, [subpel_h_shufA]
+ lea r6, [ssq*2+1]
+ mov r3d, 0x30
+ sub srcq, r6
+ kmovb k1, r3d
+ vpbroadcastq ym2, [srcq+ssq*0]
+ lea ss3q, [ssq*3]
+ vpbroadcastq m1, [srcq+ssq*1]
+ kaddb k2, k1, k1
+ vpbroadcastq m2{k1}, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1{k2}, [srcq+ssq*0] ; _ _ 1 3
+ punpcklbw m3, m3
+ vpbroadcastq m2{k2}, [srcq+ssq*1] ; _ 0 2 4
+ psraw m3, 8 ; sign-extend
+ mova m6, [spel_hv_perm4a]
+ kshiftrb k1, k1, 2
+ movu m7, [spel_hv_perm4b]
+ pshufb m1, m10
+ mova m0, m8
+ vpdpbusd m0, m1, m11
+ pshufb m2, m10
+ mova m1, m8
+ vpdpbusd m1, m2, m11
+ pshufd m12, m3, q0000
+ pshufd m13, m3, q1111
+ pshufd m14, m3, q2222
+ packssdw m0, m1 ; _ _ _ 0 1 2 3 4
+ psraw m0, 2
+ vpermb m1, m7, m0 ; 01 12 23 34
+.hv_w4_loop:
+ movq xm3, [srcq+ssq*2]
+ movq xm4, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq ym3{k1}, [srcq+ssq*0] ; 5 7
+ vpbroadcastq ym4{k1}, [srcq+ssq*1] ; 6 8
+ pshufb ym3, ym10
+ mova ym2, ym8
+ vpdpbusd ym2, ym3, ym11
+ pshufb ym4, ym10
+ mova ym3, ym8
+ vpdpbusd ym3, ym4, ym11
+ mova m4, m9
+ vpdpwssd m4, m1, m12 ; a0 b0 c0 d0
+ packssdw ym2, ym3 ; 5 6 7 8
+ psraw ym2, 2
+ vshufi32x4 m0, m2, q1032 ; _ 2 3 4 5 6 7 8
+ vpermb m2, m6, m0 ; 23 34 45 56
+ vpermb m1, m7, m0 ; 45 56 67 78
+ vpdpwssd m4, m2, m13 ; a1 b1 c1 d1
+ vpdpwssd m4, m1, m14 ; a2 b2 c2 d2
+ psrad m4, 6
+ vpmovdw [tmpq], m4
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
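+; .hv with w >= 8: the 8-tap horizontal filter is split into its two
+; dword-dot halves (m10/m11 for vpdpbusd) and combined with the three
+; 6-tap vertical coefficient pairs in m12-m14.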
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+1+myq*8]
+ lea r6, [ssq*2+3]
+ punpcklbw m0, m0
+ sub srcq, r6
+ psraw m0, 8 ; sign-extend
+ lea ss3q, [ssq*3]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ cmp wd, 8
+ jg .hv_w16
+ movu xm16, [srcq+ssq*0]
+ vbroadcasti32x4 m19, [subpel_h_shufA]
+ vinserti128 ym16, [srcq+ssq*1], 1
+ vbroadcasti32x4 m21, [subpel_h_shufC]
+ vinserti32x4 m16, [srcq+ssq*2], 2
+ add srcq, ss3q
+ vinserti32x4 m16, [srcq+ssq*0], 3
+ movu xm17, [srcq+ssq*1]
+ vbroadcasti32x4 m20, [subpel_h_shufB]
+ pshufb m3, m16, m19 ; 0 1 2 3 0123
+ mova m2, m8
+ pshufb m0, m16, m21 ; 0 1 2 3 89ab
+ vpdpbusd m2, m3, m10
+ mova m3, m8
+ pshufb xm1, xm17, xm19 ; 4 0123
+ vpdpbusd m3, m0, m11
+ mova xm0, xm8
+ pshufb xm18, xm17, xm21 ; 4 89ab
+ vpdpbusd xm0, xm1, xm10
+ mova xm1, xm8
+ pshufb m16, m20 ; 0 1 2 3 4567
+ vpdpbusd xm1, xm18, xm11
+ pshufb xm17, xm20 ; 4 4567
+ vpdpbusd m2, m16, m11
+ vpdpbusd m3, m16, m10
+ vpdpbusd xm0, xm17, xm11
+ vpdpbusd xm1, xm17, xm10
+ packssdw m2, m3
+ packssdw xm0, xm1
+ psraw m2, 2 ; 0 1 2 3
+ psraw xm0, 2 ; 4
+ valignq m0, m2, 2 ; 1 2 3 4
+ punpcklwd m1, m2, m0 ; 01 12 23 34
+ punpckhwd m2, m0
+.hv_w8_loop:
+ movu xm16, [srcq+ssq*2]
+ vinserti128 ym16, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ vinserti32x4 m16, [srcq+ssq*0], 2
+ vinserti32x4 m16, [srcq+ssq*1], 3
+ pshufb m6, m16, m19 ; 5 6 7 8 0123
+ mova m5, m8
+ pshufb m3, m16, m21 ; 5 6 7 8 89ab
+ vpdpbusd m5, m6, m10
+ mova m6, m8
+ pshufb m16, m20 ; 5 6 7 8 4567
+ vpdpbusd m6, m3, m11
+ mova m3, m9
+ vpdpwssd m3, m1, m12 ; a0 b0 c0 d0
+ mova m4, m9
+ vpdpwssd m4, m2, m12
+ vpdpbusd m5, m16, m11
+ vpdpbusd m6, m16, m10
+ mova m16, m1
+ packssdw m5, m6
+ mova m6, m2
+ psraw m5, 2 ; 5 6 7 8
+ valignq m2, m5, m0, 6 ; 4 5 6 7
+ mova m0, m5
+ punpcklwd m1, m2, m5 ; 45 56 67 78
+ punpckhwd m2, m5
+ vpdpwssd m3, m1, m14 ; a2 b2 c2 d2
+ vpdpwssd m4, m2, m14
+ vshufi32x4 m16, m1, q1032 ; 23 34 45 56
+ vshufi32x4 m6, m2, q1032
+ vpdpwssd m3, m16, m13 ; a1 b1 c1 d1
+ vpdpwssd m4, m6, m13
+ psrad m3, 6
+ psrad m4, 6
+ packssdw m3, m4
+ mova [tmpq], m3
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ vzeroupper
+ RET
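+; .hv_w16 also serves w32/w64/w128 by looping over 16-pixel column
+; tiles (r5/r7 walk src/tmp within a tile, r6d packs the tile counter
+; above the row count). spel_h_perm16 plus two pb_4-offset copies give
+; the three shifted source views needed per row pair.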
+.hv_w16:
+ mova m16, [spel_h_perm16]
+ vpbroadcastd m18, [pb_4]
+ add wd, wd
+ paddb m17, m18, m16
+ lea r6d, [hq+wq*8-256]
+ paddb m18, m17
+.hv_w16_loop0:
+ movu ym19, [srcq+ssq*0]
+ vinserti32x8 m19, [srcq+ssq*1], 1
+ lea r5, [srcq+ssq*2]
+ movu ym20, [r5 +ssq*0]
+ vinserti32x8 m20, [r5 +ssq*1], 1
+ lea r5, [r5 +ssq*2]
+ movu ym21, [r5 +ssq*0]
+ mov r7, tmpq
+ vpermb m3, m16, m19 ; 0 1 0123 89ab
+ mova m2, m8
+ vpermb m4, m18, m19 ; 0 1 89ab ghij
+ vpdpbusd m2, m3, m10
+ mova m3, m8
+ vpermb m5, m16, m20 ; 2 3 0123 89ab
+ vpdpbusd m3, m4, m11
+ mova m4, m8
+ vpermb m0, m18, m20 ; 2 3 89ab ghij
+ vpdpbusd m4, m5, m10
+ mova m5, m8
+ vpermb ym1, ym16, ym21 ; 4 0123 89ab
+ vpdpbusd m5, m0, m11
+ mova ym0, ym8
+ vpermb ym6, ym18, ym21 ; 4 89ab ghij
+ vpdpbusd ym0, ym1, ym10
+ mova ym1, ym8
+ vpermb m19, m17, m19 ; 0 1 4567 cdef
+ vpdpbusd ym1, ym6, ym11
+ vpermb m20, m17, m20 ; 2 3 4567 cdef
+ vpdpbusd m2, m19, m11
+ vpdpbusd m3, m19, m10
+ vpermb ym21, ym17, ym21 ; 4 4567 cdef
+ vpdpbusd m4, m20, m11
+ vpdpbusd m5, m20, m10
+ vpdpbusd ym0, ym21, ym11
+ vpdpbusd ym1, ym21, ym10
+ packssdw m2, m3 ; 0 1
+ packssdw m4, m5 ; 2 3
+ packssdw ym0, ym1 ; 4
+ REPX {psraw x, 2}, m2, m4, ym0
+ vshufi32x4 m3, m2, m4, q1032 ; 1 2
+ vshufi32x4 m0, m4, m0, q1032 ; 3 4
+ punpcklwd m1, m2, m3 ; 01 12
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m0 ; 23 34
+ punpckhwd m4, m0
+.hv_w16_loop:
+ movu ym19, [r5+ssq*1]
+ lea r5, [r5+ssq*2]
+ vinserti32x8 m19, [r5+ssq*0], 1
+ vpermb m6, m16, m19 ; 5 6 0123 89ab
+ mova m5, m8
+ vpermb m20, m18, m19 ; 5 6 89ab ghij
+ vpdpbusd m5, m6, m10
+ mova m6, m8
+ vpermb m19, m17, m19 ; 5 6 4567 cdef
+ vpdpbusd m6, m20, m11
+ mova m20, m9
+ vpdpwssd m20, m1, m12 ; a0 b0
+ mova m21, m9
+ vpdpwssd m21, m2, m12
+ vpdpbusd m5, m19, m11
+ vpdpbusd m6, m19, m10
+ vpdpwssd m20, m3, m13 ; a1 b1
+ vpdpwssd m21, m4, m13
+ packssdw m5, m6
+ mova m1, m3
+ psraw m5, 2 ; 5 6
+ mova m2, m4
+ vshufi32x4 m4, m0, m5, q1032 ; 4 5
+ mova m0, m5
+ punpcklwd m3, m4, m0 ; 45 56
+ punpckhwd m4, m0
+ vpdpwssd m20, m3, m14 ; a2 b2
+ vpdpwssd m21, m4, m14
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [r7+wq*0], ym20
+ vextracti32x8 [r7+wq*1], m20, 1
+ lea r7, [r7+wq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add srcq, 16
+ add tmpq, 32
movzx hd, r6b
- mov srcq, r4
- mov dstq, r7
sub r6d, 1<<8
jg .hv_w16_loop0
vzeroupper
@@ -2353,183 +3424,38 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
mova [tmpq+64*1], m1
%endmacro
-%if WIN64
-DECLARE_REG_TMP 6, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
-
-%define PREP_8TAP_FN FN prep_8tap,
-
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc
PREP_8TAP_FN sharp, SHARP, SHARP
-PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN regular, REGULAR, REGULAR
-cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3
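+; w is now loaded as a fourth named register argument, replacing the
+; explicit movsxd load removed below.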
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
lea r7, [prep_avx512icl]
- movsxd wq, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
- jnz .v
- tzcnt wd, wd
- movzx wd, word [r7+wq*2+table_offset(prep,)]
- add wq, r7
- lea r6, [strideq*3]
-%if WIN64
- pop r7
-%endif
- jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m4, [pd_2]
- WIN64_SPILL_XMM 10
- cmp wd, 4
- je .h_w4
- tzcnt wd, wd
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
- vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
- vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
- add wq, r7
- jmp wq
-.h_w4:
- movzx mxd, mxb
- vbroadcasti128 ym5, [subpel_h_shufA]
- mov r3d, 0x4
- dec srcq
- vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
- kmovb k1, r3d
- lea stride3q, [strideq*3]
-.h_w4_loop:
- movq xm2, [srcq+strideq*0]
- movq xm3, [srcq+strideq*1]
- vpbroadcastq ym2{k1}, [srcq+strideq*2]
- vpbroadcastq ym3{k1}, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- pshufb ym2, ym5
- pshufb ym3, ym5
- mova ym0, ym4
- vpdpbusd ym0, ym2, ym6
- mova ym1, ym4
- vpdpbusd ym1, ym3, ym6
- packssdw ym0, ym1
- psraw ym0, 2
- mova [tmpq], ym0
- add tmpq, 32
- sub hd, 4
- jg .h_w4_loop
- RET
-.h_w8:
- vbroadcasti128 m5, [subpel_h_shufA]
- vbroadcasti128 m6, [subpel_h_shufB]
- vbroadcasti128 m7, [subpel_h_shufC]
- lea stride3q, [strideq*3]
-.h_w8_loop:
- movu xmm3, [srcq+strideq*0]
- vinserti128 ym3, ymm3, [srcq+strideq*1], 1
- vinserti128 m3, [srcq+strideq*2], 2
- vinserti128 m3, [srcq+stride3q ], 3
- lea srcq, [srcq+strideq*4]
- pshufb m1, m3, m5
- pshufb m2, m3, m6
- mova m0, m4
- vpdpbusd m0, m1, m8
- mova m1, m4
- vpdpbusd m1, m2, m8
- pshufb m3, m7
- vpdpbusd m0, m2, m9
- vpdpbusd m1, m3, m9
- packssdw m0, m1
- psraw m0, 2
- mova [tmpq], m0
- add tmpq, 64
- sub hd, 4
- jg .h_w8_loop
- RET
-.h_w16:
- mova m5, [spel_h_perm16a]
- mova m6, [spel_h_perm16b]
- mova m7, [spel_h_perm16c]
- lea stride3q, [strideq*3]
-.h_w16_loop:
- movu ym0, [srcq+strideq*0]
- movu ym1, [srcq+strideq*2]
- vinserti32x8 m0, [srcq+strideq*1], 1
- vinserti32x8 m1, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- PREP_8TAP_H
- add tmpq, 64*2
- sub hd, 4
- jg .h_w16_loop
- RET
-.h_w32:
- mova m5, [spel_h_perm32a]
- mova m6, [spel_h_perm32b]
- mova m7, [spel_h_perm32c]
-.h_w32_loop:
- movu m0, [srcq+strideq*0]
- movu m1, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H
- add tmpq, 64*2
- sub hd, 2
- jg .h_w32_loop
- RET
-.h_w64:
- xor r6d, r6d
- jmp .h_start
-.h_w128:
- mov r6, -64*1
-.h_start:
- mova m5, [spel_h_perm32a]
- mova m6, [spel_h_perm32b]
- mova m7, [spel_h_perm32c]
- sub srcq, r6
- mov r5, r6
-.h_loop:
- movu m0, [srcq+r6+32*0]
- movu m1, [srcq+r6+32*1]
- PREP_8TAP_H
- add tmpq, 64*2
- add r6, 64
- jle .h_loop
- add srcq, strideq
- mov r6, r5
- dec hd
- jg .h_loop
- RET
+ jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep
.v:
movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
shr myd, 16 ; Note that the code is 8-tap only, having
- tzcnt wd, wd
cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
cmove myd, mxd ; had a negligible effect on performance.
- ; TODO: Would a 6-tap code path be worth it?
- lea myq, [r7+myq*8+subpel_filters-prep_avx512icl]
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
- add wq, r7
- lea stride3q, [strideq*3]
- sub srcq, stride3q
+ tzcnt r5d, wd
+ lea myq, [base+subpel_filters+myq*8]
+ movzx r5d, word [r7+r5*2+table_offset(prep, _8tap_v)]
vpbroadcastd m7, [pw_8192]
vpbroadcastw m8, [myq+0]
+ add r5, r7
vpbroadcastw m9, [myq+2]
+ lea stride3q, [strideq*3]
vpbroadcastw m10, [myq+4]
+ sub srcq, stride3q
vpbroadcastw m11, [myq+6]
- jmp wq
+ jmp r5
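+; The jump-table index is kept in r5 so that wd survives intact for
+; the w64/w128 paths below, which reuse it as the tmp row stride.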
.v_w4:
movd xmm0, [srcq+strideq*0]
vpbroadcastd ymm1, [srcq+strideq*2]
@@ -2576,172 +3502,146 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
vzeroupper
RET
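+; .v_w8 gathers the sliding row window with a single spel_v_perm8
+; byte permute per step instead of the previous chain of masked
+; qword blends.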
.v_w8:
- mov r3d, 0xf044
- kmovw k1, r3d
- kshiftrw k2, k1, 8
- movq xm0, [srcq+strideq*0]
- vpbroadcastq ym1, [srcq+strideq*1]
- vpbroadcastq m2, [srcq+strideq*2]
- vpbroadcastq m3, [srcq+stride3q ]
+ mova m6, [spel_v_perm8]
+ movq xm1, [srcq+strideq*0]
+ mov r6d, 0x3e
+ movq xm2, [srcq+strideq*1]
+ vpbroadcastq ym3, [srcq+strideq*2]
+ kmovb k1, r6d
+ vpbroadcastq ym4, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- vpbroadcastq m4, [srcq+strideq*0]
- vpbroadcastq m5, [srcq+strideq*1]
- vpbroadcastq m6, [srcq+strideq*2]
- vmovdqa64 ym0{k1}, ym1
- vmovdqa64 ym1{k1}, ym2
- vmovdqa64 m2{k1}, m3
- vmovdqa64 m3{k1}, m4
- vmovdqa64 m4{k1}, m5
- vmovdqa64 m5{k1}, m6
- punpcklbw ym0, ym1 ; 01 12 __ __
- punpcklbw m2, m3 ; 23 34 23 34
- punpcklbw m4, m5 ; 45 56 45 56
- vmovdqa64 m0{k2}, m2 ; 01 12 23 34
- vmovdqa64 m2{k2}, m4 ; 23 34 45 56
+ vpunpcklqdq m1{k1}, m3, [srcq+strideq*0] {1to8}
+ vpunpcklqdq m2{k1}, m4, [srcq+strideq*1] {1to8}
+ movq xm0, [srcq+strideq*2]
+ kshiftlb k2, k1, 2
+ shufpd m1, m2, 0x30 ; 0 1 2 3 4 5
+ vshufi32x4 m2, m1, m0, q0021 ; 2 3 4 5 6 _
+ vpermb m1, m6, m1 ; 01 12 23 34
+ vpermb m2, m6, m2 ; 23 34 45 56
.v_w8_loop:
- vpbroadcastq m1, [srcq+stride3q ]
+ vpbroadcastq ym3, [srcq+strideq*4]
+ vpunpcklqdq ym0{k1}, ym3, [srcq+stride3q] {1to4}
lea srcq, [srcq+strideq*4]
- vpbroadcastq m3, [srcq+strideq*0]
- vpbroadcastq m5, [srcq+strideq*1]
- pmaddubsw m14, m0, m8
- pmaddubsw m15, m2, m9
- vpblendmq m0{k1}, m6, m1
- vpblendmq m2{k1}, m1, m3
- vpbroadcastq m6, [srcq+strideq*2]
- paddw m14, m15
- punpcklbw m2, m0, m2 ; 67 78 67 78
- vpblendmq m12{k1}, m3, m5
- vpblendmq m13{k1}, m5, m6
- vpblendmq m0{k2}, m4, m2 ; 45 56 67 78
- punpcklbw m4, m12, m13 ; 89 9a 89 9a
- vmovdqa64 m2{k2}, m4 ; 67 78 89 9a
- pmaddubsw m12, m0, m10
- pmaddubsw m13, m2, m11
- paddw m14, m12
- paddw m14, m13
- pmulhrsw m14, m7
- mova [tmpq], m14
+ vpbroadcastq m3, [srcq+strideq*2]
+ vpunpcklqdq m0{k2}, m3, [srcq+strideq*1] {1to8}
+ pmaddubsw m4, m1, m8 ; a0 b0 c0 d0
+ mova m1, m2
+ pmaddubsw m5, m2, m9 ; a1 b1 c1 d1
+ vpermb m2, m6, m0 ; 67 78 89 9a
+ mova xm0, xm3
+ vshufi32x4 m1, m2, q1032 ; 45 56 67 78
+ pmaddubsw m3, m2, m11 ; a3 b3 c3 d3
+ paddw m4, m5
+ pmaddubsw m5, m1, m10 ; a2 b2 c2 d2
+ paddw m4, m3
+ paddw m4, m5
+ pmulhrsw m4, m7
+ mova [tmpq], m4
add tmpq, 64
sub hd, 4
jg .v_w8_loop
RET
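+; .v_w16: three masked vshufpd merges collect the row window, and
+; each spel_v_perm16b vpermb emits two interleaved row pairs at once.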
.v_w16:
- mov r3d, 0xf0
- kmovb k1, r3d
- vbroadcasti128 m0, [srcq+strideq*0]
- vbroadcasti128 m1, [srcq+strideq*1]
- vbroadcasti128 m2, [srcq+strideq*2]
- vbroadcasti128 m3, [srcq+stride3q ]
+ mova m12, [spel_v_perm16b]
+ vbroadcasti32x4 m1, [srcq+strideq*0]
+ mov r6d, 0x0f
+ vbroadcasti32x4 ym4, [srcq+strideq*1]
+ vbroadcasti32x4 m2, [srcq+strideq*2]
+ kmovb k1, r6d
+ vbroadcasti32x4 ym5, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- vbroadcasti128 m4, [srcq+strideq*0]
- vbroadcasti128 m5, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*2]
- vmovdqa64 m0{k1}, m1
- vmovdqa64 m1{k1}, m2
- vmovdqa64 m2{k1}, m3
- vmovdqa64 m3{k1}, m4
- vmovdqa64 m4{k1}, m5
- vmovdqa64 m5{k1}, m6
- shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
- shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
- shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
- shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
- punpckhbw m2, m0, m1 ; 23a 23b 34a 34b
- punpcklbw m0, m1 ; 01a 01b 12a 12b
- punpcklbw m4, m5 ; 45a 45b 56a 56b
+ vbroadcasti32x4 m3, [srcq+strideq*0]
+ vbroadcasti32x4 ym6, [srcq+strideq*1]
+ vbroadcasti32x4 m0, [srcq+strideq*2]
+ vshufpd m1{k1}, m4, m2, 0xcc
+ vshufpd m2{k1}, m5, m3, 0xcc
+ vshufpd m3{k1}, m6, m0, 0xcc
+ vpermb m1, m12, m1 ; 01 12
+ vpermb m2, m12, m2 ; 23 34
+ vpermb m3, m12, m3 ; 45 56
.v_w16_loop:
- vbroadcasti128 m3, [srcq+stride3q ]
+ pmaddubsw m4, m1, m8 ; a0 b0
+ mova m1, m3
+ pmaddubsw m13, m2, m9 ; a1 b1
+ vbroadcasti32x4 ym6, [srcq+stride3q ]
+ pmaddubsw m5, m2, m8 ; c0 d0
lea srcq, [srcq+strideq*4]
- vbroadcasti128 m5, [srcq+strideq*0]
- vpblendmq m1{k1}, m6, m3
- vmovdqa64 m3{k1}, m5
- pmaddubsw m12, m0, m8
- pmaddubsw m13, m2, m8
- pmaddubsw m14, m2, m9
- pmaddubsw m15, m4, m9
- pmaddubsw m0, m4, m10
- vbroadcasti128 m2, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*2]
- paddw m12, m14
- paddw m13, m15
- paddw m12, m0
- vmovdqa64 m5{k1}, m2
- vmovdqa64 m2{k1}, m6
- mova m0, m4
- shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
- shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
- punpcklbw m2, m1, m3 ; 67a 67b 78a 78b
- punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab
- pmaddubsw m14, m2, m10
- pmaddubsw m15, m2, m11
- paddw m13, m14
- paddw m12, m15
- pmaddubsw m14, m4, m11
- paddw m13, m14
- pmulhrsw m12, m7
- pmulhrsw m13, m7
- mova [tmpq+ 0], m12
- mova [tmpq+64], m13
+ pmaddubsw m14, m3, m9 ; c1 d1
+ vbroadcasti32x4 m3, [srcq+strideq*0]
+ vshufpd m0{k1}, m6, m3, 0xcc
+ vbroadcasti32x4 ym6, [srcq+strideq*1]
+ vpermb m2, m12, m0 ; 67 78
+ vbroadcasti32x4 m0, [srcq+strideq*2]
+ vshufpd m3{k1}, m6, m0, 0xcc
+ paddw m4, m13
+ pmaddubsw m13, m1, m10 ; a2 b2
+ vpermb m3, m12, m3 ; 89 9a
+ paddw m5, m14
+ pmaddubsw m14, m2, m10 ; c2 d2
+ pmaddubsw m15, m2, m11 ; a3 b3
+ pmaddubsw m6, m3, m11 ; c3 d3
+ paddw m4, m13
+ paddw m5, m14
+ paddw m4, m15
+ paddw m5, m6
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ mova [tmpq+ 0], m4
+ mova [tmpq+64], m5
add tmpq, 64*2
sub hd, 4
jg .v_w16_loop
RET
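+; .v_w32: vpermt2q packs rows n and n+3 into one register, so a
+; single punpcklbw/punpckhbw pair produces both the n,n+1 and
+; n+3,n+4 row pairs.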
.v_w32:
- mova m18, [bilin_v_perm64]
- movu ym0, [srcq+strideq*0]
- movu ym1, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym2, [srcq+strideq*0]
- movu ym3, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym4, [srcq+strideq*0]
- movu ym5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym6, [srcq+strideq*0]
- vpermq m0, m18, m0
- vpermq m1, m18, m1
- vpermq m2, m18, m2
- vpermq m3, m18, m3
- vpermq m4, m18, m4
- vpermq m5, m18, m5
- vpermq m6, m18, m6
- punpcklbw m0, m1
- punpcklbw m1, m2
- punpcklbw m2, m3
- punpcklbw m3, m4
- punpcklbw m4, m5
- punpcklbw m5, m6
+ movshdup m21, [bilin_v_perm64]
+ movu ym16, [srcq+strideq*0]
+ movu ym17, [srcq+strideq*1]
+ movu ym18, [srcq+strideq*2]
+ add srcq, stride3q
+ movu ym19, [srcq+strideq*0]
+ vpermt2q m16, m21, m19 ; 0 3
+ movu ym20, [srcq+strideq*1]
+ vpermt2q m17, m21, m20 ; 1 4
+ movu ym20, [srcq+strideq*2]
+ add srcq, stride3q
+ vpermt2q m18, m21, m20 ; 2 5
+ movu ym20, [srcq+strideq*0]
+ vpermt2q m19, m21, m20 ; 3 6
+ punpcklbw m0, m16, m17 ; 01
+ punpcklbw m1, m17, m18 ; 12
+ punpcklbw m2, m18, m19 ; 23
+ punpckhbw m3, m16, m17 ; 34
+ punpckhbw m4, m17, m18 ; 45
+ punpckhbw m5, m18, m19 ; 56
.v_w32_loop:
- movu ym12, [srcq+strideq*1]
+ movu ym16, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- movu ym13, [srcq+strideq*0]
+ movu ym17, [srcq+strideq*0]
pmaddubsw m14, m0, m8
- pmaddubsw m16, m2, m9
- pmaddubsw m15, m1, m8
- pmaddubsw m17, m3, m9
mova m0, m2
+ pmaddubsw m15, m1, m8
mova m1, m3
- vpermq m12, m18, m12
- vpermq m13, m18, m13
- paddw m14, m16
- paddw m15, m17
- pmaddubsw m16, m4, m10
- pmaddubsw m17, m5, m10
- punpcklbw m6, m12
- punpcklbw m12, m13
+ pmaddubsw m2, m9
+ vpermt2q m16, m21, m17 ; 7 8
+ pmaddubsw m3, m9
+ pmaddubsw m12, m4, m10
+ pmaddubsw m13, m5, m10
+ shufpd m19, m16, 0x55 ; 6 7
+ paddw m14, m2
mova m2, m4
+ punpcklbw m4, m19, m16 ; 67
+ paddw m15, m3
mova m3, m5
- paddw m14, m16
- paddw m15, m17
- pmaddubsw m16, m6, m11
- pmaddubsw m17, m12, m11
- mova m4, m6
- mova m5, m12
- paddw m14, m16
- paddw m15, m17
+ punpckhbw m5, m19, m16 ; 78
+ paddw m14, m12
+ paddw m15, m13
+ pmaddubsw m12, m4, m11
+ pmaddubsw m13, m5, m11
+ mova m19, m16
+ paddw m14, m12
+ paddw m15, m13
pmulhrsw m14, m7
pmulhrsw m15, m7
- mova m6, m13
mova [tmpq+ 0], m14
mova [tmpq+64], m15
add tmpq, 64*2
@@ -2750,154 +3650,241 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
vzeroupper
RET
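+; w64/w128: same column-tiled structure as above, with two extra row
+; pairs in flight for the 8-tap filter; r5/r7 walk src/tmp within a
+; 64-pixel tile so srcq/tmpq only advance once per tile.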
.v_w64:
- mov wd, 64
- jmp .v_start
.v_w128:
- mov wd, 128
-.v_start:
- WIN64_SPILL_XMM 27
- mova m26, [bilin_v_perm64]
- lea r6d, [hq+wq*2]
- mov r5, srcq
- mov r7, tmpq
+ WIN64_SPILL_XMM 24
+ mova m23, [bilin_v_perm64]
+ add wd, wd
+ lea r6d, [hq+wq]
.v_loop0:
- vpermq m0, m26, [srcq+strideq*0]
- vpermq m1, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m2, m26, [srcq+strideq*0]
- vpermq m3, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m4, m26, [srcq+strideq*0]
- vpermq m5, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m6, m26, [srcq+strideq*0]
- punpckhbw m12, m0, m1
- punpcklbw m0, m1
- punpckhbw m13, m1, m2
- punpcklbw m1, m2
- punpckhbw m14, m2, m3
- punpcklbw m2, m3
- punpckhbw m15, m3, m4
- punpcklbw m3, m4
- punpckhbw m16, m4, m5
- punpcklbw m4, m5
- punpckhbw m17, m5, m6
- punpcklbw m5, m6
+ vpermq m12, m23, [srcq+strideq*0]
+ vpermq m13, m23, [srcq+strideq*1]
+ lea r5, [srcq+strideq*2]
+ vpermq m14, m23, [r5 +strideq*0]
+ vpermq m15, m23, [r5 +strideq*1]
+ lea r5, [r5+strideq*2]
+ vpermq m16, m23, [r5 +strideq*0]
+ vpermq m17, m23, [r5 +strideq*1]
+ lea r5, [r5+strideq*2]
+ vpermq m18, m23, [r5 +strideq*0]
+ mov r7, tmpq
+ punpcklbw m0, m12, m13 ; 01
+ punpckhbw m12, m13
+ punpcklbw m1, m13, m14 ; 12
+ punpckhbw m13, m14
+ punpcklbw m2, m14, m15 ; 23
+ punpckhbw m14, m15
+ punpcklbw m3, m15, m16 ; 34
+ punpckhbw m15, m16
+ punpcklbw m4, m16, m17 ; 45
+ punpckhbw m16, m17
+ punpcklbw m5, m17, m18 ; 56
+ punpckhbw m17, m18
.v_loop:
- vpermq m18, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m19, m26, [srcq+strideq*0]
- pmaddubsw m20, m0, m8
- pmaddubsw m21, m12, m8
- pmaddubsw m22, m1, m8
- pmaddubsw m23, m13, m8
+ pmaddubsw m19, m0, m8 ; a0
+ vpermq m6, m23, [r5+strideq*1]
+ pmaddubsw m20, m12, m8
mova m0, m2
+ pmaddubsw m2, m9 ; a1
mova m12, m14
+ pmaddubsw m14, m9
+ lea r5, [r5+strideq*2]
+ pmaddubsw m21, m1, m8 ; b0
+ pmaddubsw m22, m13, m8
mova m1, m3
+ pmaddubsw m3, m9 ; b1
mova m13, m15
- pmaddubsw m2, m9
- pmaddubsw m14, m9
- pmaddubsw m3, m9
pmaddubsw m15, m9
- punpckhbw m24, m6, m18
- punpcklbw m6, m18
- paddw m20, m2
- paddw m21, m14
- paddw m22, m3
- paddw m23, m15
+ paddw m19, m2
mova m2, m4
+ pmaddubsw m4, m10 ; a2
+ paddw m20, m14
mova m14, m16
+ pmaddubsw m16, m10
+ paddw m21, m3
mova m3, m5
+ pmaddubsw m5, m10 ; b2
+ paddw m22, m15
mova m15, m17
- pmaddubsw m4, m10
- pmaddubsw m16, m10
- pmaddubsw m5, m10
pmaddubsw m17, m10
- punpckhbw m25, m18, m19
- punpcklbw m18, m19
- paddw m20, m4
- paddw m21, m16
- paddw m22, m5
- paddw m23, m17
- mova m4, m6
- mova m16, m24
- mova m5, m18
- mova m17, m25
- pmaddubsw m6, m11
- pmaddubsw m24, m11
- pmaddubsw m18, m11
- pmaddubsw m25, m11
- paddw m20, m6
- paddw m21, m24
- paddw m22, m18
- paddw m23, m25
- pmulhrsw m20, m7
- pmulhrsw m21, m7
- pmulhrsw m22, m7
- pmulhrsw m23, m7
- mova m6, m19
- mova [tmpq+wq*0+ 0], m20
- mova [tmpq+wq*0+64], m21
- mova [tmpq+wq*2+ 0], m22
- mova [tmpq+wq*2+64], m23
- lea tmpq, [tmpq+wq*4]
+ paddw m19, m4
+ punpcklbw m4, m18, m6 ; 67
+ paddw m20, m16
+ punpckhbw m16, m18, m6
+ vpermq m18, m23, [r5+strideq*0]
+ paddw m21, m5
+ pmaddubsw m5, m4, m11 ; a3
+ paddw m22, m17
+ pmaddubsw m17, m16, m11
+ paddw m19, m5
+ punpcklbw m5, m6, m18 ; 78
+ paddw m20, m17
+ punpckhbw m17, m6, m18
+ pmaddubsw m6, m5, m11 ; b3
+ paddw m21, m6
+ pmaddubsw m6, m17, m11
+ paddw m22, m6
+ REPX {pmulhrsw x, m7}, m19, m20, m21, m22
+ mova [r7+wq*0+ 0], m19
+ mova [r7+wq*0+64], m20
+ mova [r7+wq*1+ 0], m21
+ mova [r7+wq*1+64], m22
+ lea r7, [r7+wq*2]
sub hd, 2
jg .v_loop
- add r5, 64
- add r7, 128
+ add srcq, 64
+ add tmpq, 128
movzx hd, r6b
- mov srcq, r5
- mov tmpq, r7
sub r6d, 1<<8
jg .v_loop0
RET
-.hv:
- WIN64_SPILL_XMM 16
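+; The horizontal kernels are shared with prep_6tap, which enters at
+; .h2 after doing its own filter setup.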
+.h:
+ RESET_STACK_STATE
+ test myd, 0xf00
+ jnz .hv
+.h2:
+ vpbroadcastd m4, [pd_2]
cmp wd, 4
- je .hv_w4
+ je .h_w4
+ tzcnt wd, wd
shr mxd, 16
sub srcq, 3
- vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
- vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
- movzx mxd, myb
- shr myd, 16
- cmp hd, 4
- cmove myd, mxd
- tzcnt wd, wd
- vpbroadcastd m8, [pd_2]
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
- vpbroadcastd m9, [pd_32]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m9, [base+subpel_filters+mxq*8+4]
add wq, r7
- vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
- lea stride3q, [strideq*3]
- sub srcq, stride3q
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- pshufd m12, m0, q0000
- pshufd m13, m0, q1111
- pshufd m14, m0, q2222
- pshufd m15, m0, q3333
jmp wq
-.hv_w4:
+.h_w4:
+ movzx mxd, mxb
+ vbroadcasti128 ym5, [subpel_h_shufA]
+ mov r3d, 0x4
+ dec srcq
+ vpbroadcastd ym6, [base+subpel_filters+mxq*8+2]
+ kmovb k1, r3d
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm2, [srcq+strideq*0]
+ movq xm3, [srcq+strideq*1]
+ vpbroadcastq ym2{k1}, [srcq+strideq*2]
+ vpbroadcastq ym3{k1}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pshufb ym2, ym5
+ pshufb ym3, ym5
+ mova ym0, ym4
+ vpdpbusd ym0, ym2, ym6
+ mova ym1, ym4
+ vpdpbusd ym1, ym3, ym6
+ packssdw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
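+; .h_w8: three overlapping subpel_h_shuf views of each row feed two
+; vpdpbusd accumulators, applying the +0 and +4 filter halves to
+; adjacent views.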
+.h_w8:
+ vbroadcasti128 m5, [subpel_h_shufA]
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu xmm3, [srcq+strideq*0]
+ vinserti128 ym3, ymm3, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*2], 2
+ vinserti128 m3, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ mova m0, m4
+ vpdpbusd m0, m1, m8
+ mova m1, m4
+ vpdpbusd m1, m2, m8
+ pshufb m3, m7
+ vpdpbusd m0, m2, m9
+ vpdpbusd m1, m3, m9
+ packssdw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
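+; .h_w16/.h_w32 now derive their three shifted-source permutes from a
+; single spel_h_perm table plus pb_4 offsets instead of separate
+; a/b/c tables.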
+.h_w16:
+ mova m5, [spel_h_perm16]
+ vpbroadcastd m7, [pb_4]
+ lea stride3q, [strideq*3]
+ paddb m6, m7, m5
+ paddb m7, m6
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m5, [spel_h_perm32]
+ vpbroadcastd m7, [pb_4]
+ paddb m6, m7, m5
+ paddb m7, m6
+.h_w32_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32_loop
+ RET
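+; w64/w128 iterate r5 over 64-byte chunks of each row; r6 is the
+; negative initial offset for w128 (with srcq biased to compensate)
+; so both widths share the same inner loop.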
+.h_w64:
+ xor r6d, r6d
+ jmp .h_start
+.h_w128:
+ mov r6, -64*1
+.h_start:
+ mova m5, [spel_h_perm32]
+ vpbroadcastd m7, [pb_4]
+ sub srcq, r6
+ paddb m6, m7, m5
+ paddb m7, m6
+.h_loop0:
+ mov r5, r6
+.h_loop:
+ movu m0, [srcq+r5+32*0]
+ movu m1, [srcq+r5+32*1]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ add r5, 64
+ jle .h_loop
+ add srcq, strideq
+ dec hd
+ jg .h_loop0
+ RET
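+; .hv: pd_2 and pd_32 are the horizontal (>>2) and vertical (>>6)
+; rounding terms, kept in m8/m9 across all width paths.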
+.hv:
+ RESET_STACK_STATE
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastd m9, [pd_32]
+ cmp wd, 4
+ jg .hv_w8
movzx mxd, mxb
dec srcq
- vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+2]
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
- vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
lea stride3q, [strideq*3]
sub srcq, stride3q
mov r3d, 0x04
kmovb k1, r3d
kshiftlb k2, k1, 2
kshiftlb k3, k1, 4
- vpbroadcastd m10, [pd_2]
- vbroadcasti128 m16, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufA]
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
- vpbroadcastd m11, [pd_32]
pshufd m12, m0, q0000
pshufd m13, m0, q1111
pshufd m14, m0, q2222
@@ -2910,263 +3897,265 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
vpbroadcastq m3{k2}, [srcq+strideq*0]
vpbroadcastq m2{k3}, [srcq+strideq*1]
vpbroadcastq m3{k3}, [srcq+strideq*2]
- mova m17, [spel_hv_perm4a]
- movu m18, [spel_hv_perm4b]
- mova m0, m10
- mova m1, m10
- pshufb m2, m16
- pshufb m3, m16
- vpdpbusd m0, m2, m8
- vpdpbusd m1, m3, m8
+ mova m6, [spel_hv_perm4a]
+ movu m7, [spel_hv_perm4b]
+ mova m0, m8
+ mova m1, m8
+ pshufb m2, m10
+ pshufb m3, m10
+ vpdpbusd m0, m2, m11
+ vpdpbusd m1, m3, m11
packssdw m0, m1 ; _ 0 1 2 3 4 5 6
psraw m0, 2
- vpermb m1, m17, m0 ; 01 12 23 34
- vpermb m2, m18, m0 ; 23 34 45 56
+ vpermb m1, m6, m0 ; 01 12 23 34
+ vpermb m2, m7, m0 ; 23 34 45 56
.hv_w4_loop:
movq xm3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
movq xm4, [srcq+strideq*0]
vpbroadcastq ym3{k1}, [srcq+strideq*1]
vpbroadcastq ym4{k1}, [srcq+strideq*2]
- mova ym5, ym10
- mova ym6, ym10
- pshufb ym3, ym16
- pshufb ym4, ym16
- vpdpbusd ym5, ym3, ym8
- vpdpbusd ym6, ym4, ym8
- mova m7, m11
- packssdw ym5, ym6 ; 7 8 9 a _ _ _ _
- psraw ym5, 2
- valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a
- vpdpwssd m7, m1, m12
- vpdpwssd m7, m2, m13
- vpermb m1, m17, m0 ; 45 56 67 78
- vpermb m2, m18, m0 ; 67 78 89 9a
- vpdpwssd m7, m1, m14
- vpdpwssd m7, m2, m15
- psrad m7, 6
- vpmovdw [tmpq], m7
+ mova m5, m9
+ pshufb ym3, ym10
+ vpdpwssd m5, m1, m12 ; a0 b0 c0 d0
+ mova ym1, ym8
+ pshufb ym4, ym10
+ vpdpbusd ym1, ym3, ym11
+ mova ym3, ym8
+ vpdpbusd ym3, ym4, ym11
+ vpdpwssd m5, m2, m13 ; a1 b1 c1 d1
+ packssdw ym1, ym3 ; 7 8 9 a
+ psraw ym1, 2
+ vshufi32x4 m0, m1, q1032 ; _ 4 5 6 7 8 9 a
+ vpermb m1, m6, m0 ; 45 56 67 78
+ vpermb m2, m7, m0 ; 67 78 89 9a
+ vpdpwssd m5, m1, m14 ; a2 b2 c2 d2
+ vpdpwssd m5, m2, m15 ; a3 b3 c3 d3
+ psrad m5, 6
+ vpmovdw [tmpq], m5
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
- vzeroupper
RET
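+; .hv_w8 interleaves the horizontal dot products for incoming rows
+; with the vertical accumulation of the previous window to hide
+; vpdpbusd/vpdpwssd latency.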
.hv_w8:
- WIN64_SPILL_XMM 24
- vbroadcasti128 m16, [subpel_h_shufA]
- vbroadcasti128 m17, [subpel_h_shufB]
- vbroadcasti128 m18, [subpel_h_shufC]
- vinserti128 ym0, [srcq+strideq*0], 1
- vinserti128 m0, [srcq+strideq*1], 2
- vinserti128 m0, [srcq+strideq*2], 3
- movu xm1, [srcq+stride3q ]
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ cmp wd, 8
+ jg .hv_w16
+ vbroadcasti32x4 m17, [srcq+stride3q ]
+ vinserti32x4 m16, m17, [srcq+strideq*0], 0
+ vbroadcasti32x4 m19, [subpel_h_shufA]
+ vinserti32x4 m16, [srcq+strideq*1], 1
+ vbroadcasti32x4 m21, [subpel_h_shufC]
+ vinserti32x4 m16, [srcq+strideq*2], 2
lea srcq, [srcq+strideq*4]
- vinserti128 ym1, [srcq+strideq*0], 1
- vinserti128 m1, [srcq+strideq*1], 2
- vinserti128 m1, [srcq+strideq*2], 3
+ vinserti128 ym17, [srcq+strideq*0], 1
+ vbroadcasti32x4 m20, [subpel_h_shufB]
+ vinserti32x4 m17, [srcq+strideq*1], 2
+ vinserti32x4 m17, [srcq+strideq*2], 3
+ pshufb m3, m16, m19 ; 0 1 2 3 0123
mova m2, m8
- mova m4, m8
+ pshufb m0, m16, m21 ; 0 1 2 3 89ab
+ vpdpbusd m2, m3, m10
mova m3, m8
- mova m5, m8
- pshufb m20, m0, m16
- pshufb m21, m0, m17
- pshufb m22, m0, m18
- pshufb m23, m1, m16
- pshufb m6, m1, m17
- pshufb m7, m1, m18
- vpdpbusd m2, m20, m10
- vpdpbusd m4, m21, m10
- vpdpbusd m2, m21, m11
- vpdpbusd m4, m22, m11
- vpdpbusd m3, m23, m10
- vpdpbusd m5, m6, m10
- vpdpbusd m3, m6, m11
- vpdpbusd m5, m7, m11
- packssdw m2, m4
- packssdw m3, m5
- psraw m2, 2 ; _ 0 1 2
- psraw m3, 2 ; 3 4 5 6
- valignq m0, m3, m2, 2 ; 0 1 2 3
- valignq m1, m3, m2, 4 ; 1 2 3 4
- valignq m2, m3, m2, 6 ; 2 3 4 5
- punpcklwd m4, m0, m1 ; 01a 12a 23a 34a
- punpckhwd m5, m0, m1 ; 01b 12b 23b 34b
- punpcklwd m6, m2, m3 ; 23a 34a 45a 56a
- punpckhwd m7, m2, m3 ; 23b 34b 45b 56b
+ pshufb m1, m17, m19 ; 3 4 5 6 0123
+ vpdpbusd m3, m0, m11
+ mova m0, m8
+ pshufb m4, m17, m21 ; 3 4 5 6 89ab
+ vpdpbusd m0, m1, m10
+ mova m1, m8
+ pshufb m16, m20 ; 0 1 2 3 4567
+ vpdpbusd m1, m4, m11
+ pshufb m17, m20 ; 3 4 5 6 4567
+ vpdpbusd m2, m16, m11
+ vpdpbusd m3, m16, m10
+ vpdpbusd m0, m17, m11
+ vpdpbusd m1, m17, m10
+ packssdw m2, m3
+ packssdw m0, m1
+ psraw m2, 2 ; 0 1 2 3
+ psraw m0, 2 ; 3 4 5 6
+ vshufi32x4 m4, m2, m0, q2132 ; 2 3 4 5
+ vshufi32x4 m5, m2, m0, q1021 ; 1 2 3 4
+ punpcklwd m3, m4, m0 ; 23 34 45 56
+ punpckhwd m4, m0
+ punpcklwd m1, m2, m5 ; 01 12 23 34
+ punpckhwd m2, m5
.hv_w8_loop:
- movu xm19, [srcq+stride3q ]
+ movu xm18, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- vinserti128 ym19, [srcq+strideq*0], 1
- vinserti128 m19, [srcq+strideq*1], 2
- vinserti128 m19, [srcq+strideq*2], 3
- mova m20, m9
- mova m21, m9
- mova m22, m8
- mova m23, m8
- vpdpwssd m20, m4, m12
- vpdpwssd m21, m5, m12
- vpdpwssd m20, m6, m13
- vpdpwssd m21, m7, m13
- pshufb m0, m19, m16
- pshufb m1, m19, m17
- pshufb m2, m19, m18
- vpdpbusd m22, m0, m10
- vpdpbusd m23, m1, m10
- vpdpbusd m22, m1, m11
- vpdpbusd m23, m2, m11
- packssdw m22, m23
- psraw m22, 2 ; 7 8 9 A
- valignq m0, m22, m3, 2 ; 4 5 6 7
- valignq m1, m22, m3, 4 ; 5 6 7 8
- valignq m2, m22, m3, 6 ; 6 7 8 9
- mova m3, m22
- punpcklwd m4, m0, m1 ; 45a 56a 67a 78a
- punpckhwd m5, m0, m1 ; 45b 56b 67b 78b
- punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa
- punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab
- vpdpwssd m20, m4, m14
- vpdpwssd m21, m5, m14
- vpdpwssd m20, m6, m15
- vpdpwssd m21, m7, m15
- psrad m20, 6
- psrad m21, 6
- packssdw m20, m21
- mova [tmpq], m20
+ vinserti128 ym18, [srcq+strideq*0], 1
+ vinserti32x4 m18, [srcq+strideq*1], 2
+ vinserti32x4 m18, [srcq+strideq*2], 3
+ pshufb m17, m18, m19 ; 7 8 9 a 0123
+ mova m16, m8
+ pshufb m5, m18, m21 ; 7 8 9 a 89ab
+ vpdpbusd m16, m17, m10
+ mova m17, m8
+ pshufb m18, m20 ; 7 8 9 a 4567
+ vpdpbusd m17, m5, m11
+ mova m5, m9
+ vpdpwssd m5, m3, m13 ; a1 b1 c1 d1
+ mova m6, m9
+ vpdpwssd m6, m4, m13
+ vpdpbusd m16, m18, m11
+ vpdpbusd m17, m18, m10
+ vpdpwssd m5, m1, m12 ; a0 b0 c0 d0
+ mova m1, m3
+ vpdpwssd m6, m2, m12
+ mova m2, m4
+ packssdw m16, m17
+ psraw m16, 2 ; 7 8 9 a
+ valignq m4, m16, m0, 6 ; 6 7 8 9
+ mova m0, m16
+ punpcklwd m3, m4, m16 ; 67 78 89 9a
+ punpckhwd m4, m16
+ vpdpwssd m5, m3, m15 ; a3 b3 c3 d3
+ vpdpwssd m6, m4, m15
+ vshufi32x4 m1, m3, q1032 ; 45 56 67 78
+ vshufi32x4 m2, m4, q1032
+ vpdpwssd m5, m1, m14 ; a2 b2 c2 d2
+ vpdpwssd m6, m2, m14
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ mova [tmpq], m5
add tmpq, 64
sub hd, 4
jg .hv_w8_loop
+ vzeroupper
RET
.hv_w16:
- mov wd, 16*2
- jmp .hv_start
-.hv_w32:
- mov wd, 32*2
- jmp .hv_start
-.hv_w64:
- mov wd, 64*2
- jmp .hv_start
-.hv_w128:
- mov wd, 128*2
-.hv_start:
- WIN64_SPILL_XMM 31
- mova m16, [spel_h_perm16a]
- mova m17, [spel_h_perm16b]
- mova m18, [spel_h_perm16c]
+ WIN64_SPILL_XMM 23
+ mova m16, [spel_h_perm16]
+ vpbroadcastd m18, [pb_4]
+ add wd, wd
+ paddb m17, m18, m16
lea r6d, [hq+wq*8-256]
- mov r5, srcq
+ paddb m18, m17
+.hv_w16_loop0:
+ movu ym19, [srcq+strideq*0]
+ vinserti32x8 m19, [srcq+strideq*1], 1
+ lea r5, [srcq+strideq*2]
+ movu ym20, [r5 +strideq*0]
+ vinserti32x8 m20, [r5 +strideq*1], 1
+ lea r5, [r5 +strideq*2]
+ movu ym21, [r5 +strideq*0]
+ vinserti32x8 m21, [r5 +strideq*1], 1
+ lea r5, [r5 +strideq*2]
+ movu ym22, [r5 +strideq*0]
mov r7, tmpq
-.hv_loop0:
- movu ym0, [srcq+strideq*0]
- vinserti32x8 m0, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym1, [srcq+strideq*0]
- vinserti32x8 m1, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym2, [srcq+strideq*0]
- vinserti32x8 m2, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym3, [srcq+strideq*0]
+ vpermb m3, m16, m19 ; 0 1 0123 89ab
+ mova m2, m8
+ vpermb m4, m18, m19 ; 0 1 89ab ghij
+ vpdpbusd m2, m3, m10
+ mova m3, m8
+ vpermb m5, m16, m20 ; 2 3 0123 89ab
+ vpdpbusd m3, m4, m11
mova m4, m8
+ vpermb m6, m18, m20 ; 2 3 89ab ghij
+ vpdpbusd m4, m5, m10
mova m5, m8
+ vpermb m7, m16, m21 ; 4 5 0123 89ab
+ vpdpbusd m5, m6, m11
mova m6, m8
+ vpermb m0, m18, m21 ; 4 5 89ab ghij
+ vpdpbusd m6, m7, m10
mova m7, m8
- vpermb m19, m16, m0
- vpermb m20, m17, m0
- vpermb m21, m18, m0
- vpermb m22, m16, m1
- vpermb m23, m17, m1
- vpermb m24, m18, m1
- vpermb m25, m16, m2
- vpermb m26, m17, m2
- vpermb m27, m18, m2
- vpermb ym28, ym16, ym3
- vpermb ym29, ym17, ym3
- vpermb ym30, ym18, ym3
- mova m0, m8
- mova m1, m8
- mova ym2, ym8
- mova ym3, ym8
- vpdpbusd m4, m19, m10
- vpdpbusd m5, m20, m10
- vpdpbusd m6, m22, m10
- vpdpbusd m7, m23, m10
- vpdpbusd m0, m25, m10
- vpdpbusd m1, m26, m10
- vpdpbusd ym2, ym28, ym10
- vpdpbusd ym3, ym29, ym10
+ vpermb ym1, ym16, ym22 ; 6 0123 89ab
+ vpdpbusd m7, m0, m11
+ mova ym0, ym8
+ vpermb m19, m17, m19 ; 0 1 4567 cdef
+ vpdpbusd ym0, ym1, ym10
+ vpermb ym1, ym18, ym22 ; 6 89ab ghij
+ vpdpbusd m2, m19, m11
+ vpdpbusd m3, m19, m10
+ mova ym19, ym8
+ vpermb m20, m17, m20 ; 2 3 4567 cdef
+ vpdpbusd ym19, ym1, ym11
+ vpermb m21, m17, m21 ; 4 5 4567 cdef
vpdpbusd m4, m20, m11
- vpdpbusd m5, m21, m11
- vpdpbusd m6, m23, m11
- vpdpbusd m7, m24, m11
- vpdpbusd m0, m26, m11
- vpdpbusd m1, m27, m11
- vpdpbusd ym2, ym29, ym11
- vpdpbusd ym3, ym30, ym11
- packssdw m4, m5
- packssdw m6, m7
- packssdw m0, m1
- packssdw ym2, ym3
- psraw m4, 2 ; 0a 0b 1a 1b
- psraw m6, 2 ; 2a 2b 3a 3b
- psraw m0, 2 ; 4a 4b 5a 5b
- psraw ym2, 2 ; 6a 6b __ __
- vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b
- vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b
- vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b
- punpcklwd m2, m4, m5 ; 01a 01c 12a 12c
- punpckhwd m3, m4, m5 ; 01b 01d 12b 12d
- punpcklwd m4, m6, m7 ; 23a 23c 34a 34c
- punpckhwd m5, m6, m7 ; 23b 23d 34b 34d
- punpcklwd m6, m0, m1 ; 45a 45c 56a 56c
- punpckhwd m7, m0, m1 ; 45b 45d 56b 56d
-.hv_loop:
- movu ym19, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vinserti32x8 m19, [srcq+strideq*0], 1
+ vpdpbusd m5, m20, m10
+ vpermb ym22, ym17, ym22 ; 6 4567 cdef
+ vpdpbusd m6, m21, m11
+ vpdpbusd m7, m21, m10
+ packssdw m2, m3 ; 0 1
+ vpdpbusd ym0, ym22, ym11
+ packssdw m4, m5 ; 2 3
+ vpdpbusd ym19, ym22, ym10
+ packssdw m6, m7 ; 4 5
+ packssdw ym0, ym19 ; 6
+ REPX {psraw x, 2}, m2, m4, m6, ym0
+ vshufi32x4 m3, m2, m4, q1032 ; 1 2
+ vshufi32x4 m5, m4, m6, q1032 ; 3 4
+ vshufi32x4 m0, m6, m0, q1032 ; 5 6
+ punpcklwd m1, m2, m3 ; 01 12
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m5 ; 23 34
+ punpckhwd m4, m5
+ punpcklwd m5, m6, m0 ; 45 56
+ punpckhwd m6, m0
+.hv_w16_loop:
+ movu ym19, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ vinserti32x8 m19, [r5+strideq*0], 1
mova m20, m9
+ vpdpwssd m20, m1, m12 ; a0
+ vpermb m1, m16, m19
mova m21, m9
+ vpdpwssd m21, m2, m12 ; b0
+ vpermb m2, m17, m19
mova m22, m8
- mova m23, m8
- vpdpwssd m20, m2, m12
- vpdpwssd m21, m3, m12
- vpdpwssd m20, m4, m13
- vpdpwssd m21, m5, m13
- vpermb m24, m16, m19
- vpermb m25, m17, m19
- vpermb m26, m18, m19
- vpdpbusd m22, m24, m10
- vpdpbusd m23, m25, m10
- vpdpbusd m22, m25, m11
- vpdpbusd m23, m26, m11
- packssdw m22, m23
- psraw m22, 2 ; 7a 7b 8a 8b
- vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b
+ vpdpbusd m22, m1, m10
+ mova m1, m8
+ vpermb m19, m18, m19
+ vpdpbusd m1, m2, m10
+ vpdpwssd m20, m3, m13 ; a1
+ vpdpwssd m21, m4, m13 ; b1
+ vpdpbusd m22, m2, m11
mova m2, m4
- mova m3, m5
- mova m1, m22
+ vpdpbusd m1, m19, m11
mova m4, m6
- mova m5, m7
- punpcklwd m6, m0, m1 ; 67a 67c 78a 78c
- punpckhwd m7, m0, m1 ; 67b 67d 78b 78d
- vpdpwssd m20, m4, m14
- vpdpwssd m21, m5, m14
- vpdpwssd m20, m6, m15
- vpdpwssd m21, m7, m15
+ vpdpwssd m20, m5, m14 ; a2
+ vpdpwssd m21, m6, m14 ; b2
+ packssdw m22, m1
+ mova m1, m3
+ psraw m22, 2 ; 7 8
+ mova m3, m5
+ vshufi32x4 m6, m0, m22, q1032 ; 6 7
+ mova m0, m22
+ punpcklwd m5, m6, m0 ; 67 78
+ punpckhwd m6, m0
+ vpdpwssd m20, m5, m15 ; a3
+ vpdpwssd m21, m6, m15 ; b3
psrad m20, 6
psrad m21, 6
packssdw m20, m21
- mova [tmpq+wq*0], ym20
- vextracti32x8 [tmpq+wq*1], m20, 1
- lea tmpq, [tmpq+wq*2]
+ mova [r7+wq*0], ym20
+ vextracti32x8 [r7+wq*1], m20, 1
+ lea r7, [r7+wq*2]
sub hd, 2
- jg .hv_loop
- add r5, 16
- add r7, 32
+ jg .hv_w16_loop
+ add srcq, 16
+ add tmpq, 32
movzx hd, r6b
- mov srcq, r5
- mov tmpq, r7
sub r6d, 1<<8
- jg .hv_loop0
+ jg .hv_w16_loop0
RET
cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts