summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/x86/mc_avx2.asm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-12 05:35:29 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-12 05:35:29 +0000
commit59203c63bb777a3bacec32fb8830fba33540e809 (patch)
tree58298e711c0ff0575818c30485b44a2f21bf28a0 /third_party/dav1d/src/x86/mc_avx2.asm
parentAdding upstream version 126.0.1. (diff)
downloadfirefox-59203c63bb777a3bacec32fb8830fba33540e809.tar.xz
firefox-59203c63bb777a3bacec32fb8830fba33540e809.zip
Adding upstream version 127.0.upstream/127.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/mc_avx2.asm')
-rw-r--r--third_party/dav1d/src/x86/mc_avx2.asm1471
1 files changed, 1190 insertions, 281 deletions
diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm
index 58e3cb5af1..df8bebb1cb 100644
--- a/third_party/dav1d/src/x86/mc_avx2.asm
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
@@ -60,15 +60,14 @@ subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 1
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
-bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
wm_420_sign: dd 0x01020102, 0x01010101
wm_422_sign: dd 0x80808080, 0x7f7f7f7f
@@ -95,6 +94,7 @@ pq_0x40000000: dq 0x40000000
cextern mc_subpel_filters
cextern mc_warp_filter2
cextern resize_filter
+cextern z_filter_s
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
@@ -184,7 +184,9 @@ BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 6tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 6tap, avx2, 1, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
@@ -298,7 +300,7 @@ INIT_YMM avx2
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
imul mxyd, 255
- vbroadcasti128 m4, [bilin_h_shuf8]
+ vbroadcasti128 m4, [z_filter_s+2]
add mxyd, 16
movd xm5, mxyd
mov mxyd, r7m ; my
@@ -900,7 +902,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
imul mxyd, 255
- vbroadcasti128 m4, [bilin_h_shuf8]
+ vbroadcasti128 m4, [z_filter_s+2]
add mxyd, 16
movd xm5, mxyd
mov mxyd, r6m ; my
@@ -1436,7 +1438,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
-%macro FN 4 ; fn, type, type_h, type_v
+%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
mov t0d, FILTER_%3
%ifidn %3, %4
@@ -1444,8 +1446,8 @@ cglobal %1_%2_8bpc
%else
mov t1d, FILTER_%4
%endif
-%ifnidn %2, regular ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%if %0 == 5 ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
@@ -1456,28 +1458,24 @@ DECLARE_REG_TMP 7, 8
%endif
%define PUT_8TAP_FN FN put_8tap,
-PUT_8TAP_FN sharp, SHARP, SHARP
-PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
-cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
+ add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx2]
- movsxd wq, wm
+ mov wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
+.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
@@ -1487,36 +1485,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
pop r8
%endif
jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
- WIN64_SPILL_XMM 11
- cmp wd, 4
- jl .h_w2
- vbroadcasti128 m6, [subpel_h_shufA]
- je .h_w4
- tzcnt wd, wd
- vbroadcasti128 m7, [subpel_h_shufB]
- vbroadcasti128 m8, [subpel_h_shufC]
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
- vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
- vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
- add wq, r8
- jmp wq
.h_w2:
movzx mxd, mxb
- dec srcq
- mova xm4, [subpel_h_shuf4]
- vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+ lea srcq, [srcq-1]
+ vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2]
+ je .h_w4
+ mova xm3, [subpel_h_shuf4]
.h_w2_loop:
movq xm0, [srcq+ssq*0]
movhps xm0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- pshufb xm0, xm4
- pmaddubsw xm0, xm3
+ pshufb xm0, xm3
+ pmaddubsw xm0, xm4
phaddw xm0, xm0
paddw xm0, xm5
psraw xm0, 6
@@ -1528,17 +1508,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_w2_loop
RET
.h_w4:
- movzx mxd, mxb
- dec srcq
- vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+ mova xm3, [subpel_h_shufA]
.h_w4_loop:
movq xm0, [srcq+ssq*0]
movq xm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- pshufb xm0, xm6
- pshufb xm1, xm6
- pmaddubsw xm0, xm3
- pmaddubsw xm1, xm3
+ pshufb xm0, xm3
+ pshufb xm1, xm3
+ pmaddubsw xm0, xm4
+ pmaddubsw xm1, xm4
phaddw xm0, xm1
paddw xm0, xm5
psraw xm0, 6
@@ -1549,25 +1527,43 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
sub hd, 2
jg .h_w4_loop
RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ cmp wd, 4
+ jle .h_w2
+ WIN64_SPILL_XMM 11
+ tzcnt wd, wd
+ vbroadcasti128 m4, [z_filter_s+ 2] ; 01
+ shr mxd, 16
+ vbroadcasti128 m6, [z_filter_s+ 6] ; 23
+ sub srcq, 2
+ vbroadcasti128 m7, [z_filter_s+10] ; 45
+ lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
+ movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)]
+ vpbroadcastw m8, [mxq+0]
+ vpbroadcastw m9, [mxq+2]
+ add wq, r8
+ vpbroadcastw m10, [mxq+4]
+ jmp wq
.h_w8:
-%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
- pshufb m%2, m%1, m7
- pshufb m%3, m%1, m8
- pshufb m%1, m6
- pmaddubsw m%4, m%2, m9
- pmaddubsw m%2, m10
- pmaddubsw m%3, m10
- pmaddubsw m%1, m9
- paddw m%3, m%4
+%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
+ pshufb m%2, m%1, m4
+ pmaddubsw m%2, m8
+ pshufb m%3, m%1, m6
+ pmaddubsw m%3, m9
+ pshufb m%1, m7
+ pmaddubsw m%1, m10
+ paddw m%2, m5
+ paddw m%1, m%3
paddw m%1, m%2
- phaddw m%1, m%3
- paddw m%1, m5
psraw m%1, 6
%endmacro
movu xm0, [srcq+ssq*0]
vinserti128 m0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 0, 1, 2, 3
+ PUT_6TAP_H 0, 1, 2
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movq [dstq+dsq*0], xm0
@@ -1581,9 +1577,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vinserti128 m0, [srcq+ssq*1+8*0], 1
movu xm1, [srcq+ssq*0+8*1]
vinserti128 m1, [srcq+ssq*1+8*1], 1
- PUT_8TAP_H 0, 2, 3, 4
+ PUT_6TAP_H 0, 2, 3
lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 1, 2, 3, 4
+ PUT_6TAP_H 1, 2, 3
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
@@ -1606,8 +1602,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
.h_loop:
movu m0, [srcq+r6+8*0]
movu m1, [srcq+r6+8*1]
- PUT_8TAP_H 0, 2, 3, 4
- PUT_8TAP_H 1, 2, 3, 4
+ PUT_6TAP_H 0, 2, 3
+ PUT_6TAP_H 1, 2, 3
packuswb m0, m1
mova [dstq+r6], m0
add r6, 32
@@ -1619,7 +1615,421 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_loop
RET
.v:
- WIN64_SPILL_XMM 16
+ WIN64_SPILL_XMM 9, 12
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
+ vpbroadcastd m8, [pw_512]
+ lea myq, [r8+myq*8+subpel_filters+1-put_avx2]
+ vpbroadcastw m5, [myq+0]
+ vpbroadcastw m6, [myq+2]
+ vpbroadcastw m7, [myq+4]
+ add r6, r8
+ mov nsq, ssq
+ neg nsq
+ jmp r6
+.v_w2:
+ movd xm2, [srcq+nsq*2]
+ pinsrw xm2, [srcq+nsq*1], 2
+ pinsrw xm2, [srcq+ssq*0], 4
+ pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastd xm0, [srcq+ssq*0]
+ palignr xm3, xm0, xm2, 4 ; 1 2 3 4
+ punpcklbw xm1, xm2, xm3 ; 01 12
+ punpckhbw xm2, xm3 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xm3, xm1, xm5 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm6 ; a1 b1
+ paddw xm3, xm2
+ vpblendd xm2, xm0, xm4, 0x02 ; 4 5
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 5 6
+ punpcklbw xm2, xm4 ; 67 78
+ pmaddubsw xm4, xm2, xm7 ; a3 b3
+ paddw xm3, xm4
+ pmulhrsw xm3, xm8
+ packuswb xm3, xm3
+ pextrw [dstq+dsq*0], xm3, 0
+ pextrw [dstq+dsq*1], xm3, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm2, [srcq+nsq*2]
+ pinsrd xm2, [srcq+nsq*1], 1
+ pinsrd xm2, [srcq+ssq*0], 2
+ pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastd xm0, [srcq+ssq*0]
+ palignr xm3, xm0, xm2, 4 ; 1 2 3 4
+ punpcklbw xm1, xm2, xm3 ; 01 12
+ punpckhbw xm2, xm3 ; 23 34
+.v_w4_loop:
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xm3, xm1, xm5 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm6 ; a1 b1
+ paddw xm3, xm2
+ vpblendd xm2, xm0, xm4, 0x02 ; 4 5
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 5 6
+ punpcklbw xm2, xm4 ; 45 56
+ pmaddubsw xm4, xm2, xm7 ; a2 b2
+ paddw xm3, xm4
+ pmulhrsw xm3, xm8
+ packuswb xm3, xm3
+ movd [dstq+dsq*0], xm3
+ pextrd [dstq+dsq*1], xm3, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+nsq*2]
+ vpbroadcastq m3, [srcq+nsq*1]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m1, m3, 0x30
+ vpblendd m3, m2, 0x30
+ punpcklbw m1, m3 ; 01 12
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 23 34
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m3, m1, m5 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m6 ; a1 b1
+ paddw m3, m2
+ vpblendd m2, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 45 56
+ pmaddubsw m4, m2, m7 ; a2 b2
+ paddw m3, m4
+ pmulhrsw m3, m8
+ vextracti128 xm4, m3, 1
+ packuswb xm3, xm4
+ movq [dstq+dsq*0], xm3
+ movhps [dstq+dsq*1], xm3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-128]
+ WIN64_PUSH_XMM 12
+ lea r6d, [hq+r6*2]
+.v_w16_loop0:
+ vbroadcasti128 m3, [srcq+nsq*2]
+ vbroadcasti128 m4, [srcq+nsq*1]
+ lea r4, [srcq+ssq*2]
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ mov r7, dstq
+ vbroadcasti128 m2, [r4+ssq*0]
+ shufpd m3, m0, 0x0c
+ shufpd m4, m1, 0x0c
+ punpcklbw m1, m3, m4 ; 01
+ punpckhbw m3, m4 ; 23
+ shufpd m0, m2, 0x0c
+ punpcklbw m2, m4, m0 ; 12
+ punpckhbw m4, m0 ; 34
+.v_w16_loop:
+ vbroadcasti128 m9, [r4+ssq*1]
+ pmaddubsw m10, m1, m5 ; a0
+ lea r4, [r4+ssq*2]
+ pmaddubsw m11, m2, m5 ; b0
+ mova m1, m3
+ pmaddubsw m3, m6 ; a1
+ mova m2, m4
+ pmaddubsw m4, m6 ; b1
+ paddw m10, m3
+ vbroadcasti128 m3, [r4+ssq*0]
+ paddw m11, m4
+ shufpd m4, m0, m9, 0x0d
+ shufpd m0, m9, m3, 0x0c
+ punpcklbw m3, m4, m0 ; 45
+ punpckhbw m4, m0 ; 56
+ pmaddubsw m9, m3, m7 ; a2
+ paddw m10, m9
+ pmaddubsw m9, m4, m7 ; b2
+ paddw m11, m9
+ pmulhrsw m10, m8
+ pmulhrsw m11, m8
+ packuswb m10, m11
+ vpermq m10, m10, q3120
+ mova [r7+dsq*0], xm10
+ vextracti128 [r7+dsq*1], m10, 1
+ lea r7, [r7+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add srcq, 16
+ add dstq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ WIN64_SPILL_XMM 12, 16
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
+ vpbroadcastd m7, [pw_8192]
+ punpcklbw m0, m0
+ vpbroadcastd m8, [pd_512]
+ psraw m0, 8 ; sign-extend
+ mov nsq, ssq
+ pshufd m9, m0, q0000
+ neg nsq
+ pshufd m10, m0, q1111
+ pshufd m11, m0, q2222
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m5, [subpel_h_shuf4]
+ movq xm2, [srcq+nsq*2]
+ movhps xm2, [srcq+nsq*1]
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m2, m1, 0x30
+ pshufb m2, m5
+ pshufb xm0, xm5
+ pmaddubsw m2, m6
+ pmaddubsw xm0, xm6
+ phaddw m2, m0
+ pmulhrsw m2, m7
+ vextracti128 xm0, m2, 1
+ palignr xm0, xm2, 4
+ punpcklwd xm1, xm2, xm0 ; 01 12
+ punpckhwd xm2, xm0 ; 23 34
+.hv_w2_loop:
+ movq xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm4, [srcq+ssq*0]
+ pshufb xm4, xm5
+ pmaddubsw xm4, xm6
+ pmaddwd xm3, xm9, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm10 ; a1 b1
+ phaddw xm4, xm4
+ paddd xm3, xm2
+ pmulhrsw xm4, xm7
+ palignr xm2, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm2, xm4 ; 45 56
+ pmaddwd xm4, xm11, xm2 ; a2 b2
+ paddd xm3, xm8
+ paddd xm3, xm4
+ psrad xm3, 10
+ packssdw xm3, xm3
+ packuswb xm3, xm3
+ pextrw [dstq+dsq*0], xm3, 0
+ pextrw [dstq+dsq*1], xm3, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m5, [subpel_h_shuf4]
+ vpbroadcastq m2, [srcq+nsq*2]
+ vpbroadcastq m4, [srcq+nsq*1]
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m1, m3, 0xcc ; 2 3
+ pshufb m2, m5
+ pshufb m1, m5
+ pshufb m0, m5
+ pmaddubsw m2, m6
+ pmaddubsw m1, m6
+ pmaddubsw m0, m6
+ phaddw m2, m1
+ phaddw m0, m0
+ pmulhrsw m2, m7
+ pmulhrsw m0, m7
+ palignr m3, m0, m2, 4
+ punpcklwd m1, m2, m3 ; 01 12
+ punpckhwd m2, m3 ; 23 34
+.hv_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m3, m9, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m10 ; a1 b1
+ paddd m3, m2
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpblendd m4, m2, 0xcc ; 5 6
+ pshufb m4, m5
+ pmaddubsw m4, m6
+ phaddw m4, m4
+ pmulhrsw m4, m7
+ palignr m2, m4, m0, 12
+ mova m0, m4
+ punpcklwd m2, m4 ; 45 56
+ pmaddwd m4, m11, m2 ; a2 b2
+ paddd m3, m8
+ paddd m3, m4
+ psrad m3, 10
+ vextracti128 xm4, m3, 1
+ packssdw xm3, xm4
+ packuswb xm3, xm3
+ pshuflw xm3, xm3, q3120
+ movd [dstq+dsq*0], xm3
+ pextrd [dstq+dsq*1], xm3, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 2
+ lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
+ WIN64_PUSH_XMM 16
+ vpbroadcastw m10, [mxq+0]
+ vpbroadcastw m11, [mxq+2]
+ vpbroadcastw m12, [mxq+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
+ lea r6d, [wq*8-64]
+ vbroadcasti128 m8, [z_filter_s+ 6]
+ punpcklbw m0, m0
+ vbroadcasti128 m9, [z_filter_s+10]
+ psraw m0, 8 ; sign-extend
+ mov nsq, ssq
+ pshufd m13, m0, q0000
+ neg nsq
+ pshufd m14, m0, q1111
+ lea r6d, [hq+r6*4]
+ pshufd m15, m0, q2222
+.hv_w8_loop0:
+ vbroadcasti128 m7, [z_filter_s+2]
+ movu xm3, [srcq+nsq*2]
+ lea r4, [srcq+ssq*2]
+ movu xm4, [srcq+nsq*1]
+ vbroadcasti128 m0, [srcq+ssq*0]
+ mov r7, dstq
+ vinserti128 m4, [srcq+ssq*1], 1 ; 1 3
+ vpblendd m3, m0, 0xf0 ; 0 2
+ vinserti128 m0, [r4+ssq*0], 1 ; 2 4
+ vpbroadcastd m5, [pw_8192]
+%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3]
+ pshufb %2, %1, %4
+ pmaddubsw %2, m10
+ pshufb %3, %1, %5
+ pmaddubsw %3, m11
+ pshufb %1, %6
+ pmaddubsw %1, m12
+ paddw %2, %3
+ paddw %1, %2
+%endmacro
+ HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9
+ HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9
+ HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m0, m0, q3120
+ pmulhrsw m3, m5
+ pmulhrsw m4, m5
+ pmulhrsw m0, m5
+ punpcklwd m1, m3, m4 ; 01
+ punpckhwd m3, m4 ; 23
+ punpcklwd m2, m4, m0 ; 12
+ punpckhwd m4, m0 ; 34
+.hv_w8_loop:
+ movu xm7, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ vinserti128 m7, [r4+ssq*0], 1 ; 5 6
+ pmaddwd m5, m13, m1 ; a0
+ mova m1, m3
+ pmaddwd m6, m13, m2 ; b0
+ mova m2, m4
+ pmaddwd m3, m14 ; a1
+ pmaddwd m4, m14 ; b1
+ paddd m5, m3
+ vbroadcasti128 m3, [z_filter_s+2]
+ paddd m6, m4
+ HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9
+ vpbroadcastd m3, [pw_8192]
+ vpbroadcastd m4, [pd_512]
+ pmulhrsw m7, m3
+ paddd m5, m4
+ paddd m6, m4
+ mova m4, m0
+ vpermq m0, m7, q3120
+ shufpd m4, m0, 0x05
+ punpcklwd m3, m4, m0 ; 45
+ pmaddwd m7, m15, m3 ; a2
+ punpckhwd m4, m0 ; 67
+ paddd m5, m7
+ pmaddwd m7, m15, m4 ; b2
+ paddd m6, m7
+ psrad m5, 10
+ psrad m6, 10
+ packssdw m5, m6
+ vextracti128 xm6, m5, 1
+ packuswb xm5, xm6
+ pshufd xm5, xm5, q3120
+ movq [r7+dsq*0], xm5
+ movhps [r7+dsq*1], xm5
+ lea r7, [r7+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add srcq, 8
+ add dstq, 8
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc
+PUT_8TAP_FN sharp, SHARP, SHARP
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put
+.v:
+ WIN64_SPILL_XMM 12, 15
movzx mxd, myb
shr myd, 16
cmp hd, 6
@@ -1765,19 +2175,19 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
.v_w64:
.v_w128:
lea r6d, [wq*8-128]
- mov r4, srcq
- mov r7, dstq
+ WIN64_PUSH_XMM 15
lea r6d, [hq+r6*2]
.v_w16_loop0:
vbroadcasti128 m4, [srcq+ssq*0]
vbroadcasti128 m5, [srcq+ssq*1]
+ lea r4, [srcq+ss3q]
vbroadcasti128 m6, [srcq+ssq*2]
- add srcq, ss3q
- vbroadcasti128 m0, [srcq+ssq*0]
- vbroadcasti128 m1, [srcq+ssq*1]
- vbroadcasti128 m2, [srcq+ssq*2]
- add srcq, ss3q
- vbroadcasti128 m3, [srcq+ssq*0]
+ vbroadcasti128 m0, [r4+ssq*0]
+ mov r7, dstq
+ vbroadcasti128 m1, [r4+ssq*1]
+ vbroadcasti128 m2, [r4+ssq*2]
+ add r4, ss3q
+ vbroadcasti128 m3, [r4+ssq*0]
shufpd m4, m0, 0x0c
shufpd m5, m1, 0x0c
punpcklbw m1, m4, m5 ; 01
@@ -1789,50 +2199,137 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
punpcklbw m3, m6, m0 ; 23
punpckhbw m6, m0 ; 56
.v_w16_loop:
- vbroadcasti128 m12, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m13, [srcq+ssq*0]
- pmaddubsw m14, m1, m8 ; a0
- pmaddubsw m15, m2, m8 ; b0
+ vbroadcasti128 m12, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ pmaddubsw m13, m1, m8 ; a0
+ pmaddubsw m14, m2, m8 ; b0
mova m1, m3
mova m2, m4
pmaddubsw m3, m9 ; a1
pmaddubsw m4, m9 ; b1
- paddw m14, m3
- paddw m15, m4
+ paddw m13, m3
+ paddw m14, m4
mova m3, m5
mova m4, m6
pmaddubsw m5, m10 ; a2
pmaddubsw m6, m10 ; b2
- paddw m14, m5
- paddw m15, m6
+ paddw m13, m5
+ vbroadcasti128 m5, [r4+ssq*0]
+ paddw m14, m6
shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
+ shufpd m0, m12, m5, 0x0c
punpcklbw m5, m6, m0 ; 67
punpckhbw m6, m0 ; 78
pmaddubsw m12, m5, m11 ; a3
- pmaddubsw m13, m6, m11 ; b3
+ paddw m13, m12
+ pmaddubsw m12, m6, m11 ; b3
paddw m14, m12
- paddw m15, m13
+ pmulhrsw m13, m7
pmulhrsw m14, m7
- pmulhrsw m15, m7
- packuswb m14, m15
- vpermq m14, m14, q3120
- mova [dstq+dsq*0], xm14
- vextracti128 [dstq+dsq*1], m14, 1
- lea dstq, [dstq+dsq*2]
+ packuswb m13, m14
+ vpermq m13, m13, q3120
+ mova [r7+dsq*0], xm13
+ vextracti128 [r7+dsq*1], m13, 1
+ lea r7, [r7+dsq*2]
sub hd, 2
jg .v_w16_loop
- add r4, 16
- add r7, 16
+ add srcq, 16
+ add dstq, 16
movzx hd, r6b
- mov srcq, r4
- mov dstq, r7
sub r6d, 1<<8
jg .v_w16_loop0
RET
+.h:
+.h_w2:
+.h_w4:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ cmp wd, 4
+ jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2
+ WIN64_SPILL_XMM 11
+ tzcnt wd, wd
+ vbroadcasti128 m6, [subpel_h_shufA]
+ shr mxd, 16
+ vbroadcasti128 m7, [subpel_h_shufB]
+ sub srcq, 3
+ vbroadcasti128 m8, [subpel_h_shufC]
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+ add wq, r8
+ jmp wq
+.h_w8:
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ pshufb m%2, m%1, m7
+ pshufb m%3, m%1, m8
+ pshufb m%1, m6
+ pmaddubsw m%4, m%2, m9
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m10
+ pmaddubsw m%1, m9
+ paddw m%3, m%4
+ paddw m%1, m%2
+ phaddw m%1, m%3
+ paddw m%1, m5
+ psraw m%1, 6
+%endmacro
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ PUT_8TAP_H 0, 2, 3, 4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ sub dstq, r6
+ mov r4, r6
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 32
+ jle .h_loop
+ add srcq, ssq
+ add dstq, dsq
+ mov r6, r4
+ dec hd
+ jg .h_loop
+ RET
.hv:
- WIN64_SPILL_XMM 16
+ WIN64_SPILL_XMM 14, 16
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
@@ -1975,6 +2472,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .hv_w4_loop
RET
.hv_w8:
+ WIN64_PUSH_XMM 16
shr mxd, 16
sub srcq, 3
vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
@@ -1993,24 +2491,23 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
pshufd m14, m0, q2222
pshufd m15, m0, q3333
lea r6d, [wq*8-64]
- mov r4, srcq
- mov r7, dstq
lea r6d, [hq+r6*4]
.hv_w8_loop0:
vbroadcasti128 m7, [subpel_h_shufA]
movu xm4, [srcq+ssq*0]
+ lea r4, [srcq+ss3q]
vbroadcasti128 m8, [subpel_h_shufB]
movu xm5, [srcq+ssq*1]
+ mov r7, dstq
vbroadcasti128 m9, [subpel_h_shufC]
movu xm6, [srcq+ssq*2]
- add srcq, ss3q
- vbroadcasti128 m0, [srcq+ssq*0]
- vpblendd m4, m0, 0xf0 ; 0 3
- vinserti128 m5, [srcq+ssq*1], 1 ; 1 4
- vinserti128 m6, [srcq+ssq*2], 1 ; 2 5
- add srcq, ss3q
- vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
-%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ vbroadcasti128 m0, [r4+ssq*0]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [r4+ssq*1], 1 ; 1 4
+ vinserti128 m6, [r4+ssq*2], 1 ; 2 5
+ add r4, ss3q
+ vinserti128 m0, [r4+ssq*0], 1 ; 3 6
+%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
pshufb %3, %1, %6
pshufb %4, %1, %7
pshufb %1, %5
@@ -2022,10 +2519,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
paddw %1, %3
phaddw %1, %2
%endmacro
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9
vpbroadcastd m7, [pw_8192]
vpermq m4, m4, q3120
vpermq m5, m5, q3120
@@ -2043,9 +2540,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
punpckhwd m6, m7 ; 56
.hv_w8_loop:
vextracti128 r6m, m0, 1 ; not enough registers
- movu xm0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vinserti128 m0, [srcq+ssq*0], 1 ; 7 8
+ movu xm0, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+ vinserti128 m0, [r4+ssq*0], 1 ; 7 8
pmaddwd m8, m1, m12 ; a0
pmaddwd m9, m2, m12 ; b0
mova m1, m3
@@ -2063,15 +2560,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vbroadcasti128 m6, [subpel_h_shufB]
vbroadcasti128 m7, [subpel_h_shufC]
vbroadcasti128 m5, [subpel_h_shufA]
- HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7
vpbroadcastd m5, [pw_8192]
vpbroadcastd m7, [pd_512]
vbroadcasti128 m6, r6m
pmulhrsw m0, m5
paddd m8, m7
paddd m9, m7
- vpermq m7, m0, q3120 ; 7 8
- shufpd m6, m6, m7, 0x04 ; 6 7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
punpcklwd m5, m6, m7 ; 67
punpckhwd m6, m7 ; 78
pmaddwd m7, m5, m15 ; a3
@@ -2084,34 +2581,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
vextracti128 xm7, m8, 1
packuswb xm8, xm7
pshufd xm7, xm8, q3120
- movq [dstq+dsq*0], xm7
- movhps [dstq+dsq*1], xm7
- lea dstq, [dstq+dsq*2]
+ movq [r7+dsq*0], xm7
+ movhps [r7+dsq*1], xm7
+ lea r7, [r7+dsq*2]
sub hd, 2
jg .hv_w8_loop
- add r4, 8
- add r7, 8
+ add srcq, 8
+ add dstq, 8
movzx hd, r6b
- mov srcq, r4
- mov dstq, r7
sub r6d, 1<<8
jg .hv_w8_loop0
RET
-%macro PREP_8TAP_H 0
- pshufb m1, m0, m5
- pshufb m2, m0, m6
- pshufb m3, m0, m7
- pmaddubsw m1, m8
- pmaddubsw m0, m2, m8
- pmaddubsw m2, m9
- pmaddubsw m3, m9
- paddw m1, m2
- paddw m0, m3
- phaddw m0, m1, m0
- pmulhrsw m0, m4
-%endmacro
-
%if WIN64
DECLARE_REG_TMP 6, 4
%else
@@ -2119,71 +2600,197 @@ DECLARE_REG_TMP 6, 7
%endif
%define PREP_8TAP_FN FN prep_8tap,
-PREP_8TAP_FN sharp, SHARP, SHARP
-PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc
PREP_8TAP_FN regular, REGULAR, REGULAR
-cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns
imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
+ add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
+ add myd, t1d ; 6tap_v, my, 4tap_v
lea r7, [prep%+SUFFIX]
- movsxd wq, wm
+ mov wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
+.prep:
tzcnt wd, wd
movzx wd, word [r7+wq*2+table_offset(prep,)]
add wq, r7
- lea r6, [strideq*3]
+ lea r6, [ssq*3]
%if WIN64
pop r7
%endif
jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m4, [pw_8192]
- vbroadcasti128 m5, [subpel_h_shufA]
- WIN64_SPILL_XMM 10
- cmp wd, 4
- je .h_w4
- tzcnt wd, wd
- vbroadcasti128 m6, [subpel_h_shufB]
- vbroadcasti128 m7, [subpel_h_shufC]
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
- vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
- vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
- add wq, r7
- jmp wq
+.v:
+ WIN64_SPILL_XMM 10, 12
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ lea myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
+ vpbroadcastd m9, [pw_8192]
+ vpbroadcastw m6, [myq+0]
+ mov nsq, ssq
+ vpbroadcastw m7, [myq+2]
+ neg nsq
+ vpbroadcastw m8, [myq+4]
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+.v_w4:
+ movd xm2, [srcq+nsq*2]
+ pinsrd xm2, [srcq+nsq*1], 1
+ vpbroadcastd m1, [srcq+ssq*0]
+ vpbroadcastd m3, [srcq+ssq*1]
+ vpbroadcastd m0, [srcq+ssq*2]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m1, m2, 0xeb
+ punpcklqdq m3, m0
+ vpblendd m1, m3, 0x60 ; 0 1 2 _ 2 3 4 _
+ pshufb m1, m5 ; 01 12 23 34
+.v_w4_loop:
+ lea srcq, [srcq+ssq*4]
+ pinsrd xm0, [srcq+nsq*1], 1
+ vpbroadcastd m2, [srcq+ssq*0]
+ vpbroadcastd m3, [srcq+ssq*1]
+ vpblendd m2, m0, 0xeb
+ vpbroadcastd m0, [srcq+ssq*2]
+ punpcklqdq m3, m0
+ vpblendd m2, m3, 0x60 ; 4 5 6 _ 6 7 8 _
+ pshufb m2, m5 ; 45 56 67 78
+ pmaddubsw m3, m1, m6 ; a0 b0 c0 d0
+ vperm2i128 m1, m2, 0x21 ; 23 34 45 56
+ pmaddubsw m4, m2, m8 ; a2 b2 c2 d2
+ pmaddubsw m1, m7 ; a1 b1 c1 d1
+ paddw m3, m4
+ paddw m3, m1
+ pmulhrsw m3, m9
+ mova m1, m2
+ mova [tmpq], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+nsq*2]
+ vpbroadcastq m3, [srcq+nsq*1]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ vpblendd m1, m3, 0x30
+ vpblendd m3, m2, 0x30
+ punpcklbw m1, m3 ; 01 12
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 23 34
+.v_w8_loop:
+ lea srcq, [srcq+ssq*4]
+ pmaddubsw m1, m6 ; a0
+ vpbroadcastq m3, [srcq+nsq*1]
+ pmaddubsw m4, m2, m7 ; a1
+ pmaddubsw m5, m2, m6 ; b0
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpblendd m0, m3, 0x30
+ vpblendd m3, m2, 0x30
+ paddw m4, m1
+ punpcklbw m1, m0, m3 ; 45 56
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ vpblendd m2, m3, 0x30
+ vpblendd m3, m0, 0x30
+ punpcklbw m2, m3 ; 67 78
+ pmaddubsw m3, m1, m7 ; b1
+ paddw m5, m3
+ pmaddubsw m3, m1, m8 ; a2
+ paddw m4, m3
+ pmaddubsw m3, m2, m8 ; b2
+ paddw m5, m3
+ pmulhrsw m4, m9
+ pmulhrsw m5, m9
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ lea r6d, [wq*2-32]
+ lea srcq, [srcq+nsq*2]
+ WIN64_PUSH_XMM 12
+ lea r6d, [hq+r6*8]
+.v_w16_loop0:
+ vbroadcasti128 m3, [srcq+ssq*0]
+ lea r5, [srcq+ssq*2]
+ vbroadcasti128 m4, [srcq+ssq*1]
+ mov r7, tmpq
+ vbroadcasti128 m0, [r5+ssq*0]
+ vbroadcasti128 m1, [r5+ssq*1]
+ lea r5, [r5+ssq*2]
+ vbroadcasti128 m2, [r5+ssq*0]
+ shufpd m3, m0, 0x0c
+ shufpd m4, m1, 0x0c
+ punpcklbw m1, m3, m4 ; 01
+ punpckhbw m3, m4 ; 23
+ shufpd m0, m2, 0x0c
+ punpcklbw m2, m4, m0 ; 12
+ punpckhbw m4, m0 ; 34
+.v_w16_loop:
+ vbroadcasti128 m5, [r5+ssq*1]
+ pmaddubsw m10, m1, m6 ; a0
+ lea r5, [r5+ssq*2]
+ pmaddubsw m11, m2, m6 ; b0
+ mova m1, m3
+ pmaddubsw m3, m7 ; a1
+ mova m2, m4
+ pmaddubsw m4, m7 ; b1
+ paddw m10, m3
+ vbroadcasti128 m3, [r5+ssq*0]
+ paddw m11, m4
+ shufpd m4, m0, m5, 0x0d
+ shufpd m0, m5, m3, 0x0c
+ punpcklbw m3, m4, m0 ; 45
+ punpckhbw m4, m0 ; 56
+ pmaddubsw m5, m3, m8 ; a2
+ paddw m10, m5
+ pmaddubsw m5, m4, m8 ; b2
+ paddw m11, m5
+ pmulhrsw m10, m9
+ pmulhrsw m11, m9
+ mova [r7+wq*0], m10
+ mova [r7+wq*2], m11
+ lea r7, [r7+wq*4]
+ sub hd, 2
+ jg .v_w16_loop
+ add srcq, 16
+ add tmpq, 32
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
.h_w4:
+ RESET_STACK_STATE
movzx mxd, mxb
+ vbroadcasti128 m3, [subpel_h_shufA]
dec srcq
- vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
- lea stride3q, [strideq*3]
+ vpbroadcastd m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ lea r3, [ssq*3]
.h_w4_loop:
- movq xm0, [srcq+strideq*0]
- vpbroadcastq m2, [srcq+strideq*2]
- movq xm1, [srcq+strideq*1]
- vpblendd m0, m2, 0xf0
- vpbroadcastq m2, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpblendd m1, m2, 0xf0
- pshufb m0, m5
- pshufb m1, m5
- pmaddubsw m0, m6
- pmaddubsw m1, m6
+ movq xm0, [srcq+ssq*0]
+ vpbroadcastq m2, [srcq+ssq*2]
+ movq xm1, [srcq+ssq*1]
+ vpblendd m0, m2, 0x30
+ vpbroadcastq m2, [srcq+r3 ]
+ lea srcq, [srcq+ssq*4]
+ vpblendd m1, m2, 0x30
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
phaddw m0, m1
pmulhrsw m0, m4
mova [tmpq], m0
@@ -2191,25 +2798,56 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
sub hd, 4
jg .h_w4_loop
RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pw_8192]
+ cmp wd, 4
+ je .h_w4
+ WIN64_SPILL_XMM 10
+ tzcnt wd, wd
+ vbroadcasti128 m3, [z_filter_s+ 2]
+ shr mxd, 16
+ vbroadcasti128 m5, [z_filter_s+ 6]
+ sub srcq, 2
+ vbroadcasti128 m6, [z_filter_s+10]
+ lea mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX]
+ movzx wd, word [r7+wq*2+table_offset(prep, _6tap_h)]
+ vpbroadcastw m7, [mxq+0]
+ vpbroadcastw m8, [mxq+2]
+ add wq, r7
+ vpbroadcastw m9, [mxq+4]
+ jmp wq
.h_w8:
- movu xm0, [srcq+strideq*0]
- vinserti128 m0, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+%macro PREP_6TAP_H 0
+ pshufb m1, m0, m3
+ pmaddubsw m1, m7
+ pshufb m2, m0, m5
+ pmaddubsw m2, m8
+ pshufb m0, m6
+ pmaddubsw m0, m9
+ paddw m1, m2
+ paddw m0, m1
+ pmulhrsw m0, m4
+%endmacro
+ PREP_6TAP_H
mova [tmpq], m0
add tmpq, 32
sub hd, 2
jg .h_w8
RET
.h_w16:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, [srcq+strideq*0+8*1], 1
- PREP_8TAP_H
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*0+8*1], 1
+ PREP_6TAP_H
mova [tmpq+32*0], m0
- movu xm0, [srcq+strideq*1+8*0]
- vinserti128 m0, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H
+ movu xm0, [srcq+ssq*1+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ PREP_6TAP_H
mova [tmpq+32*1], m0
add tmpq, 32*2
sub hd, 2
@@ -2229,27 +2867,219 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
.h_loop:
movu xm0, [srcq+r6+8*0]
vinserti128 m0, [srcq+r6+8*1], 1
- PREP_8TAP_H
+ PREP_6TAP_H
mova [tmpq+32*0], m0
movu xm0, [srcq+r6+8*2]
vinserti128 m0, [srcq+r6+8*3], 1
- PREP_8TAP_H
+ PREP_6TAP_H
mova [tmpq+32*1], m0
add tmpq, 32*2
add r6, 32
jle .h_loop
- add srcq, strideq
+ add srcq, ssq
mov r6, r5
dec hd
jg .h_loop
RET
+.hv:
+ WIN64_SPILL_XMM 14, 16
+ cmp wd, 4
+ jne .hv_w8
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mova m6, [subpel_h_shuf4]
+ vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
+ mov nsq, ssq
+ pmovzxbd m13, [deint_shuf4]
+ neg nsq
+ vpbroadcastd m8, [pw_8192]
+ vpbroadcastd m9, [pd_32]
+ punpcklbw m0, m0
+ vpbroadcastq m2, [srcq+nsq*2]
+ psraw m0, 8 ; sign-extend
+ vpbroadcastq m4, [srcq+nsq*1]
+ pshufd m10, m0, q0000
+ vpbroadcastq m1, [srcq+ssq*0]
+ pshufd m11, m0, q1111
+ vpbroadcastq m3, [srcq+ssq*1]
+ pshufd m12, m0, q2222
+ vpbroadcastq m0, [srcq+ssq*2]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m1, m3, 0xcc ; 2 3
+ pshufb m2, m6
+ pshufb m1, m6
+ pshufb m0, m6
+ pmaddubsw m2, m7
+ pmaddubsw m1, m7
+ pmaddubsw m0, m7
+ phaddw m2, m1 ; 0 1 2 3
+ phaddw m0, m0 ; 4
+ pmulhrsw m2, m8
+ pmulhrsw m0, m8
+ palignr m0, m2, 4
+ punpcklwd m1, m2, m0 ; 01 12
+ punpckhwd m2, m0 ; 23 34
+.hv_w4_loop:
+ pmaddwd m4, m10, m1 ; a0 b0
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m5, m2, m10 ; c0 d0
+ vpbroadcastq m1, [srcq+nsq*1]
+ pmaddwd m2, m11 ; a1 b1
+ vpbroadcastq m3, [srcq+ssq*0]
+ paddd m4, m2
+ vpbroadcastq m2, [srcq+ssq*1]
+ vpblendd m1, m3, 0xcc ; 5 6
+ vpbroadcastq m3, [srcq+ssq*2]
+ vpblendd m2, m3, 0xcc ; 7 8
+ pshufb m1, m6
+ pshufb m2, m6
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ phaddw m1, m2 ; 5 6 7 8
+ pmulhrsw m1, m8
+ paddd m5, m9
+ paddd m4, m9
+ palignr m2, m1, m0, 12
+ mova m0, m1
+ punpcklwd m1, m2, m0 ; 45 56
+ punpckhwd m2, m0 ; 67 78
+ pmaddwd m3, m11, m1 ; c1 d1
+ paddd m5, m3
+ pmaddwd m3, m12, m1 ; a2 b2
+ paddd m4, m3
+ pmaddwd m3, m12, m2 ; c2 d2
+ paddd m5, m3
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ vpermd m4, m13, m4
+ mova [tmpq], m4
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ lea mxq, [r7+mxq*8+subpel_filters+1-prep_avx2]
+ WIN64_PUSH_XMM 16
+ vpbroadcastw m10, [mxq+0]
+ vpbroadcastw m11, [mxq+2]
+ vpbroadcastw m12, [mxq+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep_avx2]
+ lea r7, [ssq*2+2]
+ vbroadcasti128 m8, [z_filter_s+ 6]
+ punpcklbw m0, m0
+ vbroadcasti128 m9, [z_filter_s+10]
+ psraw m0, 8 ; sign-extend
+ lea r6d, [wq*8-64]
+ pshufd m13, m0, q0000
+ sub srcq, r7
+ pshufd m14, m0, q1111
+ lea r6d, [hq+r6*4]
+ pshufd m15, m0, q2222
+.hv_w8_loop0:
+ vbroadcasti128 m7, [z_filter_s+2]
+ movu xm3, [srcq+ssq*0]
+ lea r5, [srcq+ssq*2]
+ movu xm4, [srcq+ssq*1]
+ vbroadcasti128 m0, [r5+ssq*0]
+ mov r7, tmpq
+ vinserti128 m4, [r5+ssq*1], 1 ; 1 3
+ lea r5, [r5+ssq*2]
+ vpblendd m3, m0, 0xf0 ; 0 2
+ vinserti128 m0, [r5+ssq*0], 1 ; 2 4
+ vpbroadcastd m5, [pw_8192]
+ HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9
+ HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9
+ HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m0, m0, q3120
+ pmulhrsw m3, m5
+ pmulhrsw m4, m5
+ pmulhrsw m0, m5
+ punpcklwd m1, m3, m4 ; 01
+ punpckhwd m3, m4 ; 23
+ punpcklwd m2, m4, m0 ; 12
+ punpckhwd m4, m0 ; 34
+.hv_w8_loop:
+ movu xm7, [r5+ssq*1]
+ lea r5, [r5+ssq*2]
+ vinserti128 m7, [r5+ssq*0], 1 ; 5 6
+ pmaddwd m5, m13, m1 ; a0
+ mova m1, m3
+ pmaddwd m6, m13, m2 ; b0
+ mova m2, m4
+ pmaddwd m3, m14 ; a1
+ pmaddwd m4, m14 ; b1
+ paddd m5, m3
+ vbroadcasti128 m3, [z_filter_s+2]
+ paddd m6, m4
+ HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9
+ vpbroadcastd m3, [pw_8192]
+ vpbroadcastd m4, [pd_32]
+ pmulhrsw m7, m3
+ paddd m5, m4
+ paddd m6, m4
+ mova m4, m0
+ vpermq m0, m7, q3120
+ shufpd m4, m0, 0x05
+ punpcklwd m3, m4, m0 ; 45
+ pmaddwd m7, m15, m3 ; a2
+ punpckhwd m4, m0 ; 67
+ paddd m5, m7
+ pmaddwd m7, m15, m4 ; b2
+ paddd m6, m7
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermq m5, m5, q3120
+ mova [r7+wq*0], xm5
+ vextracti128 [r7+wq*2], m5, 1
+ lea r7, [r7+wq*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add srcq, 8
+ add tmpq, 16
+ movzx hd, r6b
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc
+PREP_8TAP_FN sharp, SHARP, SHARP
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep%+SUFFIX]
+ mov wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep
.v:
- WIN64_SPILL_XMM 16
+ WIN64_SPILL_XMM 12, 15
movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
shr myd, 16 ; Note that the code is 8-tap only, having
cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
cmove myd, mxd ; had a negligible effect on performance.
- ; TODO: Would a 6-tap code path be worth it?
lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
lea stride3q, [strideq*3]
sub srcq, stride3q
@@ -2359,72 +3189,154 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_w8_loop
RET
.v_w16:
- add wd, wd
- mov r5, srcq
- mov r7, tmpq
- lea r6d, [hq+wq*8-256]
+ lea r6d, [wq*2-32]
+ WIN64_PUSH_XMM 15
+ lea r6d, [hq+r6*8]
.v_w16_loop0:
vbroadcasti128 m4, [srcq+strideq*0]
vbroadcasti128 m5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m0, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*0]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m1, [srcq+strideq*0]
- vbroadcasti128 m2, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m3, [srcq+strideq*0]
- shufpd m4, m4, m0, 0x0c
- shufpd m5, m5, m1, 0x0c
+ lea r5, [srcq+strideq*2]
+ vbroadcasti128 m0, [r5+strideq*1]
+ vbroadcasti128 m6, [r5+strideq*0]
+ lea r5, [r5+strideq*2]
+ vbroadcasti128 m1, [r5+strideq*0]
+ vbroadcasti128 m2, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ vbroadcasti128 m3, [r5+strideq*0]
+ mov r7, tmpq
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
punpcklbw m1, m4, m5 ; 01
punpckhbw m4, m5 ; 34
- shufpd m6, m6, m2, 0x0c
+ shufpd m6, m2, 0x0c
punpcklbw m2, m5, m6 ; 12
punpckhbw m5, m6 ; 45
- shufpd m0, m0, m3, 0x0c
+ shufpd m0, m3, 0x0c
punpcklbw m3, m6, m0 ; 23
punpckhbw m6, m0 ; 56
.v_w16_loop:
- vbroadcasti128 m12, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m13, [srcq+strideq*0]
- pmaddubsw m14, m1, m8 ; a0
- pmaddubsw m15, m2, m8 ; b0
+ vbroadcasti128 m12, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ pmaddubsw m13, m1, m8 ; a0
+ pmaddubsw m14, m2, m8 ; b0
mova m1, m3
mova m2, m4
pmaddubsw m3, m9 ; a1
pmaddubsw m4, m9 ; b1
- paddw m14, m3
- paddw m15, m4
+ paddw m13, m3
+ paddw m14, m4
mova m3, m5
mova m4, m6
pmaddubsw m5, m10 ; a2
pmaddubsw m6, m10 ; b2
- paddw m14, m5
- paddw m15, m6
+ paddw m13, m5
+ vbroadcasti128 m5, [r5+strideq*0]
+ paddw m14, m6
shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
+ shufpd m0, m12, m5, 0x0c
punpcklbw m5, m6, m0 ; 67
punpckhbw m6, m0 ; 78
pmaddubsw m12, m5, m11 ; a3
- pmaddubsw m13, m6, m11 ; b3
+ paddw m13, m12
+ pmaddubsw m12, m6, m11 ; b3
paddw m14, m12
- paddw m15, m13
+ pmulhrsw m13, m7
pmulhrsw m14, m7
- pmulhrsw m15, m7
- mova [tmpq+wq*0], m14
- mova [tmpq+wq*1], m15
- lea tmpq, [tmpq+wq*2]
+ mova [r7+wq*0], m13
+ mova [r7+wq*2], m14
+ lea r7, [r7+wq*4]
sub hd, 2
jg .v_w16_loop
- add r5, 16
- add r7, 32
+ add srcq, 16
+ add tmpq, 32
movzx hd, r6b
- mov srcq, r5
- mov tmpq, r7
sub r6d, 1<<8
jg .v_w16_loop0
RET
+.h:
+.h_w4:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pw_8192]
+ cmp wd, 4
+ je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4
+ WIN64_SPILL_XMM 10
+ vbroadcasti128 m5, [subpel_h_shufA]
+ tzcnt wd, wd
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ add wq, r7
+ jmp wq
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+%macro PREP_8TAP_H 0
+ pshufb m1, m0, m5
+ pshufb m2, m0, m6
+ pshufb m3, m0, m7
+ pmaddubsw m1, m8
+ pmaddubsw m0, m2, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m0, m3
+ phaddw m0, m1, m0
+ pmulhrsw m0, m4
+%endmacro
+ PREP_8TAP_H
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+strideq*1+8*0]
+ vinserti128 m0, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu xm0, [srcq+r6+8*0]
+ vinserti128 m0, [srcq+r6+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+r6+8*2]
+ vinserti128 m0, [srcq+r6+8*3], 1
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ add r6, 32
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
.hv:
WIN64_SPILL_XMM 16
cmp wd, 4
@@ -2542,28 +3454,27 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
RET
.hv_w8:
lea r6d, [wq*8-64]
- mov r5, srcq
- mov r7, tmpq
lea r6d, [hq+r6*4]
.hv_w8_loop0:
vbroadcasti128 m7, [subpel_h_shufA]
movu xm4, [srcq+strideq*0]
+ lea r5, [srcq+strideq*2]
vbroadcasti128 m8, [subpel_h_shufB]
movu xm5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
+ mov r7, tmpq
vbroadcasti128 m9, [subpel_h_shufC]
- movu xm6, [srcq+strideq*0]
- vbroadcasti128 m0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpblendd m4, m0, 0xf0 ; 0 3
- vinserti128 m5, [srcq+strideq*0], 1 ; 1 4
- vinserti128 m6, [srcq+strideq*1], 1 ; 2 5
- lea srcq, [srcq+strideq*2]
- vinserti128 m0, [srcq+strideq*0], 1 ; 3 6
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ movu xm6, [r5+strideq*0]
+ vbroadcasti128 m0, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [r5+strideq*0], 1 ; 1 4
+ vinserti128 m6, [r5+strideq*1], 1 ; 2 5
+ lea r5, [r5+strideq*2]
+ vinserti128 m0, [r5+strideq*0], 1 ; 3 6
+ HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9
vpbroadcastd m7, [pw_8192]
vpermq m4, m4, q3120
vpermq m5, m5, q3120
@@ -2580,10 +3491,10 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
punpcklwd m3, m6, m7 ; 23
punpckhwd m6, m7 ; 56
.hv_w8_loop:
- vextracti128 [tmpq], m0, 1 ; not enough registers
- movu xm0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vinserti128 m0, [srcq+strideq*0], 1 ; 7 8
+ vextracti128 [r7], m0, 1 ; not enough registers
+ movu xm0, [r5+strideq*1]
+ lea r5, [r5+strideq*2]
+ vinserti128 m0, [r5+strideq*0], 1 ; 7 8
pmaddwd m8, m1, m12 ; a0
pmaddwd m9, m2, m12 ; b0
mova m1, m3
@@ -2601,15 +3512,15 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
vbroadcasti128 m6, [subpel_h_shufB]
vbroadcasti128 m7, [subpel_h_shufC]
vbroadcasti128 m5, [subpel_h_shufA]
- HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7
vpbroadcastd m5, [pw_8192]
vpbroadcastd m7, [pd_32]
- vbroadcasti128 m6, [tmpq]
+ vbroadcasti128 m6, [r7]
pmulhrsw m0, m5
paddd m8, m7
paddd m9, m7
- vpermq m7, m0, q3120 ; 7 8
- shufpd m6, m6, m7, 0x04 ; 6 7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
punpcklwd m5, m6, m7 ; 67
punpckhwd m6, m7 ; 78
pmaddwd m7, m5, m15 ; a3
@@ -2620,16 +3531,14 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
psrad m7, 6
packssdw m8, m7
vpermq m7, m8, q3120
- mova [tmpq+wq*0], xm7
- vextracti128 [tmpq+wq*2], m7, 1
- lea tmpq, [tmpq+wq*4]
+ mova [r7+wq*0], xm7
+ vextracti128 [r7+wq*2], m7, 1
+ lea r7, [r7+wq*4]
sub hd, 2
jg .hv_w8_loop
- add r5, 8
- add r7, 16
+ add srcq, 8
+ add tmpq, 16
movzx hd, r6b
- mov srcq, r5
- mov tmpq, r7
sub r6d, 1<<8
jg .hv_w8_loop0
RET
@@ -4008,14 +4917,14 @@ DECLARE_REG_TMP 6, 8
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN put
-PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
-PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_8bpc
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED put
@@ -4026,14 +4935,14 @@ DECLARE_REG_TMP 6, 7
%endif
BILIN_SCALED_FN prep
-PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
-PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED prep