diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-12 05:35:29 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-12 05:35:29 +0000 |
commit | 59203c63bb777a3bacec32fb8830fba33540e809 (patch) | |
tree | 58298e711c0ff0575818c30485b44a2f21bf28a0 /third_party/dav1d/src/x86/mc_avx2.asm | |
parent | Adding upstream version 126.0.1. (diff) | |
download | firefox-59203c63bb777a3bacec32fb8830fba33540e809.tar.xz firefox-59203c63bb777a3bacec32fb8830fba33540e809.zip |
Adding upstream version 127.0. (upstream/127.0)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/mc_avx2.asm')
-rw-r--r-- | third_party/dav1d/src/x86/mc_avx2.asm | 1471 |
1 files changed, 1190 insertions, 281 deletions
diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm index 58e3cb5af1..df8bebb1cb 100644 --- a/third_party/dav1d/src/x86/mc_avx2.asm +++ b/third_party/dav1d/src/x86/mc_avx2.asm @@ -60,15 +60,14 @@ subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 1 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 -bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 wm_420_sign: dd 0x01020102, 0x01010101 wm_422_sign: dd 0x80808080, 0x7f7f7f7f @@ -95,6 +94,7 @@ pq_0x40000000: dq 0x40000000 cextern mc_subpel_filters cextern mc_warp_filter2 cextern resize_filter +cextern z_filter_s %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) @@ -184,7 +184,9 @@ BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 6tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 6tap, avx2, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 @@ 
-298,7 +300,7 @@ INIT_YMM avx2 ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 255 - vbroadcasti128 m4, [bilin_h_shuf8] + vbroadcasti128 m4, [z_filter_s+2] add mxyd, 16 movd xm5, mxyd mov mxyd, r7m ; my @@ -900,7 +902,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 255 - vbroadcasti128 m4, [bilin_h_shuf8] + vbroadcasti128 m4, [z_filter_s+2] add mxyd, 16 movd xm5, mxyd mov mxyd, r6m ; my @@ -1436,7 +1438,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; fn, type, type_h, type_v +%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1444,8 +1446,8 @@ cglobal %1_%2_8bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1456,28 +1458,24 @@ DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns imul mxd, mxm, 0x010101 - add mxd, t0d ; 
8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx2] - movsxd wq, wm + mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1487,36 +1485,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pop r8 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) - WIN64_SPILL_XMM 11 - cmp wd, 4 - jl .h_w2 - vbroadcasti128 m6, [subpel_h_shufA] - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m7, [subpel_h_shufB] - vbroadcasti128 m8, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] - vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] - vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] - add wq, r8 - jmp wq .h_w2: movzx mxd, mxb - dec srcq - mova xm4, [subpel_h_shuf4] - vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] + lea srcq, [srcq-1] + vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2] + je .h_w4 + mova xm3, [subpel_h_shuf4] .h_w2_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pshufb xm0, xm4 - pmaddubsw xm0, xm3 + pshufb xm0, xm3 + pmaddubsw xm0, xm4 phaddw xm0, xm0 paddw xm0, xm5 psraw xm0, 6 @@ -1528,17 +1508,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_w2_loop RET .h_w4: - movzx mxd, mxb - dec srcq - vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] + mova xm3, [subpel_h_shufA] .h_w4_loop: movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pshufb xm0, xm6 - pshufb xm1, xm6 - pmaddubsw xm0, xm3 - pmaddubsw xm1, xm3 + pshufb xm0, xm3 + pshufb xm1, xm3 + pmaddubsw xm0, xm4 + pmaddubsw xm1, xm4 phaddw xm0, xm1 paddw xm0, xm5 psraw xm0, 6 @@ -1549,25 +1527,43 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 sub hd, 2 jg .h_w4_loop RET 
+.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) + cmp wd, 4 + jle .h_w2 + WIN64_SPILL_XMM 11 + tzcnt wd, wd + vbroadcasti128 m4, [z_filter_s+ 2] ; 01 + shr mxd, 16 + vbroadcasti128 m6, [z_filter_s+ 6] ; 23 + sub srcq, 2 + vbroadcasti128 m7, [z_filter_s+10] ; 45 + lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] + movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)] + vpbroadcastw m8, [mxq+0] + vpbroadcastw m9, [mxq+2] + add wq, r8 + vpbroadcastw m10, [mxq+4] + jmp wq .h_w8: -%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] - pshufb m%2, m%1, m7 - pshufb m%3, m%1, m8 - pshufb m%1, m6 - pmaddubsw m%4, m%2, m9 - pmaddubsw m%2, m10 - pmaddubsw m%3, m10 - pmaddubsw m%1, m9 - paddw m%3, m%4 +%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2] + pshufb m%2, m%1, m4 + pmaddubsw m%2, m8 + pshufb m%3, m%1, m6 + pmaddubsw m%3, m9 + pshufb m%1, m7 + pmaddubsw m%1, m10 + paddw m%2, m5 + paddw m%1, m%3 paddw m%1, m%2 - phaddw m%1, m%3 - paddw m%1, m5 psraw m%1, 6 %endmacro movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 1, 2, 3 + PUT_6TAP_H 0, 1, 2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 @@ -1581,9 +1577,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vinserti128 m0, [srcq+ssq*1+8*0], 1 movu xm1, [srcq+ssq*0+8*1] vinserti128 m1, [srcq+ssq*1+8*1], 1 - PUT_8TAP_H 0, 2, 3, 4 + PUT_6TAP_H 0, 2, 3 lea srcq, [srcq+ssq*2] - PUT_8TAP_H 1, 2, 3, 4 + PUT_6TAP_H 1, 2, 3 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 @@ -1606,8 +1602,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .h_loop: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 2, 3, 4 + PUT_6TAP_H 0, 2, 3 + PUT_6TAP_H 1, 2, 3 packuswb m0, m1 mova [dstq+r6], m0 add r6, 32 @@ -1619,7 +1615,421 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_loop RET .v: - WIN64_SPILL_XMM 16 + WIN64_SPILL_XMM 9, 12 + movzx mxd, myb + shr 
myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)] + vpbroadcastd m8, [pw_512] + lea myq, [r8+myq*8+subpel_filters+1-put_avx2] + vpbroadcastw m5, [myq+0] + vpbroadcastw m6, [myq+2] + vpbroadcastw m7, [myq+4] + add r6, r8 + mov nsq, ssq + neg nsq + jmp r6 +.v_w2: + movd xm2, [srcq+nsq*2] + pinsrw xm2, [srcq+nsq*1], 2 + pinsrw xm2, [srcq+ssq*0], 4 + pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + vpbroadcastd xm0, [srcq+ssq*0] + palignr xm3, xm0, xm2, 4 ; 1 2 3 4 + punpcklbw xm1, xm2, xm3 ; 01 12 + punpckhbw xm2, xm3 ; 23 34 +.v_w2_loop: + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xm3, xm1, xm5 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm6 ; a1 b1 + paddw xm3, xm2 + vpblendd xm2, xm0, xm4, 0x02 ; 4 5 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 5 6 + punpcklbw xm2, xm4 ; 67 78 + pmaddubsw xm4, xm2, xm7 ; a3 b3 + paddw xm3, xm4 + pmulhrsw xm3, xm8 + packuswb xm3, xm3 + pextrw [dstq+dsq*0], xm3, 0 + pextrw [dstq+dsq*1], xm3, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xm2, [srcq+nsq*2] + pinsrd xm2, [srcq+nsq*1], 1 + pinsrd xm2, [srcq+ssq*0], 2 + pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + vpbroadcastd xm0, [srcq+ssq*0] + palignr xm3, xm0, xm2, 4 ; 1 2 3 4 + punpcklbw xm1, xm2, xm3 ; 01 12 + punpckhbw xm2, xm3 ; 23 34 +.v_w4_loop: + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xm3, xm1, xm5 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm6 ; a1 b1 + paddw xm3, xm2 + vpblendd xm2, xm0, xm4, 0x02 ; 4 5 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 5 6 + punpcklbw xm2, xm4 ; 45 56 + pmaddubsw xm4, xm2, xm7 ; a2 b2 + paddw xm3, xm4 + pmulhrsw xm3, xm8 + packuswb xm3, xm3 + movd [dstq+dsq*0], xm3 + pextrd [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+nsq*2] + vpbroadcastq m3, [srcq+nsq*1] + vpbroadcastq m2, 
[srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklbw m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 23 34 +.v_w8_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m3, m1, m5 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m6 ; a1 b1 + paddw m3, m2 + vpblendd m2, m0, m4, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 45 56 + pmaddubsw m4, m2, m7 ; a2 b2 + paddw m3, m4 + pmulhrsw m3, m8 + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + movq [dstq+dsq*0], xm3 + movhps [dstq+dsq*1], xm3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: +.v_w32: +.v_w64: +.v_w128: + lea r6d, [wq*8-128] + WIN64_PUSH_XMM 12 + lea r6d, [hq+r6*2] +.v_w16_loop0: + vbroadcasti128 m3, [srcq+nsq*2] + vbroadcasti128 m4, [srcq+nsq*1] + lea r4, [srcq+ssq*2] + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + mov r7, dstq + vbroadcasti128 m2, [r4+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklbw m1, m3, m4 ; 01 + punpckhbw m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklbw m2, m4, m0 ; 12 + punpckhbw m4, m0 ; 34 +.v_w16_loop: + vbroadcasti128 m9, [r4+ssq*1] + pmaddubsw m10, m1, m5 ; a0 + lea r4, [r4+ssq*2] + pmaddubsw m11, m2, m5 ; b0 + mova m1, m3 + pmaddubsw m3, m6 ; a1 + mova m2, m4 + pmaddubsw m4, m6 ; b1 + paddw m10, m3 + vbroadcasti128 m3, [r4+ssq*0] + paddw m11, m4 + shufpd m4, m0, m9, 0x0d + shufpd m0, m9, m3, 0x0c + punpcklbw m3, m4, m0 ; 45 + punpckhbw m4, m0 ; 56 + pmaddubsw m9, m3, m7 ; a2 + paddw m10, m9 + pmaddubsw m9, m4, m7 ; b2 + paddw m11, m9 + pmulhrsw m10, m8 + pmulhrsw m11, m8 + packuswb m10, m11 + vpermq m10, m10, q3120 + mova [r7+dsq*0], xm10 + vextracti128 [r7+dsq*1], m10, 1 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .v_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w16_loop0 + RET +.hv: + WIN64_SPILL_XMM 12, 16 + 
cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2] + vpbroadcastd m7, [pw_8192] + punpcklbw m0, m0 + vpbroadcastd m8, [pd_512] + psraw m0, 8 ; sign-extend + mov nsq, ssq + pshufd m9, m0, q0000 + neg nsq + pshufd m10, m0, q1111 + pshufd m11, m0, q2222 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m5, [subpel_h_shuf4] + movq xm2, [srcq+nsq*2] + movhps xm2, [srcq+nsq*1] + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ssq*0] + vpblendd m2, m1, 0x30 + pshufb m2, m5 + pshufb xm0, xm5 + pmaddubsw m2, m6 + pmaddubsw xm0, xm6 + phaddw m2, m0 + pmulhrsw m2, m7 + vextracti128 xm0, m2, 1 + palignr xm0, xm2, 4 + punpcklwd xm1, xm2, xm0 ; 01 12 + punpckhwd xm2, xm0 ; 23 34 +.hv_w2_loop: + movq xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm4, [srcq+ssq*0] + pshufb xm4, xm5 + pmaddubsw xm4, xm6 + pmaddwd xm3, xm9, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm10 ; a1 b1 + phaddw xm4, xm4 + paddd xm3, xm2 + pmulhrsw xm4, xm7 + palignr xm2, xm4, xm0, 12 + mova xm0, xm4 + punpcklwd xm2, xm4 ; 45 56 + pmaddwd xm4, xm11, xm2 ; a2 b2 + paddd xm3, xm8 + paddd xm3, xm4 + psrad xm3, 10 + packssdw xm3, xm3 + packuswb xm3, xm3 + pextrw [dstq+dsq*0], xm3, 0 + pextrw [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova m5, [subpel_h_shuf4] + vpbroadcastq m2, [srcq+nsq*2] + vpbroadcastq m4, [srcq+nsq*1] + vpbroadcastq m1, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m2, m4, 0xcc ; 0 1 + vpblendd m1, m3, 0xcc ; 2 3 + pshufb m2, m5 + pshufb m1, m5 + pshufb m0, m5 + pmaddubsw m2, m6 + pmaddubsw m1, m6 + pmaddubsw m0, m6 + phaddw m2, m1 + phaddw m0, m0 + pmulhrsw m2, m7 + pmulhrsw m0, m7 + palignr m3, m0, m2, 4 + punpcklwd m1, m2, m3 ; 01 12 + punpckhwd m2, m3 ; 23 
34 +.hv_w4_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m3, m9, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m10 ; a1 b1 + paddd m3, m2 + vpbroadcastq m2, [srcq+ssq*0] + vpblendd m4, m2, 0xcc ; 5 6 + pshufb m4, m5 + pmaddubsw m4, m6 + phaddw m4, m4 + pmulhrsw m4, m7 + palignr m2, m4, m0, 12 + mova m0, m4 + punpcklwd m2, m4 ; 45 56 + pmaddwd m4, m11, m2 ; a2 b2 + paddd m3, m8 + paddd m3, m4 + psrad m3, 10 + vextracti128 xm4, m3, 1 + packssdw xm3, xm4 + packuswb xm3, xm3 + pshuflw xm3, xm3, q3120 + movd [dstq+dsq*0], xm3 + pextrd [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 2 + lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] + WIN64_PUSH_XMM 16 + vpbroadcastw m10, [mxq+0] + vpbroadcastw m11, [mxq+2] + vpbroadcastw m12, [mxq+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2] + lea r6d, [wq*8-64] + vbroadcasti128 m8, [z_filter_s+ 6] + punpcklbw m0, m0 + vbroadcasti128 m9, [z_filter_s+10] + psraw m0, 8 ; sign-extend + mov nsq, ssq + pshufd m13, m0, q0000 + neg nsq + pshufd m14, m0, q1111 + lea r6d, [hq+r6*4] + pshufd m15, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m7, [z_filter_s+2] + movu xm3, [srcq+nsq*2] + lea r4, [srcq+ssq*2] + movu xm4, [srcq+nsq*1] + vbroadcasti128 m0, [srcq+ssq*0] + mov r7, dstq + vinserti128 m4, [srcq+ssq*1], 1 ; 1 3 + vpblendd m3, m0, 0xf0 ; 0 2 + vinserti128 m0, [r4+ssq*0], 1 ; 2 4 + vpbroadcastd m5, [pw_8192] +%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3] + pshufb %2, %1, %4 + pmaddubsw %2, m10 + pshufb %3, %1, %5 + pmaddubsw %3, m11 + pshufb %1, %6 + pmaddubsw %1, m12 + paddw %2, %3 + paddw %1, %2 +%endmacro + HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + pmulhrsw m3, m5 + pmulhrsw m4, m5 + pmulhrsw m0, m5 + punpcklwd m1, m3, m4 ; 01 + punpckhwd 
m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + movu xm7, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti128 m7, [r4+ssq*0], 1 ; 5 6 + pmaddwd m5, m13, m1 ; a0 + mova m1, m3 + pmaddwd m6, m13, m2 ; b0 + mova m2, m4 + pmaddwd m3, m14 ; a1 + pmaddwd m4, m14 ; b1 + paddd m5, m3 + vbroadcasti128 m3, [z_filter_s+2] + paddd m6, m4 + HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9 + vpbroadcastd m3, [pw_8192] + vpbroadcastd m4, [pd_512] + pmulhrsw m7, m3 + paddd m5, m4 + paddd m6, m4 + mova m4, m0 + vpermq m0, m7, q3120 + shufpd m4, m0, 0x05 + punpcklwd m3, m4, m0 ; 45 + pmaddwd m7, m15, m3 ; a2 + punpckhwd m4, m0 ; 67 + paddd m5, m7 + pmaddwd m7, m15, m4 ; b2 + paddd m6, m7 + psrad m5, 10 + psrad m6, 10 + packssdw m5, m6 + vextracti128 xm6, m5, 1 + packuswb xm5, xm6 + pshufd xm5, xm5, q3120 + movq [r7+dsq*0], xm5 + movhps [r7+dsq*1], xm5 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add srcq, 8 + add dstq, 8 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put +.v: + WIN64_SPILL_XMM 12, 15 movzx mxd, myb shr myd, 16 cmp hd, 6 @@ -1765,19 +2175,19 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .v_w64: .v_w128: lea r6d, [wq*8-128] - mov r4, srcq - mov r7, dstq + WIN64_PUSH_XMM 15 lea r6d, [hq+r6*2] .v_w16_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] + lea r4, [srcq+ss3q] vbroadcasti128 m6, 
[srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m0, [srcq+ssq*0] - vbroadcasti128 m1, [srcq+ssq*1] - vbroadcasti128 m2, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m3, [srcq+ssq*0] + vbroadcasti128 m0, [r4+ssq*0] + mov r7, dstq + vbroadcasti128 m1, [r4+ssq*1] + vbroadcasti128 m2, [r4+ssq*2] + add r4, ss3q + vbroadcasti128 m3, [r4+ssq*0] shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 @@ -1789,50 +2199,137 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: - vbroadcasti128 m12, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m13, [srcq+ssq*0] - pmaddubsw m14, m1, m8 ; a0 - pmaddubsw m15, m2, m8 ; b0 + vbroadcasti128 m12, [r4+ssq*1] + lea r4, [r4+ssq*2] + pmaddubsw m13, m1, m8 ; a0 + pmaddubsw m14, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 - paddw m14, m3 - paddw m15, m4 + paddw m13, m3 + paddw m14, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, m10 ; a2 pmaddubsw m6, m10 ; b2 - paddw m14, m5 - paddw m15, m6 + paddw m13, m5 + vbroadcasti128 m5, [r4+ssq*0] + paddw m14, m6 shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c + shufpd m0, m12, m5, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 - pmaddubsw m13, m6, m11 ; b3 + paddw m13, m12 + pmaddubsw m12, m6, m11 ; b3 paddw m14, m12 - paddw m15, m13 + pmulhrsw m13, m7 pmulhrsw m14, m7 - pmulhrsw m15, m7 - packuswb m14, m15 - vpermq m14, m14, q3120 - mova [dstq+dsq*0], xm14 - vextracti128 [dstq+dsq*1], m14, 1 - lea dstq, [dstq+dsq*2] + packuswb m13, m14 + vpermq m13, m13, q3120 + mova [r7+dsq*0], xm13 + vextracti128 [r7+dsq*1], m13, 1 + lea r7, [r7+dsq*2] sub hd, 2 jg .v_w16_loop - add r4, 16 - add r7, 16 + add srcq, 16 + add dstq, 16 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET +.h: +.h_w2: +.h_w4: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) + cmp wd, 4 + jle mangle(private_prefix %+ 
_put_6tap_8bpc_avx2).h_w2 + WIN64_SPILL_XMM 11 + tzcnt wd, wd + vbroadcasti128 m6, [subpel_h_shufA] + shr mxd, 16 + vbroadcasti128 m7, [subpel_h_shufB] + sub srcq, 3 + vbroadcasti128 m8, [subpel_h_shufC] + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] + vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] + add wq, r8 + jmp wq +.h_w8: +%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] + pshufb m%2, m%1, m7 + pshufb m%3, m%1, m8 + pshufb m%1, m6 + pmaddubsw m%4, m%2, m9 + pmaddubsw m%2, m10 + pmaddubsw m%3, m10 + pmaddubsw m%1, m9 + paddw m%3, m%4 + paddw m%1, m%2 + phaddw m%1, m%3 + paddw m%1, m5 + psraw m%1, 6 +%endmacro + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*1+8*0], 1 + movu xm1, [srcq+ssq*0+8*1] + vinserti128 m1, [srcq+ssq*1+8*1], 1 + PUT_8TAP_H 0, 2, 3, 4 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + sub dstq, r6 + mov r4, r6 +.h_loop: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+r6], m0 + add r6, 32 + jle .h_loop + add srcq, ssq + add dstq, dsq + mov r6, r4 + dec hd + jg .h_loop + RET .hv: - WIN64_SPILL_XMM 16 + WIN64_SPILL_XMM 14, 16 cmp wd, 4 jg .hv_w8 movzx mxd, mxb @@ -1975,6 +2472,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .hv_w4_loop RET .hv_w8: + WIN64_PUSH_XMM 16 shr mxd, 16 sub srcq, 3 vpbroadcastd m10, 
[r8+mxq*8+subpel_filters-put_avx2+0] @@ -1993,24 +2491,23 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pshufd m14, m0, q2222 pshufd m15, m0, q3333 lea r6d, [wq*8-64] - mov r4, srcq - mov r7, dstq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+ssq*0] + lea r4, [srcq+ss3q] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+ssq*1] + mov r7, dstq vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m0, [srcq+ssq*0] - vpblendd m4, m0, 0xf0 ; 0 3 - vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 - vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 - add srcq, ss3q - vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 -%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] + vbroadcasti128 m0, [r4+ssq*0] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [r4+ssq*1], 1 ; 1 4 + vinserti128 m6, [r4+ssq*2], 1 ; 2 5 + add r4, ss3q + vinserti128 m0, [r4+ssq*0], 1 ; 3 6 +%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] pshufb %3, %1, %6 pshufb %4, %1, %7 pshufb %1, %5 @@ -2022,10 +2519,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 paddw %1, %3 phaddw %1, %2 %endmacro - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 @@ -2043,9 +2540,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 punpckhwd m6, m7 ; 56 .hv_w8_loop: vextracti128 r6m, m0, 1 ; not enough registers - movu xm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vinserti128 m0, [srcq+ssq*0], 1 ; 7 8 + movu xm0, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti128 m0, [r4+ssq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 @@ -2063,15 +2560,15 @@ cglobal 
put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] - HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_512] vbroadcasti128 m6, r6m pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 - vpermq m7, m0, q3120 ; 7 8 - shufpd m6, m6, m7, 0x04 ; 6 7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 @@ -2084,34 +2581,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vextracti128 xm7, m8, 1 packuswb xm8, xm7 pshufd xm7, xm8, q3120 - movq [dstq+dsq*0], xm7 - movhps [dstq+dsq*1], xm7 - lea dstq, [dstq+dsq*2] + movq [r7+dsq*0], xm7 + movhps [r7+dsq*1], xm7 + lea r7, [r7+dsq*2] sub hd, 2 jg .hv_w8_loop - add r4, 8 - add r7, 8 + add srcq, 8 + add dstq, 8 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET -%macro PREP_8TAP_H 0 - pshufb m1, m0, m5 - pshufb m2, m0, m6 - pshufb m3, m0, m7 - pmaddubsw m1, m8 - pmaddubsw m0, m2, m8 - pmaddubsw m2, m9 - pmaddubsw m3, m9 - paddw m1, m2 - paddw m0, m3 - phaddw m0, m1, m0 - pmulhrsw m0, m4 -%endmacro - %if WIN64 DECLARE_REG_TMP 6, 4 %else @@ -2119,71 +2600,197 @@ DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, -PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc PREP_8TAP_FN regular, REGULAR, REGULAR -cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, 
stride3 +cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r7, [prep%+SUFFIX] - movsxd wq, wm + mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v +.prep: tzcnt wd, wd movzx wd, word [r7+wq*2+table_offset(prep,)] add wq, r7 - lea r6, [strideq*3] + lea r6, [ssq*3] %if WIN64 pop r7 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m4, [pw_8192] - vbroadcasti128 m5, [subpel_h_shufA] - WIN64_SPILL_XMM 10 - cmp wd, 4 - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m6, [subpel_h_shufB] - vbroadcasti128 m7, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] - vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] - add wq, r7 - jmp wq +.v: + WIN64_SPILL_XMM 10, 12 + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + lea myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX] + vpbroadcastd m9, [pw_8192] + vpbroadcastw m6, [myq+0] + mov nsq, ssq + vpbroadcastw m7, [myq+2] + neg nsq + vpbroadcastw m8, [myq+4] + cmp wd, 8 + jg .v_w16 + je .v_w8 +.v_w4: + movd xm2, [srcq+nsq*2] + pinsrd xm2, [srcq+nsq*1], 1 + vpbroadcastd m1, [srcq+ssq*0] + vpbroadcastd m3, [srcq+ssq*1] + vpbroadcastd m0, [srcq+ssq*2] + vbroadcasti128 m5, [deint_shuf4] + vpblendd m1, m2, 0xeb + punpcklqdq m3, m0 + vpblendd m1, m3, 0x60 ; 0 1 2 _ 2 3 4 _ + pshufb m1, m5 ; 01 12 23 34 +.v_w4_loop: + lea srcq, [srcq+ssq*4] + pinsrd xm0, [srcq+nsq*1], 1 + vpbroadcastd m2, [srcq+ssq*0] + vpbroadcastd m3, [srcq+ssq*1] + vpblendd m2, m0, 0xeb + vpbroadcastd m0, [srcq+ssq*2] + punpcklqdq m3, m0 + vpblendd m2, m3, 0x60 ; 4 5 6 _ 6 7 8 _ + pshufb m2, m5 ; 45 56 67 78 + pmaddubsw m3, m1, m6 ; a0 b0 c0 d0 + vperm2i128 m1, m2, 0x21 ; 23 34 45 56 + pmaddubsw m4, m2, m8 ; a2 b2 
c2 d2 + pmaddubsw m1, m7 ; a1 b1 c1 d1 + paddw m3, m4 + paddw m3, m1 + pmulhrsw m3, m9 + mova m1, m2 + mova [tmpq], m3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+nsq*2] + vpbroadcastq m3, [srcq+nsq*1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklbw m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 23 34 +.v_w8_loop: + lea srcq, [srcq+ssq*4] + pmaddubsw m1, m6 ; a0 + vpbroadcastq m3, [srcq+nsq*1] + pmaddubsw m4, m2, m7 ; a1 + pmaddubsw m5, m2, m6 ; b0 + vpbroadcastq m2, [srcq+ssq*0] + vpblendd m0, m3, 0x30 + vpblendd m3, m2, 0x30 + paddw m4, m1 + punpcklbw m1, m0, m3 ; 45 56 + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m2, m3, 0x30 + vpblendd m3, m0, 0x30 + punpcklbw m2, m3 ; 67 78 + pmaddubsw m3, m1, m7 ; b1 + paddw m5, m3 + pmaddubsw m3, m1, m8 ; a2 + paddw m4, m3 + pmaddubsw m3, m2, m8 ; b2 + paddw m5, m3 + pmulhrsw m4, m9 + pmulhrsw m5, m9 + mova [tmpq+32*0], m4 + mova [tmpq+32*1], m5 + add tmpq, 32*2 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + lea r6d, [wq*2-32] + lea srcq, [srcq+nsq*2] + WIN64_PUSH_XMM 12 + lea r6d, [hq+r6*8] +.v_w16_loop0: + vbroadcasti128 m3, [srcq+ssq*0] + lea r5, [srcq+ssq*2] + vbroadcasti128 m4, [srcq+ssq*1] + mov r7, tmpq + vbroadcasti128 m0, [r5+ssq*0] + vbroadcasti128 m1, [r5+ssq*1] + lea r5, [r5+ssq*2] + vbroadcasti128 m2, [r5+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklbw m1, m3, m4 ; 01 + punpckhbw m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklbw m2, m4, m0 ; 12 + punpckhbw m4, m0 ; 34 +.v_w16_loop: + vbroadcasti128 m5, [r5+ssq*1] + pmaddubsw m10, m1, m6 ; a0 + lea r5, [r5+ssq*2] + pmaddubsw m11, m2, m6 ; b0 + mova m1, m3 + pmaddubsw m3, m7 ; a1 + mova m2, m4 + pmaddubsw m4, m7 ; b1 + paddw m10, m3 + vbroadcasti128 m3, [r5+ssq*0] + paddw m11, m4 + shufpd m4, m0, m5, 0x0d + shufpd m0, m5, m3, 0x0c + punpcklbw m3, m4, m0 
; 45 + punpckhbw m4, m0 ; 56 + pmaddubsw m5, m3, m8 ; a2 + paddw m10, m5 + pmaddubsw m5, m4, m8 ; b2 + paddw m11, m5 + pmulhrsw m10, m9 + pmulhrsw m11, m9 + mova [r7+wq*0], m10 + mova [r7+wq*2], m11 + lea r7, [r7+wq*4] + sub hd, 2 + jg .v_w16_loop + add srcq, 16 + add tmpq, 32 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w16_loop0 + RET .h_w4: + RESET_STACK_STATE movzx mxd, mxb + vbroadcasti128 m3, [subpel_h_shufA] dec srcq - vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] - lea stride3q, [strideq*3] + vpbroadcastd m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + lea r3, [ssq*3] .h_w4_loop: - movq xm0, [srcq+strideq*0] - vpbroadcastq m2, [srcq+strideq*2] - movq xm1, [srcq+strideq*1] - vpblendd m0, m2, 0xf0 - vpbroadcastq m2, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd m1, m2, 0xf0 - pshufb m0, m5 - pshufb m1, m5 - pmaddubsw m0, m6 - pmaddubsw m1, m6 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*2] + movq xm1, [srcq+ssq*1] + vpblendd m0, m2, 0x30 + vpbroadcastq m2, [srcq+r3 ] + lea srcq, [srcq+ssq*4] + vpblendd m1, m2, 0x30 + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m5 + pmaddubsw m1, m5 phaddw m0, m1 pmulhrsw m0, m4 mova [tmpq], m0 @@ -2191,25 +2798,56 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 sub hd, 4 jg .h_w4_loop RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pw_8192] + cmp wd, 4 + je .h_w4 + WIN64_SPILL_XMM 10 + tzcnt wd, wd + vbroadcasti128 m3, [z_filter_s+ 2] + shr mxd, 16 + vbroadcasti128 m5, [z_filter_s+ 6] + sub srcq, 2 + vbroadcasti128 m6, [z_filter_s+10] + lea mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX] + movzx wd, word [r7+wq*2+table_offset(prep, _6tap_h)] + vpbroadcastw m7, [mxq+0] + vpbroadcastw m8, [mxq+2] + add wq, r7 + vpbroadcastw m9, [mxq+4] + jmp wq .h_w8: - movu xm0, [srcq+strideq*0] - vinserti128 m0, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - PREP_8TAP_H + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] +%macro PREP_6TAP_H 0 + 
pshufb m1, m0, m3 + pmaddubsw m1, m7 + pshufb m2, m0, m5 + pmaddubsw m2, m8 + pshufb m0, m6 + pmaddubsw m0, m9 + paddw m1, m2 + paddw m0, m1 + pmulhrsw m0, m4 +%endmacro + PREP_6TAP_H mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, [srcq+strideq*0+8*1], 1 - PREP_8TAP_H + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*0+8*1], 1 + PREP_6TAP_H mova [tmpq+32*0], m0 - movu xm0, [srcq+strideq*1+8*0] - vinserti128 m0, [srcq+strideq*1+8*1], 1 - lea srcq, [srcq+strideq*2] - PREP_8TAP_H + movu xm0, [srcq+ssq*1+8*0] + vinserti128 m0, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PREP_6TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 sub hd, 2 @@ -2229,27 +2867,219 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 .h_loop: movu xm0, [srcq+r6+8*0] vinserti128 m0, [srcq+r6+8*1], 1 - PREP_8TAP_H + PREP_6TAP_H mova [tmpq+32*0], m0 movu xm0, [srcq+r6+8*2] vinserti128 m0, [srcq+r6+8*3], 1 - PREP_8TAP_H + PREP_6TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 add r6, 32 jle .h_loop - add srcq, strideq + add srcq, ssq mov r6, r5 dec hd jg .h_loop RET +.hv: + WIN64_SPILL_XMM 14, 16 + cmp wd, 4 + jne .hv_w8 +.hv_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mova m6, [subpel_h_shuf4] + vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX] + mov nsq, ssq + pmovzxbd m13, [deint_shuf4] + neg nsq + vpbroadcastd m8, [pw_8192] + vpbroadcastd m9, [pd_32] + punpcklbw m0, m0 + vpbroadcastq m2, [srcq+nsq*2] + psraw m0, 8 ; sign-extend + vpbroadcastq m4, [srcq+nsq*1] + pshufd m10, m0, q0000 + vpbroadcastq m1, [srcq+ssq*0] + pshufd m11, m0, q1111 + vpbroadcastq m3, [srcq+ssq*1] + pshufd m12, m0, q2222 + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m2, m4, 0xcc ; 0 1 + vpblendd m1, m3, 0xcc ; 2 3 + pshufb m2, m6 + pshufb m1, m6 + pshufb m0, m6 + pmaddubsw m2, m7 + pmaddubsw m1, m7 + pmaddubsw m0, m7 + phaddw m2, 
m1 ; 0 1 2 3 + phaddw m0, m0 ; 4 + pmulhrsw m2, m8 + pmulhrsw m0, m8 + palignr m0, m2, 4 + punpcklwd m1, m2, m0 ; 01 12 + punpckhwd m2, m0 ; 23 34 +.hv_w4_loop: + pmaddwd m4, m10, m1 ; a0 b0 + lea srcq, [srcq+ssq*4] + pmaddwd m5, m2, m10 ; c0 d0 + vpbroadcastq m1, [srcq+nsq*1] + pmaddwd m2, m11 ; a1 b1 + vpbroadcastq m3, [srcq+ssq*0] + paddd m4, m2 + vpbroadcastq m2, [srcq+ssq*1] + vpblendd m1, m3, 0xcc ; 5 6 + vpbroadcastq m3, [srcq+ssq*2] + vpblendd m2, m3, 0xcc ; 7 8 + pshufb m1, m6 + pshufb m2, m6 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + phaddw m1, m2 ; 5 6 7 8 + pmulhrsw m1, m8 + paddd m5, m9 + paddd m4, m9 + palignr m2, m1, m0, 12 + mova m0, m1 + punpcklwd m1, m2, m0 ; 45 56 + punpckhwd m2, m0 ; 67 78 + pmaddwd m3, m11, m1 ; c1 d1 + paddd m5, m3 + pmaddwd m3, m12, m1 ; a2 b2 + paddd m4, m3 + pmaddwd m3, m12, m2 ; c2 d2 + paddd m5, m3 + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + vpermd m4, m13, m4 + mova [tmpq], m4 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + lea mxq, [r7+mxq*8+subpel_filters+1-prep_avx2] + WIN64_PUSH_XMM 16 + vpbroadcastw m10, [mxq+0] + vpbroadcastw m11, [mxq+2] + vpbroadcastw m12, [mxq+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep_avx2] + lea r7, [ssq*2+2] + vbroadcasti128 m8, [z_filter_s+ 6] + punpcklbw m0, m0 + vbroadcasti128 m9, [z_filter_s+10] + psraw m0, 8 ; sign-extend + lea r6d, [wq*8-64] + pshufd m13, m0, q0000 + sub srcq, r7 + pshufd m14, m0, q1111 + lea r6d, [hq+r6*4] + pshufd m15, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m7, [z_filter_s+2] + movu xm3, [srcq+ssq*0] + lea r5, [srcq+ssq*2] + movu xm4, [srcq+ssq*1] + vbroadcasti128 m0, [r5+ssq*0] + mov r7, tmpq + vinserti128 m4, [r5+ssq*1], 1 ; 1 3 + lea r5, [r5+ssq*2] + vpblendd m3, m0, 0xf0 ; 0 2 + vinserti128 m0, [r5+ssq*0], 1 ; 2 4 + vpbroadcastd m5, [pw_8192] + HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9 + vpermq 
m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + pmulhrsw m3, m5 + pmulhrsw m4, m5 + pmulhrsw m0, m5 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + movu xm7, [r5+ssq*1] + lea r5, [r5+ssq*2] + vinserti128 m7, [r5+ssq*0], 1 ; 5 6 + pmaddwd m5, m13, m1 ; a0 + mova m1, m3 + pmaddwd m6, m13, m2 ; b0 + mova m2, m4 + pmaddwd m3, m14 ; a1 + pmaddwd m4, m14 ; b1 + paddd m5, m3 + vbroadcasti128 m3, [z_filter_s+2] + paddd m6, m4 + HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9 + vpbroadcastd m3, [pw_8192] + vpbroadcastd m4, [pd_32] + pmulhrsw m7, m3 + paddd m5, m4 + paddd m6, m4 + mova m4, m0 + vpermq m0, m7, q3120 + shufpd m4, m0, 0x05 + punpcklwd m3, m4, m0 ; 45 + pmaddwd m7, m15, m3 ; a2 + punpckhwd m4, m0 ; 67 + paddd m5, m7 + pmaddwd m7, m15, m4 ; b2 + paddd m6, m7 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + vpermq m5, m5, q3120 + mova [r7+wq*0], xm5 + vextracti128 [r7+wq*2], m5, 1 + lea r7, [r7+wq*4] + sub hd, 2 + jg .hv_w8_loop + add srcq, 8 + add tmpq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc +PREP_8TAP_FN sharp, SHARP, SHARP + +cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep%+SUFFIX] + mov wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep .v: - WIN64_SPILL_XMM 16 + WIN64_SPILL_XMM 12, 15 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. 
shr myd, 16 ; Note that the code is 8-tap only, having cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 cmove myd, mxd ; had a negligible effect on performance. - ; TODO: Would a 6-tap code path be worth it? lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q @@ -2359,72 +3189,154 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w8_loop RET .v_w16: - add wd, wd - mov r5, srcq - mov r7, tmpq - lea r6d, [hq+wq*8-256] + lea r6d, [wq*2-32] + WIN64_PUSH_XMM 15 + lea r6d, [hq+r6*8] .v_w16_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m0, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*0] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m1, [srcq+strideq*0] - vbroadcasti128 m2, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m3, [srcq+strideq*0] - shufpd m4, m4, m0, 0x0c - shufpd m5, m5, m1, 0x0c + lea r5, [srcq+strideq*2] + vbroadcasti128 m0, [r5+strideq*1] + vbroadcasti128 m6, [r5+strideq*0] + lea r5, [r5+strideq*2] + vbroadcasti128 m1, [r5+strideq*0] + vbroadcasti128 m2, [r5+strideq*1] + lea r5, [r5+strideq*2] + vbroadcasti128 m3, [r5+strideq*0] + mov r7, tmpq + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 - shufpd m6, m6, m2, 0x0c + shufpd m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 - shufpd m0, m0, m3, 0x0c + shufpd m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: - vbroadcasti128 m12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m13, [srcq+strideq*0] - pmaddubsw m14, m1, m8 ; a0 - pmaddubsw m15, m2, m8 ; b0 + vbroadcasti128 m12, [r5+strideq*1] + lea r5, [r5+strideq*2] + pmaddubsw m13, m1, m8 ; a0 + pmaddubsw m14, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 - paddw m14, m3 - paddw m15, m4 + paddw m13, m3 + paddw m14, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, 
m10 ; a2 pmaddubsw m6, m10 ; b2 - paddw m14, m5 - paddw m15, m6 + paddw m13, m5 + vbroadcasti128 m5, [r5+strideq*0] + paddw m14, m6 shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c + shufpd m0, m12, m5, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 - pmaddubsw m13, m6, m11 ; b3 + paddw m13, m12 + pmaddubsw m12, m6, m11 ; b3 paddw m14, m12 - paddw m15, m13 + pmulhrsw m13, m7 pmulhrsw m14, m7 - pmulhrsw m15, m7 - mova [tmpq+wq*0], m14 - mova [tmpq+wq*1], m15 - lea tmpq, [tmpq+wq*2] + mova [r7+wq*0], m13 + mova [r7+wq*2], m14 + lea r7, [r7+wq*4] sub hd, 2 jg .v_w16_loop - add r5, 16 - add r7, 32 + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET +.h: +.h_w4: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pw_8192] + cmp wd, 4 + je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4 + WIN64_SPILL_XMM 10 + vbroadcasti128 m5, [subpel_h_shufA] + tzcnt wd, wd + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] + vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] + add wq, r7 + jmp wq +.h_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] +%macro PREP_8TAP_H 0 + pshufb m1, m0, m5 + pshufb m2, m0, m6 + pshufb m3, m0, m7 + pmaddubsw m1, m8 + pmaddubsw m0, m2, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + paddw m1, m2 + paddw m0, m3 + phaddw m0, m1, m0 + pmulhrsw m0, m4 +%endmacro + PREP_8TAP_H + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+strideq*1+8*0] + vinserti128 m0, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: 
+ xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + mov r5, r6 +.h_loop: + movu xm0, [srcq+r6+8*0] + vinserti128 m0, [srcq+r6+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+r6+8*2] + vinserti128 m0, [srcq+r6+8*3], 1 + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + add r6, 32 + jle .h_loop + add srcq, strideq + mov r6, r5 + dec hd + jg .h_loop + RET .hv: WIN64_SPILL_XMM 16 cmp wd, 4 @@ -2542,28 +3454,27 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 RET .hv_w8: lea r6d, [wq*8-64] - mov r5, srcq - mov r7, tmpq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+strideq*0] + lea r5, [srcq+strideq*2] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] + mov r7, tmpq vbroadcasti128 m9, [subpel_h_shufC] - movu xm6, [srcq+strideq*0] - vbroadcasti128 m0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpblendd m4, m0, 0xf0 ; 0 3 - vinserti128 m5, [srcq+strideq*0], 1 ; 1 4 - vinserti128 m6, [srcq+strideq*1], 1 ; 2 5 - lea srcq, [srcq+strideq*2] - vinserti128 m0, [srcq+strideq*0], 1 ; 3 6 - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + movu xm6, [r5+strideq*0] + vbroadcasti128 m0, [r5+strideq*1] + lea r5, [r5+strideq*2] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [r5+strideq*0], 1 ; 1 4 + vinserti128 m6, [r5+strideq*1], 1 ; 2 5 + lea r5, [r5+strideq*2] + vinserti128 m0, [r5+strideq*0], 1 ; 3 6 + HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 @@ -2580,10 +3491,10 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 
.hv_w8_loop: - vextracti128 [tmpq], m0, 1 ; not enough registers - movu xm0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti128 m0, [srcq+strideq*0], 1 ; 7 8 + vextracti128 [r7], m0, 1 ; not enough registers + movu xm0, [r5+strideq*1] + lea r5, [r5+strideq*2] + vinserti128 m0, [r5+strideq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 @@ -2601,15 +3512,15 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] - HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_32] - vbroadcasti128 m6, [tmpq] + vbroadcasti128 m6, [r7] pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 - vpermq m7, m0, q3120 ; 7 8 - shufpd m6, m6, m7, 0x04 ; 6 7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 @@ -2620,16 +3531,14 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 psrad m7, 6 packssdw m8, m7 vpermq m7, m8, q3120 - mova [tmpq+wq*0], xm7 - vextracti128 [tmpq+wq*2], m7, 1 - lea tmpq, [tmpq+wq*4] + mova [r7+wq*0], xm7 + vextracti128 [r7+wq*2], m7, 1 + lea r7, [r7+wq*4] sub hd, 2 jg .hv_w8_loop - add r5, 8 - add r7, 16 + add srcq, 8 + add tmpq, 16 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET @@ -4008,14 +4917,14 @@ DECLARE_REG_TMP 6, 8 %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN put -PUT_8TAP_SCALED_FN sharp, SHARP, SHARP -PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, 
put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_8bpc PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put @@ -4026,14 +4935,14 @@ DECLARE_REG_TMP 6, 7 %endif BILIN_SCALED_FN prep -PREP_8TAP_SCALED_FN sharp, SHARP, SHARP -PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep |