diff options
Diffstat (limited to 'third_party/dav1d/src/x86/ipred16_avx512.asm')
-rw-r--r-- | third_party/dav1d/src/x86/ipred16_avx512.asm | 610 |
1 files changed, 610 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm index 8124a3b145..69802614c7 100644 --- a/third_party/dav1d/src/x86/ipred16_avx512.asm +++ b/third_party/dav1d/src/x86/ipred16_avx512.asm @@ -79,14 +79,17 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 z_filter_k: dw 8, 8, 6, 6, 4, 4 dw 4, 4, 5, 5, 4, 4 dw 0, 0, 0, 0, 2, 2 +pb_90: times 4 db 90 pw_15: times 2 dw 15 pw_16: times 2 dw 16 pw_17: times 2 dw 17 pw_24: times 2 dw 24 +pw_31: times 2 dw 31 pw_32: times 2 dw 32 pw_63: times 2 dw 63 pw_64: times 2 dw 64 pw_512: times 2 dw 512 +pw_2048: times 2 dw 2048 pw_31806: times 2 dw 31806 pw_32640: times 2 dw 32640 pw_32672: times 2 dw 32672 @@ -114,6 +117,7 @@ JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 @@ -1174,6 +1178,612 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx mov rsp, r7 RET +cglobal ipred_z2_16bpc, 3, 9, 16, dst, stride, tl, w, h, angle, dx, _, dy + tzcnt wd, wm + movifnidn angled, anglem + lea dxq, [dr_intra_derivative-90] + movzx dyd, angleb + xor angled, 0x400 + mov r7, dxq + sub dxq, dyq + movifnidn hd, hm + and dyd, ~1 + vpbroadcastw m12, [tlq] + and dxq, ~1 + movzx dyd, word [r7+dyq] ; angle - 90 + lea r7, [z_filter_t0] + movzx dxd, word [dxq+270] ; 180 - angle + mova m0, [base+pw_31to0] + movsxd wq, [base+ipred_z2_16bpc_avx512icl_table+wq*4] + movu m4, [tlq+2] + neg dyd + vpermw m7, m0, [tlq-64*1] + lea wq, [base+ipred_z2_16bpc_avx512icl_table+wq] + vpbroadcastd m14, [base+pw_31806] + vpbroadcastd m15, [base+pw_1] + jmp wq +.w4: + movq xm3, [tlq] + vpbroadcastq m8, [base+pw_1to32] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + pshuflw xm0, xm4, q3321 + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + call .upsample_above + punpcklwd xm4, xm3, xm4 + palignr xm3, xm4, xm12, 14 + jmp .w4_main +.w4_upsample_left: + call .upsample_left + movsldup m1, [base+z_xpos_mul] + paddw m1, m1 + jmp .w4_main2 +.w4_no_upsample_above: + lea r3d, [hq+3] + vpbroadcastd ym0, [base+pw_3] + sub angled, 1112 ; angle - 90 + call .filter_above2 + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + palignr xm3, xm4, xm12, 14 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + call .filter_left16 +.w4_main: + movsldup m1, [base+z_xpos_mul] + psllw m15, 3 +.w4_main2: + vpbroadcastq m0, [base+pw_1to32] + vpbroadcastw m11, dxd + movsldup m2, [base+z_xpos_mul] + vpbroadcastw m13, dyd + vpbroadcastd m5, [tlq-2] + psllw m10, m8, 6 + valignq m5, m7, m5, 6 + pmullw m2, m11 + psubw m10, m2 ; xpos + pmullw m13, m0 ; ypos + palignr m5, m7, m5, 14 + psrlw m12, m13, 6 + psllw m13, 9 + paddw m12, m1 ; base_y + pand m13, m14 ; frac_y << 9 + psllw m11, 3 + lea r5, [strideq*3] +.w4_loop: + psrlw m1, m10, 6 ; base_x + pand m2, m14, m10 ; frac + vpermw m0, m1, m3 ; top[base_x] + vpermw m1, m1, m4 ; top[base_x+1] + vpmovw2m k1, m10 ; base_x < 0 + psllw m2, 9 + vpermw m0{k1}, m12, m5 ; left[base_y] + vpermw m1{k1}, m12, m7 ; left[base_y+1] + vmovdqu16 m2{k1}, m13 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + sub hd, 8 + jl .w4_end + vextracti32x8 ym0, m0, 1 + psubw m10, m11 ; base_x -= dx + lea dstq, [dstq+strideq*4] + paddw m12, m15 ; base_y++ + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample_above: ; w4/w8 + mova ym9, [base+pw_1to32] + palignr xm1, xm4, xm12, 12 + paddw xm3, xm4 ; b+c + xor angled, 0x7f ; 180 - angle + paddw xm0, xm1 ; a+d + vpbroadcastw xm1, r9m ; pixel_max + vpbroadcastb xm11, r3d + psubw xm0, xm3, xm0 + vpbroadcastb xm2, angled + psraw xm0, 3 + shr angled, 8 + paddw xm3, xm0 + pxor xm0, xm0 + vpcmpeqb k2, xm11, [base+z_filter_wh] + pmaxsw xm3, xm0 + add dxd, dxd + pavgw xm3, xm0 + vpcmpgtb k2{k2}, xm2, [base+z_filter_t0+angleq*8] + pminsw xm3, xm1 + paddw m8, m8 + jmp .filter_left16b +.upsample_left: ; h4/h8 + lea r3d, [hq-1] + palignr xm2, xm7, xm12, 14 + vpbroadcastw xm0, r3d + palignr xm1, xm7, xm12, 12 + pminuw xm0, xm9 + paddw xm2, xm7 ; b+c + vpermw xm0, xm0, xm7 + add dyd, dyd + paddw xm0, xm1 ; a+d + vpbroadcastw xm1, r9m ; pixel_max + psubw xm0, xm2, xm0 + psraw xm0, 3 + paddw xm2, xm0 + pxor xm0, xm0 + pmaxsw xm2, xm0 + pavgw xm2, xm0 + pminsw xm2, xm1 + punpckhwd xm0, xm2, xm7 + punpcklwd xm7, xm2, xm7 + vinserti32x4 ym7, xm0, 1 + ret +.filter_above: + sub angled, 90 +.filter_above2: + vpbroadcastb ym1, r3d + vpbroadcastb ym10, angled + mov r3d, angled + shr r3d, 8 + vpcmpeqb k2, ym1, [base+z_filter_wh] + mova xm11, [base+z_filter_t0+r3*8] + vpcmpgtb k1{k2}, ym10, ym11 + mova m9, [base+pw_1to32] + kmovd r3d, k1 + test r3d, r3d + jz .filter_end + pminuw ym0, ym9 + popcnt r3d, r3d + vpbroadcastd ym6, r7m ; max_w + kxnorw k1, k1, k1 + vpbroadcastd ym5, [base+z_filter_k+(r3-1)*4+12*0] + kaddw k1, k1, k1 ; ~1 + vpbroadcastd ym13, [base+z_filter_k+(r3-1)*4+12*1] + vpermw ym2, ym0, ym4 ; +1 + pmullw ym5, ym4 + paddw ym1, ym2, ym3 + vmovdqu16 m3{k1}, [tlq-2] ; -2 + vpermw ym2, ym0, ym2 ; +2 + vpbroadcastd ym0, [base+z_filter_k+(r3-1)*4+12*2] + pmullw ym1, ym13 + movu m13, [base+pw_0to31] + paddw ym2, ym3 + packssdw ym6, ym6 + pmullw ym2, ym0 + paddw ym1, ym5 + vpcmpgtw k1, ym6, ym13 + paddw ym1, ym2 + pxor ym2, ym2 + psrlw ym1, 3 + pavgw ym4{k1}, ym1, ym2 +.filter_end: + ret +.filter_left16: + vpbroadcastd ym1, [base+pb_90] + psubb ym1, ym10 + vpcmpgtb k2{k2}, ym1, ym11 +.filter_left16b: + kmovd r3d, k2 + test r3d, r3d + jz .filter_end + lea r5d, [hq-1] + vinserti32x4 ym0, ym12, xm7, 1 + vpbroadcastw ym1, r5d + popcnt r3d, r3d + vpbroadcastd ym6, r8m ; max_h + pminuw ym9, ym1 + vpbroadcastd ym5, [base+z_filter_k+(r3-1)*4+12*0] + vpermw ym2, ym9, ym7 ; +1 + vpbroadcastd ym10, [base+z_filter_k+(r3-1)*4+12*1] + palignr ym1, ym7, ym0, 14 ; -1 + pmullw ym5, ym7 + palignr ym0, ym7, ym0, 12 ; -2 + paddw ym1, ym2 + vpermw ym2, ym9, ym2 ; +2 + vpbroadcastd ym9, [base+z_filter_k+(r3-1)*4+12*2] + pmullw ym1, ym10 + paddw ym2, ym0 + packssdw ym6, ym6 + pmullw ym2, ym9 + paddw ym1, ym5 + vpcmpgtw k1, ym6, [base+pw_0to31] + paddw ym1, ym2 + pxor ym2, ym2 + psrlw ym1, 3 + pavgw ym7{k1}, ym1, ym2 + ret +.filter_left: + cmp hd, 32 + jl .filter_left16 + vpbroadcastd m5, [base+pw_3] + pminud m0, m9, [base+pw_31] {1to16} +.filter_left32: + vpbroadcastd m6, r8m ; max_h + valignq m2, m7, m12, 6 + packssdw m6, m6 + palignr m1, m7, m2, 14 ; -1 + paddw m1, m7 + palignr m2, m7, m2, 12 ; -2 + vpcmpgtw k1, m6, m13 + paddw m2, m5 + cmp hd, 64 + je .filter_left64 + lea r3d, [hq-1] + vpbroadcastw m10, r3d + pminuw m0, m10 + vpermw m10, m0, m7 ; +1 + paddw m1, m10 + vpermw m10, m0, m10 ; +2 + pavgw m2, m10 + paddw m1, m2 + vpsrlw m7{k1}, m1, 2 + ret +.filter_left64: + valignq m10, m8, m7, 2 + vpaddd m13, [base+pw_32] {1to16} + palignr m11, m10, m7, 2 ; +1 + paddw m1, m11 + palignr m11, m10, m7, 4 ; +2 + valignq m10, m8, m7, 6 + pavgw m11, m2 + vpermw m2, m0, m8 ; 32+1 + paddw m1, m11 + vpsrlw m7{k1}, m1, 2 + palignr m1, m8, m10, 14 ; 32-1 + paddw m1, m8 + palignr m10, m8, m10, 12 ; 32-2 + paddw m1, m2 + vpermw m2, m0, m2 ; 32+2 + paddw m10, m5 + vpcmpgtw k1, m6, m13 + pavgw m2, m10 + paddw m1, m2 + vpsrlw m8{k1}, m1, 2 + ret +.w8: + mova xm3, [tlq] + vbroadcasti32x4 m8, [base+pw_1to32] + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + psrldq xm0, xm4, 2 + sub angled, 53 + pshufhw xm0, xm0, q2210 + lea r3d, [hq+7] + call .upsample_above + punpcklwd xm0, xm3, xm4 + punpckhwd xm4, xm3, xm4 + vinserti32x4 ym3, ym12, xm0, 1 + vinserti32x4 ym4, ym0, xm4, 1 + palignr ym3, ym4, ym3, 14 + jmp .w8_main +.w8_upsample_left: + call .upsample_left + movshdup m1, [base+z_xpos_mul] + psllw m15, 3 + paddw m1, m1 + jmp .w8_main2 +.w8_no_upsample_above: + lea r3d, [hq+7] + vpbroadcastd ym0, [base+pw_7] + call .filter_above + lea r3d, [angleq-51] + mov r3b, hb + palignr xm3, xm4, xm12, 14 + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + call .filter_left +.w8_main: + movshdup m1, [base+z_xpos_mul] + psllw m15, 2 +.w8_main2: + vbroadcasti32x4 m0, [base+pw_1to32] + vpbroadcastw m11, dxd + movshdup m2, [base+z_xpos_mul] + vpbroadcastw m13, dyd + psllw m10, m8, 6 + valignq m5, m7, m12, 6 + pmullw m2, m11 + psubw m10, m2 ; xpos + pmullw m13, m0 ; ypos + palignr m5, m7, m5, 14 + psrlw m12, m13, 6 + psllw m13, 9 + mov r2d, 1<<6 + paddw m12, m1 ; base_y + lea r3d, [dxq-(8<<6)] ; left-only threshold + pand m13, m14 ; frac_y << 9 + shl dxd, 2 + psllw m11, 2 + lea r5, [strideq*3] +.w8_loop: + psrlw m1, m10, 6 + pand m2, m14, m10 + vpermw m0, m1, m3 + vpermw m1, m1, m4 + psllw m2, 9 + sub r2d, dxd + jge .w8_toponly + vpmovw2m k1, m10 + vpermw m0{k1}, m12, m5 + vpermw m1{k1}, m12, m7 + vmovdqu16 m2{k1}, m13 +.w8_toponly: + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r5 ], m0, 3 + sub hd, 4 + jz .w8_end + psubw m10, m11 ; base_x -= dx + lea dstq, [dstq+strideq*4] + paddw m12, m15 ; base_y++ + cmp r2d, r3d + jge .w8_loop +.w8_leftonly_loop: + vpermw m0, m12, m5 + vpermw m1, m12, m7 + psubw m1, m0 + pmulhrsw m1, m13 + paddw m12, m15 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r5 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_leftonly_loop +.w8_end: + RET +.w16: + mova ym3, [tlq] + vpermw m8, m0, [tlq-64*2] + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + vpbroadcastd ym0, [base+pw_15] + call .filter_above + call .filter_left + vinserti32x4 ym3, ym12, xm4, 1 + palignr ym3, ym4, ym3, 14 +.w16_main: + vbroadcasti32x8 m0, [base+pw_1to32] + vpbroadcastw m11, dxd + vpbroadcastw m13, dyd + kxnorw k2, k2, k2 + psllw m10, m0, 6 + valignq m5, m7, m12, 6 + psubw m10, m11 ; xpos + valignq m6, m8, m7, 6 + pmullw m13, m0 ; ypos + knotd k1, k2 + palignr m5, m7, m5, 14 + palignr m6, m8, m6, 14 + vpsubw m10{k1}, m11 + psrlw m12, m13, 6 + psllw m13, 9 + mov r2d, 1<<6 + vpsubw m12{k2}, m15 ; base_y + pand m13, m14 ; frac_y << 9 + lea r3d, [dxq-(16<<6)] + paddw m11, m11 + add dxd, dxd + paddw m15, m15 +.w16_loop: + psrlw m1, m10, 6 + pand m2, m14, m10 + vpermw m0, m1, m3 + vpermw m1, m1, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m12, m15 ; base_y++ + paddw m0, m1 + sub r2d, dxd + jge .w16_toponly + mova m1, m5 + vpermt2w m1, m12, m6 + mova m2, m7 + vpermt2w m2, m12, m8 + vpmovw2m k1, m10 + psubw m2, m1 + pmulhrsw m2, m13 + vpaddw m0{k1}, m1, m2 +.w16_toponly: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w16_end + psubw m10, m11 ; base_x -= dx + lea dstq, [dstq+strideq*2] + cmp r2d, r3d + jge .w16_loop + paddw m12, m15 + vpermt2w m5, m12, m6 + mova m1, m7 + vpermt2w m1, m12, m8 + jmp .w16_leftonly_loop_start +.w16_leftonly_loop: + mova m1, m7 + vpermt2w m1, m12, m8 + vshufi32x4 m5, m1, q1032 +.w16_leftonly_loop_start: + psubw m0, m1, m5 + pmulhrsw m0, m13 + paddw m12, m15 + paddw m0, m5 + mova m5, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_leftonly_loop +.w16_end: + RET +.w32: + mova m3, [tlq] + vpermw m8, m0, [tlq-64*2] + mova m9, [base+pw_1to32] + test angled, 0x400 + jnz .w32_main + pminud m0, m9, [base+pw_31] {1to16} + mov r3d, ~1 + kmovd k1, r3d + vpbroadcastd m5, [base+pw_3] + vpbroadcastd m6, r6m ; max_w + vpermw m2, m0, m4 ; +1 + movu m13, [base+pw_0to31] + paddw m1, m4, m3 + vmovdqu16 m3{k1}, [tlq-2] ; -2 + packssdw m6, m6 + paddw m1, m2 + vpermw m2, m0, m2 ; +2 + paddw m3, m5 + vpcmpgtw k1, m6, m13 + pavgw m2, m3 + paddw m1, m2 + psrlw m4{k1}, m1, 2 + call .filter_left32 +.w32_main: + sub rsp, 64*2 + call .w32_main1 + add rsp, 64*2 + RET +.w32_main1: + vpbroadcastw m11, dxd + movu [rsp+64], m4 + vpbroadcastw m4, dyd + movd [rsp+60], xm12 + valignq m5, m7, m12, 6 + psllw m3, m9, 6 ; xpos + valignq m6, m8, m7, 6 + pmullw m9, m4 ; ypos + palignr m5, m7, m5, 14 + mov r2d, 33<<6 + palignr m6, m8, m6, 14 + mova m10, m3 +.w32_main2: + psllw m13, m9, 9 + sub r2d, dxd + psrlw m12, m9, 6 ; base_y + mov r8d, hd + pand m13, m14 ; frac_y << 9 +.w32_loop: + mov r3d, r2d + shr r3d, 6 + psubw m10, m11 ; base_x -= dx + movu m0, [rsp+r3*2-2] + pand m2, m10, m14 ; frac_x + movu m1, [rsp+r3*2] + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m12, m15 ; base_y++ + paddw m0, m1 + cmp r2d, 32<<6 + jge .w32_toponly + mova m1, m5 + vpermt2w m1, m12, m6 + mova m2, m7 + vpermt2w m2, m12, m8 + vpmovw2m k1, m10 + psubw m2, m1 + pmulhrsw m2, m13 + vpaddw m0{k1}, m1, m2 +.w32_toponly: + mova [dstq], m0 + dec r8d + jz .w32_end + add dstq, strideq + sub r2d, dxd + jge .w32_loop + paddw m12, m15 + mova m2, m5 + vpermt2w m2, m12, m6 +.w32_leftonly_loop: + mova m1, m7 + vpermt2w m1, m12, m8 + psubw m0, m1, m2 + pmulhrsw m0, m13 + paddw m12, m15 + paddw m0, m2 + mova m2, m1 + mova [dstq], m0 + add dstq, strideq + dec r8d + jg .w32_leftonly_loop +.w32_end: + ret +.w64: + movu m3, [tlq+66] + vpermw m8, m0, [tlq-64*2] + mova m9, [base+pw_1to32] + test angled, 0x400 + jnz .w64_main + mova m2, [tlq] ; -1 + mov r3d, ~1 + vpbroadcastd m5, [base+pw_3] + kmovd k1, r3d + movu m13, [base+pw_0to31] + vpbroadcastd m6, r6m ; max_w + pminud m0, m9, [base+pw_31] {1to16} + paddw m1, m4, m2 + vmovdqu16 m2{k1}, [tlq-2] ; -2 + packssdw m6, m6 + paddw m1, [tlq+4] ; +1 + paddw m2, m5 + vpcmpgtw k1, m6, m13 + pavgw m2, [tlq+6] ; +2 + paddw m1, m2 + vpermw m2, m0, m3 ; 32+1 + psrlw m4{k1}, m1, 2 + paddw m1, m3, [tlq+64] ; 32-1 + vpaddd m11, m13, [base+pw_32] {1to16} + paddw m1, m2 + vpermw m2, m0, m2 ; 32+2 + paddw m10, m5, [tlq+62] ; 32-2 + vpcmpgtw k1, m6, m11 + pavgw m2, m10 + paddw m1, m2 + psrlw m3{k1}, m1, 2 + call .filter_left32 +.w64_main: + sub rsp, 64*3 + movu [rsp+64*2-gprsize], m3 + mov r5, dstq + call .w32_main1 + psllw m4, 5 + mov r2d, 65<<6 + vpaddd m10, m3, [base+pw_2048] {1to16} ; xpos + lea dstq, [r5+64] + paddw m9, m4 ; ypos + call .w32_main2 + add rsp, 64*3 + RET + cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy lea r7, [z_filter_t0] tzcnt wd, wm |