diff options
Diffstat (limited to 'third_party/dav1d/src')
-rw-r--r-- | third_party/dav1d/src/arm/64/itx.S | 7 | ||||
-rw-r--r-- | third_party/dav1d/src/decode.c | 42 | ||||
-rw-r--r-- | third_party/dav1d/src/picture.c | 44 | ||||
-rw-r--r-- | third_party/dav1d/src/riscv/64/itx.S | 803 | ||||
-rw-r--r-- | third_party/dav1d/src/riscv/itx.h | 6 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/ipred.h | 1 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/ipred16_avx512.asm | 610 |
7 files changed, 1416 insertions, 97 deletions
diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S index b1b2f8fe65..53490cd677 100644 --- a/third_party/dav1d/src/arm/64/itx.S +++ b/third_party/dav1d/src/arm/64/itx.S @@ -1426,6 +1426,7 @@ endfunc function inv_txfm_add_16x16_neon mov x15, x30 sub sp, sp, #512 + mov x8, #16*2 .irp i, 0, 8 add x6, sp, #(\i*16*2) .if \i == 8 @@ -1433,7 +1434,6 @@ function inv_txfm_add_16x16_neon b.lt 1f .endif add x7, x2, #(\i*2) - mov x8, #16*2 blr x9 .endr b 2f @@ -1449,7 +1449,6 @@ function inv_txfm_add_16x16_neon .irp i, 0, 8 add x6, x0, #(\i) add x7, sp, #(\i*2) - mov x8, #32 bl inv_txfm_add_vert_8x16_neon .endr @@ -2461,10 +2460,10 @@ function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 b.gt 2b 3: + mov x8, #32*2 .irp i, 0, 8, 16, 24 add x6, x0, #(\i) add x7, sp, #(\i*2) - mov x8, #32*2 bl inv_txfm_add_vert_8x16_neon .endr @@ -3205,10 +3204,10 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 3: adr x5, inv_dct_8h_x16_neon + mov x8, #64*2 .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x4, #(\i*2) - mov x8, #64*2 bl inv_txfm_add_vert_8x16_neon .endr diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c index 97d15ca1c6..eed9dfb756 100644 --- a/third_party/dav1d/src/decode.c +++ b/third_party/dav1d/src/decode.c @@ -2616,6 +2616,25 @@ static void read_restoration_info(Dav1dTaskContext *const t, } } +// modeled after the equivalent function in aomdec:decodeframe.c +static int check_trailing_bits_after_symbol_coder(const MsacContext *const msac) { + // check marker bit (single 1), followed by zeroes + const int n_bits = -(msac->cnt + 14); + assert(n_bits <= 0); // this assumes we errored out when cnt <= -15 in caller + const int n_bytes = (n_bits + 7) >> 3; + const uint8_t *p = &msac->buf_pos[n_bytes]; + const int pattern = 128 >> ((n_bits - 1) & 7); + if ((p[-1] & (2 * pattern - 1)) != pattern) + return 1; + + // check remainder zero bytes + for (; p < msac->buf_end; p++) + if (*p) + return 1; + + 
return 0; +} + int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { const Dav1dFrameContext *const f = t->f; const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64; @@ -2659,9 +2678,6 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { return 0; } - // error out on symbol decoder overread - if (ts->msac.cnt < -15) return 1; - if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, @@ -2767,7 +2783,12 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)], &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver); - return 0; + // error out on symbol decoder overread + if (ts->msac.cnt <= -15) return 1; + + return c->strict_std_compliance && + (t->by >> f->sb_shift) + 1 >= f->frame_hdr->tiling.row_start_sb[tile_row + 1] && + check_trailing_bits_after_symbol_coder(&ts->msac); } int dav1d_decode_frame_init(Dav1dFrameContext *const f) { @@ -3262,7 +3283,7 @@ error: return retval; } -void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) { +void dav1d_decode_frame_exit(Dav1dFrameContext *const f, int retval) { const Dav1dContext *const c = f->c; if (f->sr_cur.p.data[0]) @@ -3273,8 +3294,16 @@ void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) { (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); } for (int i = 0; i < 7; i++) { - if (f->refp[i].p.frame_hdr) + if (f->refp[i].p.frame_hdr) { + if (!retval && c->n_fc > 1 && c->strict_std_compliance && + atomic_load(&f->refp[i].progress[1]) == FRAME_ERROR) + { + retval = DAV1D_ERR(EINVAL); + atomic_store(&f->task_thread.error, 1); + atomic_store(&f->sr_cur.progress[1], FRAME_ERROR); + } dav1d_thread_picture_unref(&f->refp[i]); + } dav1d_ref_dec(&f->ref_mvs_ref[i]); } @@ -3328,6 +3357,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { } } dav1d_decode_frame_exit(f, 
res); + res = f->task_thread.retval; f->n_tile_data = 0; return res; } diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c index f22f05f0ca..94365bce8c 100644 --- a/third_party/dav1d/src/picture.c +++ b/third_party/dav1d/src/picture.c @@ -111,15 +111,15 @@ void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_dat dav1d_free(itut_t35_ctx); } -static int picture_alloc_with_edges(Dav1dContext *const c, - Dav1dPicture *const p, - const int w, const int h, - Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref, - Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref, - const int bpc, - const Dav1dDataProps *const props, - Dav1dPicAllocator *const p_allocator, - void **const extra_ptr) +static int picture_alloc(Dav1dContext *const c, + Dav1dPicture *const p, + const int w, const int h, + Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref, + Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref, + const int bpc, + const Dav1dDataProps *const props, + Dav1dPicAllocator *const p_allocator, + void **const extra_ptr) { if (p->data[0]) { dav1d_log(c, "Picture already allocated!\n"); @@ -194,12 +194,11 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f { Dav1dThreadPicture *const p = &f->sr_cur; - const int res = - picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height, - f->seq_hdr, f->seq_hdr_ref, - f->frame_hdr, f->frame_hdr_ref, - bpc, &f->tile[0].data.m, &c->allocator, - (void **) &p->progress); + const int res = picture_alloc(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height, + f->seq_hdr, f->seq_hdr_ref, + f->frame_hdr, f->frame_hdr_ref, + bpc, &f->tile[0].data.m, &c->allocator, + (void **) &p->progress); if (res) return res; dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref, @@ -212,9 +211,10 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f c->itut_t35 = 
NULL; c->n_itut_t35 = 0; - // Don't clear these flags from c->frame_flags if the frame is not visible. + // Don't clear these flags from c->frame_flags if the frame is not going to be output. // This way they will be added to the next visible frame too. - const int flags_mask = (f->frame_hdr->show_frame || c->output_invisible_frames) + const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) && + c->max_spatial_id == f->frame_hdr->spatial_id) ? 0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO); p->flags = c->frame_flags; c->frame_flags &= flags_mask; @@ -233,11 +233,11 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con { Dav1dMemPoolBuffer *const buf = (Dav1dMemPoolBuffer *)src->ref->const_data; struct pic_ctx_context *const pic_ctx = buf->data; - const int res = picture_alloc_with_edges(c, dst, w, src->p.h, - src->seq_hdr, src->seq_hdr_ref, - src->frame_hdr, src->frame_hdr_ref, - src->p.bpc, &src->m, &pic_ctx->allocator, - NULL); + const int res = picture_alloc(c, dst, w, src->p.h, + src->seq_hdr, src->seq_hdr_ref, + src->frame_hdr, src->frame_hdr_ref, + src->p.bpc, &src->m, &pic_ctx->allocator, + NULL); if (res) return res; dav1d_picture_copy_props(dst, src->content_light, src->content_light_ref, diff --git a/third_party/dav1d/src/riscv/64/itx.S b/third_party/dav1d/src/riscv/64/itx.S index f7d907eedf..60d045150d 100644 --- a/third_party/dav1d/src/riscv/64/itx.S +++ b/third_party/dav1d/src/riscv/64/itx.S @@ -117,39 +117,50 @@ function inv_identity_e16_x4_rvv, export=1, ext=v jr t0 endfunc +.macro iwht_4 + vadd.vv v0, v0, v1 + vsub.vv v5, v2, v3 + vsub.vv v4, v0, v5 + vsra.vi v4, v4, 1 + vsub.vv v2, v4, v1 + vsub.vv v1, v4, v3 + vadd.vv v3, v5, v2 + vsub.vv v0, v0, v1 +.endm + .macro idct_4 o0, o1, o2, o3 li t1, 2896 li t2, 1567 li t3, 3784 - vwmul.vx v8, \o0, t1 - vwmul.vx v10, \o0, t1 - vwmacc.vx v8, t1, \o2 + vwmul.vx v16, \o0, t1 + vwmul.vx v18, \o0, t1 + vwmacc.vx v16, t1, \o2 neg t1, t1 
- vwmacc.vx v10, t1, \o2 + vwmacc.vx v18, t1, \o2 - vwmul.vx v12, \o1, t3 + vwmul.vx v20, \o1, t3 neg t3, t3 - vwmul.vx v14, \o1, t2 - vwmacc.vx v12, t2, \o3 - vwmacc.vx v14, t3, \o3 + vwmul.vx v22, \o1, t2 + vwmacc.vx v20, t2, \o3 + vwmacc.vx v22, t3, \o3 li t1, 2048 - vwadd.wx v8, v8, t1 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 - vwadd.wx v14, v14, t1 + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 - vnsra.wi v8, v8, 12 - vnsra.wi v10, v10, 12 - vnsra.wi v12, v12, 12 - vnsra.wi v14, v14, 12 + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 - vsadd.vv \o0, v8, v12 - vsadd.vv \o1, v10, v14 - vssub.vv \o2, v10, v14 - vssub.vv \o3, v8, v12 + vsadd.vv \o0, v16, v20 + vsadd.vv \o1, v18, v22 + vssub.vv \o2, v18, v22 + vssub.vv \o3, v16, v20 .endm .macro iadst_4 o0, o1, o2, o3 @@ -211,6 +222,45 @@ function inv_flipadst_e16_x4_rvv, export=1, ext=v jr t0 endfunc +function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v + csrw vxrm, zero + + vsetivli zero, 4, e16, mf2, ta, ma + vle16.v v0, (a2) + addi t0, a2, 8 + vle16.v v1, (t0) + addi t0, t0, 8 + vle16.v v2, (t0) + addi t0, t0, 8 + vle16.v v3, (t0) + + vsra.vi v0, v0, 2 + vsra.vi v1, v1, 2 + vsra.vi v2, v2, 2 + vsra.vi v3, v3, 2 + + iwht_4 + + vmv.v.x v4, zero + + vsseg4e16.v v0, (a2) + vle16.v v0, (a2) + vse16.v v4, (a2) + addi t0, a2, 8 + vle16.v v1, (t0) + vse16.v v4, (t0) + addi t0, t0, 8 + vle16.v v2, (t0) + vse16.v v4, (t0) + addi t0, t0, 8 + vle16.v v3, (t0) + vse16.v v4, (t0) + + iwht_4 + + j itx_4x4_end +endfunc + .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct @@ -353,7 +403,7 @@ itx_8x8_end: vwaddu.wv v6, v6, v14 vwaddu.wv v7, v7, v15 - vsetvli zero, zero, e16, m1 + vsetvli zero, zero, e16, m1, ta, ma vmax.vx v0, v0, zero vmax.vx v1, v1, zero vmax.vx v2, v2, zero @@ -410,69 +460,67 @@ function inv_identity_e16_x8_rvv, 
export=1, ext=v jr t0 endfunc -function inv_dct_e16_x8_rvv, export=1, ext=v - idct_4 v0, v2, v4, v6 +.macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7 + idct_4 \o0, \o2, \o4, \o6 li t1, 799 li t2, 4017 li t3, 3406 li t4, 2276 - vwmul.vx v14, v1, t2 + vwmul.vx v22, \o1, t2 neg t2, t2 - vwmul.vx v8, v1, t1 - vwmacc.vx v14, t1, v7 - vwmacc.vx v8, t2, v7 + vwmul.vx v16, \o1, t1 + vwmacc.vx v22, t1, \o7 + vwmacc.vx v16, t2, \o7 - vwmul.vx v12, v5, t4 + vwmul.vx v20, \o5, t4 neg t4, t4 - vwmul.vx v10, v5, t3 - vwmacc.vx v12, t3, v3 - vwmacc.vx v10, t4, v3 + vwmul.vx v18, \o5, t3 + vwmacc.vx v20, t3, \o3 + vwmacc.vx v18, t4, \o3 li t1, 2048 - vwadd.wx v8, v8, t1 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 - vwadd.wx v14, v14, t1 + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 - vnsra.wi v8, v8, 12 - vnsra.wi v10, v10, 12 - vnsra.wi v12, v12, 12 - vnsra.wi v14, v14, 12 + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 - vssub.vv v7, v14, v12 - vsadd.vv v14, v14, v12 - vssub.vv v1, v8, v10 - vsadd.vv v8, v8, v10 + vssub.vv \o7, v22, v20 + vsadd.vv v22, v22, v20 + vssub.vv \o1, v16, v18 + vsadd.vv v16, v16, v18 li t2, 2896 - vwmul.vx v10, v7, t2 - vwmul.vx v12, v7, t2 - vwmacc.vx v12, t2, v1 + vwmul.vx v18, \o7, t2 + vwmul.vx v20, \o7, t2 + vwmacc.vx v20, t2, \o1 neg t2, t2 - vwmacc.vx v10, t2, v1 + vwmacc.vx v18, t2, \o1 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 - - vnsra.wi v10, v10, 12 - vnsra.wi v12, v12, 12 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 - vssub.vv v7, v0, v14 - vsadd.vv v0, v0, v14 - vssub.vv v9, v2, v12 - vsadd.vv v1, v2, v12 - vssub.vv v5, v4, v10 - vsadd.vv v2, v4, v10 - vssub.vv v4, v6, v8 - vsadd.vv v3, v6, v8 - vmv.v.v v6, v9 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 - jr t0 -endfunc + vssub.vv \o7, \o0, v22 + vsadd.vv \o0, \o0, v22 + vssub.vv v17, \o2, v20 + vsadd.vv \o1, \o2, v20 + vssub.vv \o5, \o4, v18 + vsadd.vv \o2, \o4, v18 + vssub.vv 
\o4, \o6, v16 + vsadd.vv \o3, \o6, v16 + vmv.v.v \o6, v17 +.endm .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 li t1, 4076 @@ -598,6 +646,11 @@ endfunc vssub.vv \o7, v8, \o7 .endm +function inv_dct_e16_x8_rvv, export=1, ext=v + idct_8 v0, v1, v2, v3, v4, v5, v6, v7 + jr t0 +endfunc + function inv_adst_e16_x8_rvv, export=1, ext=v iadst_8 v0, v1, v2, v3, v4, v5, v6, v7 jr t0 @@ -660,3 +713,627 @@ def_fn_8x8 adst, identity def_fn_8x8 flipadst, identity def_fn_8x8 identity, adst def_fn_8x8 identity, flipadst + +function inv_identity_e16_x16_rvv, export=1, ext=v + li t1, 2*(5793-4096)*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vsmul.vx v16, v\i, t1 + vsadd.vv v\i, v\i, v\i + vsadd.vv v\i, v\i, v16 +.endr + jr t0 +endfunc + +function inv_dct_e16_x16_rvv, export=1, ext=v + idct_8 v0, v2, v4, v6, v8, v10, v12, v14 + + li t1, 401 + li t2, 4076 + li t3, 3166 + li t4, 2598 + + vwmul.vx v30, v1, t2 + neg t2, t2 + vwmul.vx v16, v1, t1 + vwmacc.vx v30, t1, v15 + vwmacc.vx v16, t2, v15 + + vwmul.vx v28, v9, t4 + neg t4, t4 + vwmul.vx v18, v9, t3 + vwmacc.vx v28, t3, v7 + vwmacc.vx v18, t4, v7 + + li t1, 1931 + li t2, 3612 + li t3, 3920 + li t4, 1189 + + vwmul.vx v26, v5, t2 + neg t2, t2 + vwmul.vx v20, v5, t1 + vwmacc.vx v26, t1, v11 + vwmacc.vx v20, t2, v11 + + vwmul.vx v24, v13, t4 + neg t4, t4 + vwmul.vx v22, v13, t3 + vwmacc.vx v24, t3, v3 + vwmacc.vx v22, t4, v3 + + li t1, 2048 + li t2, 2896 + li t3, 1567 + li t4, 3784 + + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + vwadd.wx v30, v30, t1 + + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 + vnsra.wi v24, v24, 12 + vnsra.wi v26, v26, 12 + vnsra.wi v28, v28, 12 + vnsra.wi v30, v30, 12 + + vssub.vv v3, v16, v18 + vsadd.vv v16, v16, v18 + vssub.vv v5, v22, v20 + vsadd.vv v22, v22, v20 + vssub.vv v11, v24, v26 + vsadd.vv v24, v24, v26 + 
vssub.vv v13, v30, v28 + vsadd.vv v30, v30, v28 + + vwmul.vx v28, v13, t4 + neg t4, t4 + vwmul.vx v18, v13, t3 + vwmul.vx v26, v11, t3 + vwmacc.vx v28, t3, v3 + neg t3, t3 + vwmul.vx v20, v11, t4 + vwmacc.vx v18, t4, v3 + vwmacc.vx v20, t3, v5 + vwmacc.vx v26, t4, v5 + + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v26, v26, 12 + vnsra.wi v28, v28, 12 + + vssub.vv v5, v18, v20 + vsadd.vv v18, v18, v20 + vssub.vv v11, v28, v26 + vsadd.vv v28, v28, v26 + + vssub.vv v7, v16, v22 + vsadd.vv v16, v16, v22 + vssub.vv v9, v30, v24 + vsadd.vv v30, v30, v24 + + vwmul.vx v20, v11, t2 + vwmul.vx v22, v9, t2 + vwmul.vx v24, v9, t2 + vwmul.vx v26, v11, t2 + vwmacc.vx v24, t2, v7 + vwmacc.vx v26, t2, v5 + neg t2, t2 + vwmacc.vx v20, t2, v5 + vwmacc.vx v22, t2, v7 + + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 + vnsra.wi v24, v24, 12 + vnsra.wi v26, v26, 12 + + vssub.vv v15, v0, v30 + vsadd.vv v0, v0, v30 + vssub.vv v17, v2, v28 + vsadd.vv v1, v2, v28 + vssub.vv v13, v4, v26 + vsadd.vv v2, v4, v26 + vssub.vv v19, v6, v24 + vsadd.vv v3, v6, v24 + vssub.vv v11, v8, v22 + vsadd.vv v4, v8, v22 + vsadd.vv v5, v10, v20 + vssub.vv v10, v10, v20 + vssub.vv v9, v12, v18 + vsadd.vv v6, v12, v18 + vssub.vv v8, v14, v16 + vsadd.vv v7, v14, v16 + vmv.v.v v14, v17 + vmv.v.v v12, v19 + + jr t0 +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + li t1, 4091 + li t2, 201 + li t3, 3973 + li t4, 995 + + vwmul.vx v16, v15, t1 + neg t1, t1 + vwmul.vx v18, v15, t2 + vwmacc.vx v16, t2, v0 + vwmacc.vx v18, t1, v0 + + vwmul.vx v20, v13, t3 + neg t3, t3 + vwmul.vx v22, v13, t4 + vwmacc.vx v20, t4, v2 + vwmacc.vx v22, t3, v2 + + li t1, 3703 + li t2, 1751 + li t3, 3290 + li t4, 2440 + + vwmul.vx v24, v11, t1 + neg t1, t1 + vwmul.vx v26, v11, t2 + 
vwmacc.vx v24, t2, v4 + vwmacc.vx v26, t1, v4 + + vwmul.vx v28, v9, t3 + neg t3, t3 + vwmul.vx v30, v9, t4 + vwmacc.vx v28, t4, v6 + vwmacc.vx v30, t3, v6 + + li t1, 2048 + + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + vwadd.wx v30, v30, t1 + + vnsra.wi v0, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v2, v20, 12 + vnsra.wi v22, v22, 12 + vnsra.wi v4, v24, 12 + vnsra.wi v26, v26, 12 + vnsra.wi v6, v28, 12 + vnsra.wi v30, v30, 12 + + li t1, 2751 + li t2, 3035 + li t3, 2106 + li t4, 3513 + + vwmul.vx v16, v7, t1 + neg t1, t1 + vwmul.vx v20, v7, t2 + vwmacc.vx v16, t2, v8 + vwmacc.vx v20, t1, v8 + + vwmul.vx v24, v5, t3 + neg t3, t3 + vwmul.vx v28, v5, t4 + vwmacc.vx v24, t4, v10 + vwmacc.vx v28, t3, v10 + + li t1, 2048 + + vwadd.wx v16, v16, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v28, v28, t1 + + vnsra.wi v16, v16, 12 + vnsra.wi v9, v20, 12 + vnsra.wi v24, v24, 12 + vnsra.wi v11, v28, 12 + + vssub.vv v8, v0, v16 + vsadd.vv v0, v0, v16 + vssub.vv v10, v2, v24 + vsadd.vv v2, v2, v24 + + li t1, 1380 + li t2, 3857 + li t3, 601 + li t4, 4052 + + vwmul.vx v16, v3, t1 + neg t1, t1 + vwmul.vx v20, v3, t2 + vwmacc.vx v16, t2, v12 + vwmacc.vx v20, t1, v12 + + vwmul.vx v24, v1, t3 + neg t3, t3 + vwmul.vx v28, v1, t4 + vwmacc.vx v24, t4, v14 + vwmacc.vx v28, t3, v14 + + li t1, 2048 + + vwadd.wx v16, v16, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v28, v28, t1 + + vnsra.wi v16, v16, 12 + vnsra.wi v13, v20, 12 + vnsra.wi v24, v24, 12 + vnsra.wi v15, v28, 12 + + vssub.vv v12, v4, v16 + vsadd.vv v16, v4, v16 + vssub.vv v14, v6, v24 + vsadd.vv v20, v6, v24 + + vsadd.vv v1, v18, v9 + vssub.vv v9, v18, v9 + vsadd.vv v3, v22, v11 + vssub.vv v11, v22, v11 + vsadd.vv v18, v26, v13 + vssub.vv v13, v26, v13 + vsadd.vv v22, v30, v15 + vssub.vv v15, v30, v15 + + vssub.vv v4, v0, v16 + vsadd.vv v0, v0, v16 + vssub.vv v5, v1, v18 + 
vsadd.vv v1, v1, v18 + vssub.vv v6, v2, v20 + vsadd.vv v2, v2, v20 + vssub.vv v7, v3, v22 + vsadd.vv v3, v3, v22 + + li t1, 799 + li t2, 4017 + li t3, 3406 + li t4, 2276 + + vwmul.vx v16, v8, t2 + vwmul.vx v18, v8, t1 + vwmul.vx v20, v10, t4 + vwmul.vx v22, v10, t3 + vwmul.vx v24, v13, t2 + vwmul.vx v26, v13, t1 + vwmul.vx v28, v15, t4 + vwmul.vx v30, v15, t3 + vwmacc.vx v16, t1, v9 + neg t1, t1 + vwmacc.vx v20, t3, v11 + neg t3, t3 + vwmacc.vx v26, t2, v12 + neg t2, t2 + vwmacc.vx v30, t4, v14 + neg t4, t4 + vwmacc.vx v18, t2, v9 + vwmacc.vx v22, t4, v11 + vwmacc.vx v24, t1, v12 + vwmacc.vx v28, t3, v14 + + li t1, 2048 + li t2, 2896 + li t3, 1567 + li t4, 3784 + + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + vwadd.wx v30, v30, t1 + + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 + vnsra.wi v24, v24, 12 + vnsra.wi v26, v26, 12 + vnsra.wi v28, v28, 12 + vnsra.wi v30, v30, 12 + + vsadd.vv v8, v16, v24 + vsadd.vv v9, v18, v26 + vsadd.vv v10, v20, v28 + vsadd.vv v11, v22, v30 + vssub.vv v12, v16, v24 + vssub.vv v13, v18, v26 + vssub.vv v14, v20, v28 + vssub.vv v15, v22, v30 + + vwmul.vx v16, v4, t4 + vwmul.vx v18, v4, t3 + vwmul.vx v20, v7, t4 + vwmul.vx v22, v7, t3 + vwmul.vx v24, v12, t4 + vwmul.vx v26, v12, t3 + vwmul.vx v28, v15, t4 + vwmul.vx v30, v15, t3 + vwmacc.vx v16, t3, v5 + vwmacc.vx v22, t4, v6 + vwmacc.vx v24, t3, v13 + neg t3, t3 + vwmacc.vx v30, t4, v14 + neg t4, t4 + vwmacc.vx v20, t3, v6 + vwmacc.vx v28, t3, v14 + vwmacc.vx v18, t4, v5 + vwmacc.vx v26, t4, v13 + + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + vwadd.wx v30, v30, t1 + + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 + vnsra.wi v24, v24, 12 + vnsra.wi 
v26, v26, 12 + vnsra.wi v28, v28, 12 + vnsra.wi v30, v30, 12 + +.ifc \o0, v0 + vsadd.vv \o14, v9, v11 + vssub.vv v11, v9, v11 + vssub.vv v9, v1, v3 + vsadd.vv \o15, v1, v3 + vsadd.vv \o1, v8, v10 + vssub.vv v10, v8, v10 + vssub.vv v8, v0, v2 + vsadd.vv \o0, v0, v2 +.else + vsadd.vv \o1, v8, v10 + vssub.vv v10, v8, v10 + vssub.vv v8, v0, v2 + vsadd.vv \o0, v0, v2 + vsadd.vv v2, v9, v11 + vssub.vv v11, v9, v11 + vssub.vv v9, v1, v3 + vsadd.vv \o15, v1, v3 + vmv.v.v \o14, v2 +.endif + + vsadd.vv \o3, v16, v20 + vssub.vv v6, v16, v20 + vsadd.vv \o12, v18, v22 + vssub.vv v7, v18, v22 + vsadd.vv \o2, v24, v28 + vssub.vv v24, v24, v28 + vsadd.vv \o13, v26, v30 + vssub.vv v26, v26, v30 + + neg t3, t2 + + vwmul.vx v28, v24, t2 + vwmul.vx v30, v24, t2 + vwmacc.vx v28, t2, v26 + vwmacc.vx v30, t3, v26 + + vwmul.vx v24, v10, t2 + vwmul.vx v26, v10, t2 + vwmacc.vx v24, t2, v11 + vwmacc.vx v26, t3, v11 + + vwmul.vx v20, v6, t2 + vwmul.vx v22, v6, t2 + vwmacc.vx v20, t2, v7 + vwmacc.vx v22, t3, v7 + + vwmul.vx v16, v8, t2 + vwmul.vx v18, v8, t2 + vwmacc.vx v16, t2, v9 + vwmacc.vx v18, t3, v9 + + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + vwadd.wx v30, v30, t1 + + vnsra.wi \o7, v16, 12 + vnsra.wi \o8, v18, 12 + vnsra.wi \o4, v20, 12 + vnsra.wi \o11, v22, 12 + vnsra.wi \o6, v24, 12 + vnsra.wi \o9, v26, 12 + vnsra.wi \o5, v28, 12 + vnsra.wi \o10, v30, 12 + + vmv.v.x v16, zero + vssub.vv \o1, v16, \o1 + vssub.vv \o3, v16, \o3 + vssub.vv \o5, v16, \o5 + vssub.vv \o7, v16, \o7 + vssub.vv \o9, v16, \o9 + vssub.vv \o11, v16, \o11 + vssub.vv \o13, v16, \o13 + vssub.vv \o15, v16, \o15 +.endm + +function inv_adst_e16_x16_rvv, export=1, ext=v + iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 + jr t0 +endfunc + +function inv_flipadst_e16_x16_rvv, export=1, ext=v + iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 
+ jr t0 +endfunc + +.macro def_horz_16 variant +function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v + vmv.v.x v16, zero +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vle16.v v\i, (t4) + vse16.v v16, (t4) + add t4, t4, t6 +.endr +.ifc \variant, _identity + li t1, 2*(5793-4096)*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vsmul.vx v16, v\i, t1 + vsra.vi v16, v16, 1 + vaadd.vv v\i, v\i, v16 +.endr +.else + jalr t0, a4 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vssra.vi v\i, v\i, 2 +.endr +.endif +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vsse16.v v\i, (t5), t6 + addi t5, t5, 2 +.endr + jr a7 +endfunc +.endm + +def_horz_16 +def_horz_16 _identity + +function inv_txfm_add_vert_8x16_rvv, export=1, ext=v + vsetivli zero, 8, e16, m1, ta, ma +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vle16.v v\i, (t4) + add t4, t4, t6 +.endr + jalr t0, a5 + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vssra.vi v\i, v\i, 4 +.endr + + vsetivli zero, 8, e8, mf2, ta, ma + mv t0, t5 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vle8.v v\i, (t0) + add t0, t0, a1 +.endr + + vwaddu.wv v0, v0, v16 + vwaddu.wv v1, v1, v17 + vwaddu.wv v2, v2, v18 + vwaddu.wv v3, v3, v19 + vwaddu.wv v4, v4, v20 + vwaddu.wv v5, v5, v21 + vwaddu.wv v6, v6, v22 + vwaddu.wv v7, v7, v23 + vwaddu.wv v8, v8, v24 + vwaddu.wv v9, v9, v25 + vwaddu.wv v10, v10, v26 + vwaddu.wv v11, v11, v27 + vwaddu.wv v12, v12, v28 + vwaddu.wv v13, v13, v29 + vwaddu.wv v14, v14, v30 + vwaddu.wv v15, v15, v31 + + vsetvli zero, zero, e16, m1, ta, ma +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vmax.vx v\i, v\i, zero +.endr + + vsetvli zero, zero, e8, mf2, ta, ma + vnclipu.wi v16, v0, 0 + vnclipu.wi v17, v1, 0 + vnclipu.wi v18, v2, 0 + vnclipu.wi v19, v3, 0 + vnclipu.wi v20, v4, 0 + vnclipu.wi v21, v5, 0 + vnclipu.wi v22, v6, 0 + vnclipu.wi v23, v7, 0 + vnclipu.wi v24, v8, 0 + 
vnclipu.wi v25, v9, 0 + vnclipu.wi v26, v10, 0 + vnclipu.wi v27, v11, 0 + vnclipu.wi v28, v12, 0 + vnclipu.wi v29, v13, 0 + vnclipu.wi v30, v14, 0 + vnclipu.wi v31, v15, 0 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vse8.v v\i, (t5) + add t5, t5, a1 +.endr + + jr a7 +endfunc + +function inv_txfm_add_16x16_rvv, export=1, ext=v + csrw vxrm, zero + vsetivli zero, 8, e16, m1, ta, ma + addi sp, sp, -16*32 +.irp i, 0, 8 + addi t4, a2, \i*2 + addi t5, sp, \i*16*2 + li t6, 16*2 + jalr a7, a6 +.endr +.irp i, 0, 8 + addi t4, sp, \i*2 + addi t5, a0, \i + li t6, 16*2 + jal a7, inv_txfm_add_vert_8x16_rvv +.endr + addi sp, sp, 16*32 + ret +endfunc + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v +.ifc \txfm1, identity + la a6, inv_txfm_horz_identity_16x8_rvv +.else + la a6, inv_txfm_horz_16x8_rvv + la a4, inv_\txfm1\()_e16_x16_rvv +.endif + la a5, inv_\txfm2\()_e16_x16_rvv + j inv_txfm_add_16x16_rvv +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct diff --git a/third_party/dav1d/src/riscv/itx.h b/third_party/dav1d/src/riscv/itx.h index bed215471b..28c5e54d42 100644 --- a/third_party/dav1d/src/riscv/itx.h +++ b/third_party/dav1d/src/riscv/itx.h @@ -58,7 +58,8 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) #define decl_itx_fns(ext) \ decl_itx17_fns( 4, 4, ext); \ -decl_itx16_fns( 8, 8, ext) +decl_itx16_fns( 8, 8, ext); \ +decl_itx16_fns(16, 16, ext) decl_itx_fns(rvv); @@ -103,7 +104,8 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return; #if BITDEPTH == 8 - assign_itx16_fn( , 4, 4, rvv); + 
assign_itx17_fn( , 4, 4, rvv); assign_itx16_fn( , 8, 8, rvv); + assign_itx12_fn( , 16, 16, rvv); #endif } diff --git a/third_party/dav1d/src/x86/ipred.h b/third_party/dav1d/src/x86/ipred.h index f5f187e53d..57aff0f38c 100644 --- a/third_party/dav1d/src/x86/ipred.h +++ b/third_party/dav1d/src/x86/ipred.h @@ -144,6 +144,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl); init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl); init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl); + init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl); init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl); init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl); diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm index 8124a3b145..69802614c7 100644 --- a/third_party/dav1d/src/x86/ipred16_avx512.asm +++ b/third_party/dav1d/src/x86/ipred16_avx512.asm @@ -79,14 +79,17 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 z_filter_k: dw 8, 8, 6, 6, 4, 4 dw 4, 4, 5, 5, 4, 4 dw 0, 0, 0, 0, 2, 2 +pb_90: times 4 db 90 pw_15: times 2 dw 15 pw_16: times 2 dw 16 pw_17: times 2 dw 17 pw_24: times 2 dw 24 +pw_31: times 2 dw 31 pw_32: times 2 dw 32 pw_63: times 2 dw 63 pw_64: times 2 dw 64 pw_512: times 2 dw 512 +pw_2048: times 2 dw 2048 pw_31806: times 2 dw 31806 pw_32640: times 2 dw 32640 pw_32672: times 2 dw 32672 @@ -114,6 +117,7 @@ JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 @@ -1174,6 +1178,612 @@ cglobal ipred_z1_16bpc, 3, 8, 16, 
dst, stride, tl, w, h, angle, dx mov rsp, r7 RET +cglobal ipred_z2_16bpc, 3, 9, 16, dst, stride, tl, w, h, angle, dx, _, dy + tzcnt wd, wm + movifnidn angled, anglem + lea dxq, [dr_intra_derivative-90] + movzx dyd, angleb + xor angled, 0x400 + mov r7, dxq + sub dxq, dyq + movifnidn hd, hm + and dyd, ~1 + vpbroadcastw m12, [tlq] + and dxq, ~1 + movzx dyd, word [r7+dyq] ; angle - 90 + lea r7, [z_filter_t0] + movzx dxd, word [dxq+270] ; 180 - angle + mova m0, [base+pw_31to0] + movsxd wq, [base+ipred_z2_16bpc_avx512icl_table+wq*4] + movu m4, [tlq+2] + neg dyd + vpermw m7, m0, [tlq-64*1] + lea wq, [base+ipred_z2_16bpc_avx512icl_table+wq] + vpbroadcastd m14, [base+pw_31806] + vpbroadcastd m15, [base+pw_1] + jmp wq +.w4: + movq xm3, [tlq] + vpbroadcastq m8, [base+pw_1to32] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + pshuflw xm0, xm4, q3321 + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + call .upsample_above + punpcklwd xm4, xm3, xm4 + palignr xm3, xm4, xm12, 14 + jmp .w4_main +.w4_upsample_left: + call .upsample_left + movsldup m1, [base+z_xpos_mul] + paddw m1, m1 + jmp .w4_main2 +.w4_no_upsample_above: + lea r3d, [hq+3] + vpbroadcastd ym0, [base+pw_3] + sub angled, 1112 ; angle - 90 + call .filter_above2 + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + palignr xm3, xm4, xm12, 14 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + call .filter_left16 +.w4_main: + movsldup m1, [base+z_xpos_mul] + psllw m15, 3 +.w4_main2: + vpbroadcastq m0, [base+pw_1to32] + vpbroadcastw m11, dxd + movsldup m2, [base+z_xpos_mul] + vpbroadcastw m13, dyd + vpbroadcastd m5, [tlq-2] + psllw m10, m8, 6 + valignq m5, m7, m5, 6 + pmullw m2, m11 + psubw m10, m2 ; xpos + pmullw m13, m0 ; ypos + palignr m5, m7, m5, 14 + psrlw m12, m13, 6 + psllw m13, 9 + paddw m12, m1 ; base_y + pand 
m13, m14 ; frac_y << 9 + psllw m11, 3 + lea r5, [strideq*3] +.w4_loop: + psrlw m1, m10, 6 ; base_x + pand m2, m14, m10 ; frac + vpermw m0, m1, m3 ; top[base_x] + vpermw m1, m1, m4 ; top[base_x+1] + vpmovw2m k1, m10 ; base_x < 0 + psllw m2, 9 + vpermw m0{k1}, m12, m5 ; left[base_y] + vpermw m1{k1}, m12, m7 ; left[base_y+1] + vmovdqu16 m2{k1}, m13 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + sub hd, 8 + jl .w4_end + vextracti32x8 ym0, m0, 1 + psubw m10, m11 ; base_x -= dx + lea dstq, [dstq+strideq*4] + paddw m12, m15 ; base_y++ + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample_above: ; w4/w8 + mova ym9, [base+pw_1to32] + palignr xm1, xm4, xm12, 12 + paddw xm3, xm4 ; b+c + xor angled, 0x7f ; 180 - angle + paddw xm0, xm1 ; a+d + vpbroadcastw xm1, r9m ; pixel_max + vpbroadcastb xm11, r3d + psubw xm0, xm3, xm0 + vpbroadcastb xm2, angled + psraw xm0, 3 + shr angled, 8 + paddw xm3, xm0 + pxor xm0, xm0 + vpcmpeqb k2, xm11, [base+z_filter_wh] + pmaxsw xm3, xm0 + add dxd, dxd + pavgw xm3, xm0 + vpcmpgtb k2{k2}, xm2, [base+z_filter_t0+angleq*8] + pminsw xm3, xm1 + paddw m8, m8 + jmp .filter_left16b +.upsample_left: ; h4/h8 + lea r3d, [hq-1] + palignr xm2, xm7, xm12, 14 + vpbroadcastw xm0, r3d + palignr xm1, xm7, xm12, 12 + pminuw xm0, xm9 + paddw xm2, xm7 ; b+c + vpermw xm0, xm0, xm7 + add dyd, dyd + paddw xm0, xm1 ; a+d + vpbroadcastw xm1, r9m ; pixel_max + psubw xm0, xm2, xm0 + psraw xm0, 3 + paddw xm2, xm0 + pxor xm0, xm0 + pmaxsw xm2, xm0 + pavgw xm2, xm0 + pminsw xm2, xm1 + punpckhwd xm0, xm2, xm7 + punpcklwd xm7, xm2, xm7 + vinserti32x4 ym7, xm0, 1 + ret +.filter_above: + sub angled, 90 +.filter_above2: + vpbroadcastb ym1, r3d + vpbroadcastb ym10, angled + mov r3d, 
angled
+    shr             r3d, 8
+    vpcmpeqb        k2, ym1, [base+z_filter_wh]
+    mova            xm11, [base+z_filter_t0+r3*8]
+    vpcmpgtb        k1{k2}, ym10, ym11
+    mova            m9, [base+pw_1to32]
+    kmovd           r3d, k1
+    test            r3d, r3d
+    jz .filter_end ; filter strength 0: leave the edge untouched
+    pminuw          ym0, ym9
+    popcnt          r3d, r3d ; popcount selects the z_filter_k kernel row
+    vpbroadcastd    ym6, r7m ; max_w
+    kxnorw          k1, k1, k1
+    vpbroadcastd    ym5, [base+z_filter_k+(r3-1)*4+12*0]
+    kaddw           k1, k1, k1 ; ~1
+    vpbroadcastd    ym13, [base+z_filter_k+(r3-1)*4+12*1]
+    vpermw          ym2, ym0, ym4 ; +1
+    pmullw          ym5, ym4
+    paddw           ym1, ym2, ym3
+    vmovdqu16       m3{k1}, [tlq-2] ; -2
+    vpermw          ym2, ym0, ym2 ; +2
+    vpbroadcastd    ym0, [base+z_filter_k+(r3-1)*4+12*2]
+    pmullw          ym1, ym13
+    movu            m13, [base+pw_0to31]
+    paddw           ym2, ym3
+    packssdw        ym6, ym6
+    pmullw          ym2, ym0
+    paddw           ym1, ym5
+    vpcmpgtw        k1, ym6, ym13 ; only overwrite samples with x < max_w
+    paddw           ym1, ym2
+    pxor            ym2, ym2
+    psrlw           ym1, 3
+    pavgw           ym4{k1}, ym1, ym2
+.filter_end:
+    ret
+; equivalent smoothing for the left edge (<= 16 samples in ym7)
+.filter_left16:
+    vpbroadcastd    ym1, [base+pb_90]
+    psubb           ym1, ym10
+    vpcmpgtb        k2{k2}, ym1, ym11
+.filter_left16b:
+    kmovd           r3d, k2
+    test            r3d, r3d
+    jz .filter_end
+    lea             r5d, [hq-1]
+    vinserti32x4    ym0, ym12, xm7, 1
+    vpbroadcastw    ym1, r5d
+    popcnt          r3d, r3d
+    vpbroadcastd    ym6, r8m ; max_h
+    pminuw          ym9, ym1
+    vpbroadcastd    ym5, [base+z_filter_k+(r3-1)*4+12*0]
+    vpermw          ym2, ym9, ym7 ; +1
+    vpbroadcastd    ym10, [base+z_filter_k+(r3-1)*4+12*1]
+    palignr         ym1, ym7, ym0, 14 ; -1
+    pmullw          ym5, ym7
+    palignr         ym0, ym7, ym0, 12 ; -2
+    paddw           ym1, ym2
+    vpermw          ym2, ym9, ym2 ; +2
+    vpbroadcastd    ym9, [base+z_filter_k+(r3-1)*4+12*2]
+    pmullw          ym1, ym10
+    paddw           ym2, ym0
+    packssdw        ym6, ym6
+    pmullw          ym2, ym9
+    paddw           ym1, ym5
+    vpcmpgtw        k1, ym6, [base+pw_0to31] ; only overwrite y < max_h
+    paddw           ym1, ym2
+    pxor            ym2, ym2
+    psrlw           ym1, 3
+    pavgw           ym7{k1}, ym1, ym2
+    ret
+; left-edge smoothing dispatcher for taller blocks
+.filter_left:
+    cmp             hd, 32
+    jl .filter_left16
+    vpbroadcastd    m5, [base+pw_3]
+    pminud          m0, m9, [base+pw_31] {1to16}
+.filter_left32: ; smooth 32 left samples held in m7
+    vpbroadcastd    m6, r8m ; max_h
+    valignq         m2, m7, m12, 6
+    packssdw        m6, m6
+    palignr         m1, m7, m2, 14 ; -1
+    paddw           m1, m7
+    palignr         m2, m7, m2, 12 ; -2
+    vpcmpgtw        k1, m6, m13
+    paddw           m2, m5
+    cmp             hd, 64
+    je .filter_left64
+    lea             r3d, [hq-1]
+    vpbroadcastw    m10, r3d
+    pminuw          m0, m10 ; clamp permute indices to h-1
+    vpermw          m10, m0, m7 ; +1
+    paddw           m1, m10
+    vpermw          m10, m0, m10 ; +2
+    pavgw           m2, m10
+    paddw           m1, m2
+    vpsrlw          m7{k1}, m1, 2
+    ret
+.filter_left64: ; h == 64: left edge spans m7 and m8 (second 32 samples)
+    valignq         m10, m8, m7, 2
+    vpaddd          m13, [base+pw_32] {1to16}
+    palignr         m11, m10, m7, 2 ; +1
+    paddw           m1, m11
+    palignr         m11, m10, m7, 4 ; +2
+    valignq         m10, m8, m7, 6
+    pavgw           m11, m2
+    vpermw          m2, m0, m8 ; 32+1
+    paddw           m1, m11
+    vpsrlw          m7{k1}, m1, 2
+    palignr         m1, m8, m10, 14 ; 32-1
+    paddw           m1, m8
+    palignr         m10, m8, m10, 12 ; 32-2
+    paddw           m1, m2
+    vpermw          m2, m0, m2 ; 32+2
+    paddw           m10, m5
+    vpcmpgtw        k1, m6, m13
+    pavgw           m2, m10
+    paddw           m1, m2
+    vpsrlw          m8{k1}, m1, 2
+    ret
+.w8:
+    mova            xm3, [tlq]
+    vbroadcasti32x4 m8, [base+pw_1to32]
+    test            angled, 0x400
+    jnz .w8_main ; !enable_intra_edge_filter
+    lea             r3d, [angleq+126]
+    mov             r3b, hb
+    cmp             r3d, 8
+    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+    psrldq          xm0, xm4, 2
+    sub             angled, 53
+    pshufhw         xm0, xm0, q2210
+    lea             r3d, [hq+7]
+    call .upsample_above
+    punpcklwd       xm0, xm3, xm4
+    punpckhwd       xm4, xm3, xm4
+    vinserti32x4    ym3, ym12, xm0, 1
+    vinserti32x4    ym4, ym0, xm4, 1
+    palignr         ym3, ym4, ym3, 14
+    jmp .w8_main
+.w8_upsample_left:
+    call .upsample_left
+    movshdup        m1, [base+z_xpos_mul]
+    psllw           m15, 3
+    paddw           m1, m1 ; step *= 2 (left edge was 2x upsampled)
+    jmp .w8_main2
+.w8_no_upsample_above:
+    lea             r3d, [hq+7]
+    vpbroadcastd    ym0, [base+pw_7]
+    call .filter_above
+    lea             r3d, [angleq-51]
+    mov             r3b, hb
+    palignr         xm3, xm4, xm12, 14
+    cmp             r3d, 8
+    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+    call .filter_left
+.w8_main:
+    movshdup        m1, [base+z_xpos_mul]
+    psllw           m15, 2
+.w8_main2:
+    vbroadcasti32x4 m0, [base+pw_1to32]
+    vpbroadcastw    m11, dxd
+    movshdup        m2, [base+z_xpos_mul]
+    vpbroadcastw    m13, dyd
+    psllw           m10, m8, 6
+    valignq         m5, m7, m12, 6
+    pmullw          m2, m11
+    psubw           m10, m2 ; xpos
+    pmullw          m13, m0 ; ypos
+    palignr         m5, m7, m5, 14
+    psrlw           m12, m13, 6
+    psllw           m13, 9
+    mov             r2d, 1<<6
+    paddw           m12, m1 ; base_y
+    lea             r3d, [dxq-(8<<6)] ; left-only threshold
+    pand            m13,
m14 ; frac_y << 9
+    shl             dxd, 2
+    psllw           m11, 2
+    lea             r5, [strideq*3]
+.w8_loop: ; 4 output rows per iteration
+    psrlw           m1, m10, 6
+    pand            m2, m14, m10
+    vpermw          m0, m1, m3
+    vpermw          m1, m1, m4
+    psllw           m2, 9
+    sub             r2d, dxd
+    jge .w8_toponly ; all lanes still sample the top edge
+    vpmovw2m        k1, m10
+    vpermw          m0{k1}, m12, m5
+    vpermw          m1{k1}, m12, m7
+    vmovdqu16       m2{k1}, m13
+.w8_toponly:
+    psubw           m1, m0
+    pmulhrsw        m1, m2
+    paddw           m0, m1
+    mova            [dstq+strideq*0], xm0
+    vextracti32x4   [dstq+strideq*1], ym0, 1
+    vextracti32x4   [dstq+strideq*2], m0, 2
+    vextracti32x4   [dstq+r5       ], m0, 3
+    sub             hd, 4
+    jz .w8_end
+    psubw           m10, m11 ; base_x -= dx
+    lea             dstq, [dstq+strideq*4]
+    paddw           m12, m15 ; base_y++
+    cmp             r2d, r3d
+    jge .w8_loop
+.w8_leftonly_loop: ; remaining rows sample the left edge only
+    vpermw          m0, m12, m5
+    vpermw          m1, m12, m7
+    psubw           m1, m0
+    pmulhrsw        m1, m13
+    paddw           m12, m15
+    paddw           m0, m1
+    mova            [dstq+strideq*0], xm0
+    vextracti32x4   [dstq+strideq*1], ym0, 1
+    vextracti32x4   [dstq+strideq*2], m0, 2
+    vextracti32x4   [dstq+r5       ], m0, 3
+    lea             dstq, [dstq+strideq*4]
+    sub             hd, 4
+    jg .w8_leftonly_loop
+.w8_end:
+    RET
+.w16:
+    mova            ym3, [tlq]
+    vpermw          m8, m0, [tlq-64*2] ; farther left samples, reversed
+    test            angled, 0x400
+    jnz .w16_main ; !enable_intra_edge_filter
+    lea             r3d, [hq+15]
+    vpbroadcastd    ym0, [base+pw_15]
+    call .filter_above
+    call .filter_left
+    vinserti32x4    ym3, ym12, xm4, 1
+    palignr         ym3, ym4, ym3, 14
+.w16_main:
+    vbroadcasti32x8 m0, [base+pw_1to32]
+    vpbroadcastw    m11, dxd
+    vpbroadcastw    m13, dyd
+    kxnorw          k2, k2, k2
+    psllw           m10, m0, 6
+    valignq         m5, m7, m12, 6
+    psubw           m10, m11 ; xpos
+    valignq         m6, m8, m7, 6
+    pmullw          m13, m0 ; ypos
+    knotd           k1, k2
+    palignr         m5, m7, m5, 14
+    palignr         m6, m8, m6, 14
+    vpsubw          m10{k1}, m11
+    psrlw           m12, m13, 6
+    psllw           m13, 9
+    mov             r2d, 1<<6
+    vpsubw          m12{k2}, m15 ; base_y
+    pand            m13, m14 ; frac_y << 9
+    lea             r3d, [dxq-(16<<6)]
+    paddw           m11, m11
+    add             dxd, dxd
+    paddw           m15, m15
+.w16_loop: ; 2 output rows per iteration
+    psrlw           m1, m10, 6
+    pand            m2, m14, m10
+    vpermw          m0, m1, m3
+    vpermw          m1, m1, m4
+    psllw           m2, 9
+    psubw           m1, m0
+    pmulhrsw        m1, m2
+    paddw           m12, m15 ; base_y++
+    paddw           m0, m1
+    sub             r2d, dxd
+    jge .w16_toponly
+    mova            m1, m5
+    vpermt2w        m1, m12,
m6
+    mova            m2, m7
+    vpermt2w        m2, m12, m8
+    vpmovw2m        k1, m10 ; lanes with base_x < 0 take the left edge
+    psubw           m2, m1
+    pmulhrsw        m2, m13
+    vpaddw          m0{k1}, m1, m2
+.w16_toponly:
+    mova            [dstq+strideq*0], ym0
+    vextracti32x8   [dstq+strideq*1], m0, 1
+    sub             hd, 2
+    jz .w16_end
+    psubw           m10, m11 ; base_x -= dx
+    lea             dstq, [dstq+strideq*2]
+    cmp             r2d, r3d
+    jge .w16_loop
+    paddw           m12, m15
+    vpermt2w        m5, m12, m6
+    mova            m1, m7
+    vpermt2w        m1, m12, m8
+    jmp .w16_leftonly_loop_start
+.w16_leftonly_loop: ; remaining rows sample the left edge only
+    mova            m1, m7
+    vpermt2w        m1, m12, m8
+    vshufi32x4      m5, m1, q1032
+.w16_leftonly_loop_start:
+    psubw           m0, m1, m5
+    pmulhrsw        m0, m13
+    paddw           m12, m15
+    paddw           m0, m5
+    mova            m5, m1
+    mova            [dstq+strideq*0], ym0
+    vextracti32x8   [dstq+strideq*1], m0, 1
+    lea             dstq, [dstq+strideq*2]
+    sub             hd, 2
+    jg .w16_leftonly_loop
+.w16_end:
+    RET
+.w32:
+    mova            m3, [tlq]
+    vpermw          m8, m0, [tlq-64*2] ; farther left samples, reversed
+    mova            m9, [base+pw_1to32]
+    test            angled, 0x400
+    jnz .w32_main ; !enable_intra_edge_filter
+    pminud          m0, m9, [base+pw_31] {1to16}
+    mov             r3d, ~1
+    kmovd           k1, r3d
+    vpbroadcastd    m5, [base+pw_3]
+    vpbroadcastd    m6, r6m ; max_w
+    vpermw          m2, m0, m4 ; +1
+    movu            m13, [base+pw_0to31]
+    paddw           m1, m4, m3
+    vmovdqu16       m3{k1}, [tlq-2] ; -2
+    packssdw        m6, m6
+    paddw           m1, m2
+    vpermw          m2, m0, m2 ; +2
+    paddw           m3, m5
+    vpcmpgtw        k1, m6, m13
+    pavgw           m2, m3
+    paddw           m1, m2
+    psrlw           m4{k1}, m1, 2
+    call .filter_left32
+.w32_main:
+    sub             rsp, 64*2 ; stack scratch for the top edge samples
+    call .w32_main1
+    add             rsp, 64*2
+    RET
+.w32_main1: ; shared by w32 and w64 (w64 runs it once per 32-wide half)
+    vpbroadcastw    m11, dxd
+    movu            [rsp+64], m4 ; spill top edge so the loop can movu it
+    vpbroadcastw    m4, dyd
+    movd            [rsp+60], xm12
+    valignq         m5, m7, m12, 6
+    psllw           m3, m9, 6 ; xpos
+    valignq         m6, m8, m7, 6
+    pmullw          m9, m4 ; ypos
+    palignr         m5, m7, m5, 14
+    mov             r2d, 33<<6
+    palignr         m6, m8, m6, 14
+    mova            m10, m3
+.w32_main2:
+    psllw           m13, m9, 9
+    sub             r2d, dxd
+    psrlw           m12, m9, 6 ; base_y
+    mov             r8d, hd
+    pand            m13, m14 ; frac_y << 9
+.w32_loop: ; per-row loop; top samples are fetched from the stack copy
+    mov             r3d, r2d
+    shr             r3d, 6
+    psubw           m10, m11 ; base_x -= dx
+    movu            m0, [rsp+r3*2-2]
+    pand            m2, m10, m14 ; frac_x
+    movu            m1, [rsp+r3*2]
+    psllw           m2, 9
+    psubw           m1, m0
+    pmulhrsw        m1, m2
+    paddw           m12, m15 ; base_y++
+    paddw           m0, m1
+    cmp             r2d, 32<<6
+    jge .w32_toponly
+    mova            m1, m5
+    vpermt2w        m1, m12, m6
+    mova            m2, m7
+    vpermt2w        m2, m12, m8
+    vpmovw2m        k1, m10 ; lanes with base_x < 0 take the left edge
+    psubw           m2, m1
+    pmulhrsw        m2, m13
+    vpaddw          m0{k1}, m1, m2
+.w32_toponly:
+    mova            [dstq], m0
+    dec             r8d
+    jz .w32_end
+    add             dstq, strideq
+    sub             r2d, dxd
+    jge .w32_loop
+    paddw           m12, m15
+    mova            m2, m5
+    vpermt2w        m2, m12, m6
+.w32_leftonly_loop: ; remaining rows sample the left edge only
+    mova            m1, m7
+    vpermt2w        m1, m12, m8
+    psubw           m0, m1, m2
+    pmulhrsw        m0, m13
+    paddw           m12, m15
+    paddw           m0, m2
+    mova            m2, m1
+    mova            [dstq], m0
+    add             dstq, strideq
+    dec             r8d
+    jg .w32_leftonly_loop
+.w32_end:
+    ret
+.w64:
+    movu            m3, [tlq+66] ; top edge samples 32..63
+    vpermw          m8, m0, [tlq-64*2]
+    mova            m9, [base+pw_1to32]
+    test            angled, 0x400
+    jnz .w64_main ; !enable_intra_edge_filter
+    mova            m2, [tlq] ; -1
+    mov             r3d, ~1
+    vpbroadcastd    m5, [base+pw_3]
+    kmovd           k1, r3d
+    movu            m13, [base+pw_0to31]
+    vpbroadcastd    m6, r6m ; max_w
+    pminud          m0, m9, [base+pw_31] {1to16}
+    paddw           m1, m4, m2
+    vmovdqu16       m2{k1}, [tlq-2] ; -2
+    packssdw        m6, m6
+    paddw           m1, [tlq+4] ; +1
+    paddw           m2, m5
+    vpcmpgtw        k1, m6, m13
+    pavgw           m2, [tlq+6] ; +2
+    paddw           m1, m2
+    vpermw          m2, m0, m3 ; 32+1
+    psrlw           m4{k1}, m1, 2
+    paddw           m1, m3, [tlq+64] ; 32-1
+    vpaddd          m11, m13, [base+pw_32] {1to16}
+    paddw           m1, m2
+    vpermw          m2, m0, m2 ; 32+2
+    paddw           m10, m5, [tlq+62] ; 32-2
+    vpcmpgtw        k1, m6, m11
+    pavgw           m2, m10
+    paddw           m1, m2
+    psrlw           m3{k1}, m1, 2
+    call .filter_left32
+.w64_main: ; run the 32-wide pipeline twice: left half, then right half
+    sub             rsp, 64*3
+    movu            [rsp+64*2-gprsize], m3
+    mov             r5, dstq
+    call .w32_main1
+    psllw           m4, 5
+    mov             r2d, 65<<6
+    vpaddd          m10, m3, [base+pw_2048] {1to16} ; xpos
+    lea             dstq, [r5+64]
+    paddw           m9, m4 ; ypos
+    call .w32_main2
+    add             rsp, 64*3
+    RET
+
 cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
     lea             r7, [z_filter_t0]
     tzcnt           wd, wm
|