path: root/third_party/dav1d/src/thread_task.c
Diffstat
-rw-r--r--  third_party/dav1d/src/thread_task.c  936
1 file changed, 936 insertions, 0 deletions
diff --git a/third_party/dav1d/src/thread_task.c b/third_party/dav1d/src/thread_task.c
new file mode 100644
index 0000000000..bfedf6e5bb
--- /dev/null
+++ b/third_party/dav1d/src/thread_task.c
@@ -0,0 +1,936 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/frame.h"
+
+#include "src/thread_task.h"
+#include "src/fg_apply.h"
+
+// This function resets the cur pointer to the first frame that could
+// theoretically be executed after a task completes (i.e. each time we update
+// some progress or insert tasks into the queue).
+// When frame_idx is set, it comes either from a completed task or from tasks
+// inserted into the queue, in which case we have to make sure the cur pointer
+// isn't past this insert.
+// The special case where frame_idx is UINT_MAX handles the reset after
+// completing a task and locklessly signaling progress. In that case we don't
+// enter the critical section this function requires, so we store the index in
+// an atomic for delayed handling, which happens here. That means this function
+// can be called without any actual update other than what's in the atomic,
+// hence the special case.
+static inline int reset_task_cur(const Dav1dContext *const c,
+ struct TaskThreadData *const ttd,
+ unsigned frame_idx)
+{
+ const unsigned first = atomic_load(&ttd->first);
+ unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
+ if (reset_frame_idx < first) {
+ if (frame_idx == UINT_MAX) return 0;
+ reset_frame_idx = UINT_MAX;
+ }
+ if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
+ return 0;
+ if (reset_frame_idx != UINT_MAX) {
+ if (frame_idx == UINT_MAX) {
+ if (reset_frame_idx > first + ttd->cur)
+ return 0;
+ ttd->cur = reset_frame_idx - first;
+ goto cur_found;
+ }
+ } else if (frame_idx == UINT_MAX)
+ return 0;
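+    // frame contexts form a ring starting at 'first'; unwrap frame_idx so it
+    // compares linearly against first + ttd->cur below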
+ if (frame_idx < first) frame_idx += c->n_fc;
+ const unsigned min_frame_idx = umin(reset_frame_idx, frame_idx);
+ const unsigned cur_frame_idx = first + ttd->cur;
+ if (ttd->cur < c->n_fc && cur_frame_idx < min_frame_idx)
+ return 0;
+ for (ttd->cur = min_frame_idx - first; ttd->cur < c->n_fc; ttd->cur++)
+ if (c->fc[(first + ttd->cur) % c->n_fc].task_thread.task_head)
+ break;
+cur_found:
+ for (unsigned i = ttd->cur; i < c->n_fc; i++)
+ c->fc[(first + i) % c->n_fc].task_thread.task_cur_prev = NULL;
+ return 1;
+}
+
+static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
+ unsigned frame_idx, unsigned n_frames)
+{
+ const unsigned first = atomic_load(&ttd->first);
+ if (frame_idx < first) frame_idx += n_frames;
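+    // lock-free minimum update: re-publish the smaller value until the one we
+    // displace is no smaller than the one we stored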
+ unsigned last_idx = frame_idx;
+ do {
+ frame_idx = last_idx;
+ last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
+ } while (last_idx < frame_idx);
+ if (frame_idx == first && atomic_load(&ttd->first) != first) {
+ unsigned expected = frame_idx;
+ atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX);
+ }
+}
+
+static void insert_tasks_between(Dav1dFrameContext *const f,
+ Dav1dTask *const first, Dav1dTask *const last,
+ Dav1dTask *const a, Dav1dTask *const b,
+ const int cond_signal)
+{
+ struct TaskThreadData *const ttd = f->task_thread.ttd;
+ if (atomic_load(f->c->flush)) return;
+ assert(!a || a->next == b);
+ if (!a) f->task_thread.task_head = first;
+ else a->next = first;
+ if (!b) f->task_thread.task_tail = last;
+ last->next = b;
+ reset_task_cur(f->c, ttd, first->frame_idx);
+ if (cond_signal && !atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+}
+
+static void insert_tasks(Dav1dFrameContext *const f,
+ Dav1dTask *const first, Dav1dTask *const last,
+ const int cond_signal)
+{
+ // insert task back into task queue
+ Dav1dTask *t_ptr, *prev_t = NULL;
+ for (t_ptr = f->task_thread.task_head;
+ t_ptr; prev_t = t_ptr, t_ptr = t_ptr->next)
+ {
+ // entropy coding precedes other steps
+ if (t_ptr->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
+ if (first->type > DAV1D_TASK_TYPE_TILE_ENTROPY) continue;
+ // both are entropy
+ if (first->sby > t_ptr->sby) continue;
+ if (first->sby < t_ptr->sby) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same sby
+ } else {
+ if (first->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ if (first->sby > t_ptr->sby) continue;
+ if (first->sby < t_ptr->sby) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same sby
+ if (first->type > t_ptr->type) continue;
+ if (first->type < t_ptr->type) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same task type
+ }
+
+ // sort by tile-id
+ assert(first->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION ||
+ first->type == DAV1D_TASK_TYPE_TILE_ENTROPY);
+ assert(first->type == t_ptr->type);
+ assert(t_ptr->sby == first->sby);
+ const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int t_tile_idx = (int) (first - f->task_thread.tile_tasks[p]);
+ const int p_tile_idx = (int) (t_ptr - f->task_thread.tile_tasks[p]);
+ assert(t_tile_idx != p_tile_idx);
+ if (t_tile_idx > p_tile_idx) continue;
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // append at the end
+ insert_tasks_between(f, first, last, prev_t, NULL, cond_signal);
+}
+
+static inline void insert_task(Dav1dFrameContext *const f,
+ Dav1dTask *const t, const int cond_signal)
+{
+ insert_tasks(f, t, t, cond_signal);
+}
+
+static inline void add_pending(Dav1dFrameContext *const f, Dav1dTask *const t) {
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ t->next = NULL;
+ if (!f->task_thread.pending_tasks.head)
+ f->task_thread.pending_tasks.head = t;
+ else
+ f->task_thread.pending_tasks.tail->next = t;
+ f->task_thread.pending_tasks.tail = t;
+ atomic_store(&f->task_thread.pending_tasks.merge, 1);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+}
+
+static inline int merge_pending_frame(Dav1dFrameContext *const f) {
+ int const merge = atomic_load(&f->task_thread.pending_tasks.merge);
+ if (merge) {
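+        // detach the whole pending list under its lock, then insert the tasks
+        // into the frame queue after dropping it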
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ Dav1dTask *t = f->task_thread.pending_tasks.head;
+ f->task_thread.pending_tasks.head = NULL;
+ f->task_thread.pending_tasks.tail = NULL;
+ atomic_store(&f->task_thread.pending_tasks.merge, 0);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+ while (t) {
+ Dav1dTask *const tmp = t->next;
+ insert_task(f, t, 0);
+ t = tmp;
+ }
+ }
+ return merge;
+}
+
+static inline int merge_pending(const Dav1dContext *const c) {
+ int res = 0;
+ for (unsigned i = 0; i < c->n_fc; i++)
+ res |= merge_pending_frame(&c->fc[i]);
+ return res;
+}
+
+static int create_filter_sbrow(Dav1dFrameContext *const f,
+ const int pass, Dav1dTask **res_t)
+{
+ const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1];
+ const int has_cdef = f->seq_hdr->cdef;
+ const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const int has_lr = f->lf.restore_planes;
+
+ Dav1dTask *tasks = f->task_thread.tasks;
+ const int uses_2pass = f->c->n_fc > 1;
+ int num_tasks = f->sbh * (1 + uses_2pass);
+ if (num_tasks > f->task_thread.num_tasks) {
+ const size_t size = sizeof(Dav1dTask) * num_tasks;
+ tasks = realloc(f->task_thread.tasks, size);
+ if (!tasks) return -1;
+ memset(tasks, 0, size);
+ f->task_thread.tasks = tasks;
+ f->task_thread.num_tasks = num_tasks;
+ }
+ tasks += f->sbh * (pass & 1);
+
+ if (pass & 1) {
+ f->frame_thread.entropy_progress = 0;
+ } else {
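+        // one progress bit per sbrow, packed into 32-bit words;
+        // frame_progress and copy_lpf_progress share a single allocation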
+ const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
+ if (prog_sz > f->frame_thread.prog_sz) {
+ atomic_uint *const prog = realloc(f->frame_thread.frame_progress,
+ 2 * prog_sz * sizeof(*prog));
+ if (!prog) return -1;
+ f->frame_thread.frame_progress = prog;
+ f->frame_thread.copy_lpf_progress = prog + prog_sz;
+ }
+ f->frame_thread.prog_sz = prog_sz;
+ memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint));
+ memset(f->frame_thread.copy_lpf_progress, 0, prog_sz * sizeof(atomic_uint));
+ atomic_store(&f->frame_thread.deblock_progress, 0);
+ }
+ f->frame_thread.next_tile_row[pass & 1] = 0;
+
+ Dav1dTask *t = &tasks[0];
+ t->sby = 0;
+ t->recon_progress = 1;
+ t->deblock_progress = 0;
+ t->type = pass == 1 ? DAV1D_TASK_TYPE_ENTROPY_PROGRESS :
+ has_deblock ? DAV1D_TASK_TYPE_DEBLOCK_COLS :
+ has_cdef || has_lr /* i.e. LR backup */ ? DAV1D_TASK_TYPE_DEBLOCK_ROWS :
+ has_resize ? DAV1D_TASK_TYPE_SUPER_RESOLUTION :
+ DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS;
+ t->frame_idx = (int)(f - f->c->fc);
+
+ *res_t = t;
+ return 0;
+}
+
+int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
+ const int cond_signal)
+{
+ Dav1dTask *tasks = f->task_thread.tile_tasks[0];
+ const int uses_2pass = f->c->n_fc > 1;
+ const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+ if (pass < 2) {
+ int alloc_num_tasks = num_tasks * (1 + uses_2pass);
+ if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
+ const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
+ tasks = realloc(f->task_thread.tile_tasks[0], size);
+ if (!tasks) return -1;
+ memset(tasks, 0, size);
+ f->task_thread.tile_tasks[0] = tasks;
+ f->task_thread.num_tile_tasks = alloc_num_tasks;
+ }
+ f->task_thread.tile_tasks[1] = tasks + num_tasks;
+ }
+ tasks += num_tasks * (pass & 1);
+
+ Dav1dTask *pf_t;
+ if (create_filter_sbrow(f, pass, &pf_t))
+ return -1;
+
+ Dav1dTask *prev_t = NULL;
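+    // chain one task per tile; the frame-level filter/progress task for
+    // sbrow 0 (pf_t) is spliced in before the first tile starting on a later
+    // sbrow, or appended at the end if all tiles start on sbrow 0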
+ for (int tile_idx = 0; tile_idx < num_tasks; tile_idx++) {
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+ Dav1dTask *t = &tasks[tile_idx];
+ t->sby = ts->tiling.row_start >> f->sb_shift;
+ if (pf_t && t->sby) {
+ prev_t->next = pf_t;
+ prev_t = pf_t;
+ pf_t = NULL;
+ }
+ t->recon_progress = 0;
+ t->deblock_progress = 0;
+ t->deps_skip = 0;
+ t->type = pass != 1 ? DAV1D_TASK_TYPE_TILE_RECONSTRUCTION :
+ DAV1D_TASK_TYPE_TILE_ENTROPY;
+ t->frame_idx = (int)(f - f->c->fc);
+ if (prev_t) prev_t->next = t;
+ prev_t = t;
+ }
+ if (pf_t) {
+ prev_t->next = pf_t;
+ prev_t = pf_t;
+ }
+ prev_t->next = NULL;
+
+ atomic_store(&f->task_thread.done[pass & 1], 0);
+
+    // XXX in theory this could be done locklessly; at this point there are no
+    // tasks in the frame queue, so no other runner should be using this lock,
+    // but we must add both passes at once
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ assert(f->task_thread.pending_tasks.head == NULL || pass == 2);
+ if (!f->task_thread.pending_tasks.head)
+ f->task_thread.pending_tasks.head = &tasks[0];
+ else
+ f->task_thread.pending_tasks.tail->next = &tasks[0];
+ f->task_thread.pending_tasks.tail = prev_t;
+ atomic_store(&f->task_thread.pending_tasks.merge, 1);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+
+ return 0;
+}
+
+void dav1d_task_frame_init(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+
+ atomic_store(&f->task_thread.init_done, 0);
+ // schedule init task, which will schedule the remaining tasks
+ Dav1dTask *const t = &f->task_thread.init_task;
+ t->type = DAV1D_TASK_TYPE_INIT;
+ t->frame_idx = (int)(f - c->fc);
+ t->sby = 0;
+ t->recon_progress = t->deblock_progress = 0;
+ insert_task(f, t, 1);
+}
+
+void dav1d_task_delayed_fg(Dav1dContext *const c, Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ struct TaskThreadData *const ttd = &c->task_thread;
+ ttd->delayed_fg.in = in;
+ ttd->delayed_fg.out = out;
+ ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_PREP;
+ atomic_init(&ttd->delayed_fg.progress[0], 0);
+ atomic_init(&ttd->delayed_fg.progress[1], 0);
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 1;
+ pthread_cond_signal(&ttd->cond);
+ pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
+ pthread_mutex_unlock(&ttd->lock);
+}
+
+static inline int ensure_progress(struct TaskThreadData *const ttd,
+ Dav1dFrameContext *const f,
+ Dav1dTask *const t, const enum TaskType type,
+ atomic_int *const state, int *const target)
+{
+    // deblock_rows (non-LR portion) depends on deblock of the previous sbrow,
+    // so ensure that has completed; if not, re-add to the task queue, else fall through
+ int p1 = atomic_load(state);
+ if (p1 < t->sby) {
+ t->type = type;
+ t->recon_progress = t->deblock_progress = 0;
+ *target = t->sby;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ return 1;
+ }
+ return 0;
+}
+
+static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
+ const int frame_mt)
+{
+ const int tp = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int tile_idx = (int)(t - f->task_thread.tile_tasks[tp]);
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+ const int p1 = atomic_load(&ts->progress[tp]);
+ if (p1 < t->sby) return 1;
+ int error = p1 == TILE_ERROR;
+ error |= atomic_fetch_or(&f->task_thread.error, error);
+ if (!error && frame_mt && !tp) {
+ const int p2 = atomic_load(&ts->progress[1]);
+ if (p2 <= t->sby) return 1;
+ error = p2 == TILE_ERROR;
+ error |= atomic_fetch_or(&f->task_thread.error, error);
+ }
+ if (!error && frame_mt && !IS_KEY_OR_INTRA(f->frame_hdr)) {
+ // check reference state
+ const Dav1dThreadPicture *p = &f->sr_cur;
+ const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const unsigned p_b = (t->sby + 1) << (f->sb_shift + 2);
+ const int tile_sby = t->sby - (ts->tiling.row_start >> f->sb_shift);
+ const int (*const lowest_px)[2] = ts->lowest_pixel[tile_sby];
+ for (int n = t->deps_skip; n < 7; n++, t->deps_skip++) {
+ unsigned lowest;
+ if (tp) {
+ // if temporal mv refs are disabled, we only need this
+ // for the primary ref; if segmentation is disabled, we
+ // don't even need that
+ lowest = p_b;
+ } else {
+ // +8 is postfilter-induced delay
+ const int y = lowest_px[n][0] == INT_MIN ? INT_MIN :
+ lowest_px[n][0] + 8;
+ const int uv = lowest_px[n][1] == INT_MIN ? INT_MIN :
+ lowest_px[n][1] * (1 << ss_ver) + 8;
+ const int max = imax(y, uv);
+ if (max == INT_MIN) continue;
+ lowest = iclip(max, 1, f->refp[n].p.p.h);
+ }
+ const unsigned p3 = atomic_load(&f->refp[n].progress[!tp]);
+ if (p3 < lowest) return 1;
+ atomic_fetch_or(&f->task_thread.error, p3 == FRAME_ERROR);
+ }
+ }
+ return 0;
+}
+
+static inline int get_frame_progress(const Dav1dContext *const c,
+ const Dav1dFrameContext *const f)
+{
+ unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
+ if (frame_prog >= FRAME_ERROR)
+ return f->sbh - 1;
+ int idx = frame_prog >> (f->sb_shift + 7);
+ int prog;
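+    // scan the per-sbrow completion bitmap; inverting each word lets ctz find
+    // the first sbrow that is not yet done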
+ do {
+ atomic_uint *state = &f->frame_thread.frame_progress[idx];
+ const unsigned val = ~atomic_load(state);
+ prog = val ? ctz(val) : 32;
+ if (prog != 32) break;
+ prog = 0;
+ } while (++idx < f->frame_thread.prog_sz);
+ return ((idx << 5) | prog) - 1;
+}
+
+static inline void abort_frame(Dav1dFrameContext *const f, const int error) {
+ atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1);
+ atomic_store(&f->task_thread.task_counter, 0);
+ atomic_store(&f->task_thread.done[0], 1);
+ atomic_store(&f->task_thread.done[1], 1);
+ atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
+ atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
+ dav1d_decode_frame_exit(f, error);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+}
+
+static inline void delayed_fg_task(const Dav1dContext *const c,
+ struct TaskThreadData *const ttd)
+{
+ const Dav1dPicture *const in = ttd->delayed_fg.in;
+ Dav1dPicture *const out = ttd->delayed_fg.out;
+#if CONFIG_16BPC
+ int off;
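+    // dsp[] index for high bit depth: 10 bpc -> 1, 12 bpc -> 2 (8 bpc uses dsp[0])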
+ if (out->p.bpc != 8)
+ off = (out->p.bpc >> 1) - 4;
+#endif
+ switch (ttd->delayed_fg.type) {
+ case DAV1D_TASK_TYPE_FG_PREP:
+ ttd->delayed_fg.exec = 0;
+ if (atomic_load(&ttd->cond_signaled))
+ pthread_cond_signal(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_prep_grain_8bpc(&c->dsp[0].fg, out, in,
+ ttd->delayed_fg.scaling_8bpc,
+ ttd->delayed_fg.grain_lut_8bpc);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_prep_grain_16bpc(&c->dsp[off].fg, out, in,
+ ttd->delayed_fg.scaling_16bpc,
+ ttd->delayed_fg.grain_lut_16bpc);
+ break;
+#endif
+ default: abort();
+ }
+ ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_APPLY;
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 1;
+ // fall-through
+ case DAV1D_TASK_TYPE_FG_APPLY:;
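+        // progress[0] hands out 32-line strips to the runners; progress[1]
+        // counts finished strips so the last runner can signal completion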
+ int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+ pthread_mutex_unlock(&ttd->lock);
+ int progmax = (out->p.h + 31) >> 5;
+ fg_apply_loop:
+ if (row + 1 < progmax)
+ pthread_cond_signal(&ttd->cond);
+ else if (row + 1 >= progmax) {
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 0;
+ if (row >= progmax) goto end_add;
+ pthread_mutex_unlock(&ttd->lock);
+ }
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
+ ttd->delayed_fg.scaling_8bpc,
+ ttd->delayed_fg.grain_lut_8bpc, row);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
+ ttd->delayed_fg.scaling_16bpc,
+ ttd->delayed_fg.grain_lut_16bpc, row);
+ break;
+#endif
+ default: abort();
+ }
+ row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+ int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+ if (row < progmax) goto fg_apply_loop;
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 0;
+ end_add:
+ done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+ progmax = atomic_load(&ttd->delayed_fg.progress[0]);
+ // signal for completion only once the last runner reaches this
+ if (done < progmax)
+ break;
+ pthread_cond_signal(&ttd->delayed_fg.cond);
+ break;
+ default: abort();
+ }
+}
+
+void *dav1d_worker_task(void *data) {
+ Dav1dTaskContext *const tc = data;
+ const Dav1dContext *const c = tc->c;
+ struct TaskThreadData *const ttd = tc->task_thread.ttd;
+
+ dav1d_set_thread_name("dav1d-worker");
+
+ pthread_mutex_lock(&ttd->lock);
+ for (;;) {
+ if (tc->task_thread.die) break;
+ if (atomic_load(c->flush)) goto park;
+
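+        // scheduling priority: delayed film grain, then frame init tasks,
+        // then regular decoding tasks in ring order starting at ttd->first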
+ merge_pending(c);
+ if (ttd->delayed_fg.exec) { // run delayed film grain first
+ delayed_fg_task(c, ttd);
+ continue;
+ }
+ Dav1dFrameContext *f;
+ Dav1dTask *t, *prev_t = NULL;
+ if (c->n_fc > 1) { // run init tasks second
+ for (unsigned i = 0; i < c->n_fc; i++) {
+ const unsigned first = atomic_load(&ttd->first);
+ f = &c->fc[(first + i) % c->n_fc];
+ if (atomic_load(&f->task_thread.init_done)) continue;
+ t = f->task_thread.task_head;
+ if (!t) continue;
+ if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
+ if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
+                    // XXX This could be a simple else if tasks of both passes
+                    // were added at once (in dav1d_task_create_tile_sbrow).
+                    // Adding the tasks to the pending Q can result in a
+                    // thread merging them before init_done is set.
+                    // We would need to set init_done before adding to the
+                    // pending Q, so maybe return the tasks, set init_done,
+                    // and only then add them to the pending Q.
+ const int p1 = f->in_cdf.progress ?
+ atomic_load(f->in_cdf.progress) : 1;
+ if (p1) {
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ goto found;
+ }
+ }
+ }
+ }
+ while (ttd->cur < c->n_fc) { // run decoding tasks last
+ const unsigned first = atomic_load(&ttd->first);
+ f = &c->fc[(first + ttd->cur) % c->n_fc];
+ merge_pending_frame(f);
+ prev_t = f->task_thread.task_cur_prev;
+ t = prev_t ? prev_t->next : f->task_thread.task_head;
+ while (t) {
+ if (t->type == DAV1D_TASK_TYPE_INIT_CDF) goto next;
+ else if (t->type == DAV1D_TASK_TYPE_TILE_ENTROPY ||
+ t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION)
+ {
+ // if not bottom sbrow of tile, this task will be re-added
+ // after it's finished
+ if (!check_tile(t, f, c->n_fc > 1))
+ goto found;
+ } else if (t->recon_progress) {
+ const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
+ int error = atomic_load(&f->task_thread.error);
+ assert(!atomic_load(&f->task_thread.done[p]) || error);
+ const int tile_row_base = f->frame_hdr->tiling.cols *
+ f->frame_thread.next_tile_row[p];
+ if (p) {
+ atomic_int *const prog = &f->frame_thread.entropy_progress;
+ const int p1 = atomic_load(prog);
+ if (p1 < t->sby) goto next;
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ }
+ for (int tc = 0; tc < f->frame_hdr->tiling.cols; tc++) {
+ Dav1dTileState *const ts = &f->ts[tile_row_base + tc];
+ const int p2 = atomic_load(&ts->progress[p]);
+ if (p2 < t->recon_progress) goto next;
+ atomic_fetch_or(&f->task_thread.error, p2 == TILE_ERROR);
+ }
+ if (t->sby + 1 < f->sbh) {
+ // add sby+1 to list to replace this one
+ Dav1dTask *next_t = &t[1];
+ *next_t = *t;
+ next_t->sby++;
+ const int ntr = f->frame_thread.next_tile_row[p] + 1;
+ const int start = f->frame_hdr->tiling.row_start_sb[ntr];
+ if (next_t->sby == start)
+ f->frame_thread.next_tile_row[p] = ntr;
+ next_t->recon_progress = next_t->sby + 1;
+ insert_task(f, next_t, 0);
+ }
+ goto found;
+ } else if (t->type == DAV1D_TASK_TYPE_CDEF) {
+ atomic_uint *prog = f->frame_thread.copy_lpf_progress;
+ const int p1 = atomic_load(&prog[(t->sby - 1) >> 5]);
+ if (p1 & (1U << ((t->sby - 1) & 31)))
+ goto found;
+ } else {
+ assert(t->deblock_progress);
+ const int p1 = atomic_load(&f->frame_thread.deblock_progress);
+ if (p1 >= t->deblock_progress) {
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ goto found;
+ }
+ }
+ next:
+ prev_t = t;
+ t = t->next;
+ f->task_thread.task_cur_prev = prev_t;
+ }
+ ttd->cur++;
+ }
+ if (reset_task_cur(c, ttd, UINT_MAX)) continue;
+ if (merge_pending(c)) continue;
+ park:
+ tc->task_thread.flushed = 1;
+ pthread_cond_signal(&tc->task_thread.td.cond);
+ // we want to be woken up next time progress is signaled
+ atomic_store(&ttd->cond_signaled, 0);
+ pthread_cond_wait(&ttd->cond, &ttd->lock);
+ tc->task_thread.flushed = 0;
+ reset_task_cur(c, ttd, UINT_MAX);
+ continue;
+
+ found:
+ // remove t from list
+ if (prev_t) prev_t->next = t->next;
+ else f->task_thread.task_head = t->next;
+ if (!t->next) f->task_thread.task_tail = prev_t;
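+        // if this was a decoding task and the frame queue is now empty,
+        // advance the scan position past this frame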
+ if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
+ ttd->cur++;
+ t->next = NULL;
+ // we don't need to check cond_signaled here, since we found a task
+ // after the last signal so we want to re-signal the next waiting thread
+ // and again won't need to signal after that
+ atomic_store(&ttd->cond_signaled, 1);
+ pthread_cond_signal(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ found_unlocked:;
+ const int flush = atomic_load(c->flush);
+ int error = atomic_fetch_or(&f->task_thread.error, flush) | flush;
+
+ // run it
+ tc->f = f;
+ int sby = t->sby;
+ switch (t->type) {
+ case DAV1D_TASK_TYPE_INIT: {
+ assert(c->n_fc > 1);
+ int res = dav1d_decode_frame_init(f);
+ int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
+ if (res || p1 == TILE_ERROR) {
+ pthread_mutex_lock(&ttd->lock);
+ abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
+ reset_task_cur(c, ttd, t->frame_idx);
+ } else {
+ t->type = DAV1D_TASK_TYPE_INIT_CDF;
+ if (p1) goto found_unlocked;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_INIT_CDF: {
+ assert(c->n_fc > 1);
+ int res = DAV1D_ERR(EINVAL);
+ if (!atomic_load(&f->task_thread.error))
+ res = dav1d_decode_frame_init_cdf(f);
+ if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
+ atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
+ }
+ if (!res) {
+ assert(c->n_fc > 1);
+ for (int p = 1; p <= 2; p++) {
+ const int res = dav1d_task_create_tile_sbrow(f, p, 0);
+ if (res) {
+ pthread_mutex_lock(&ttd->lock);
+ // memory allocation failed
+ atomic_store(&f->task_thread.done[2 - p], 1);
+ atomic_store(&f->task_thread.error, -1);
+ atomic_fetch_sub(&f->task_thread.task_counter,
+ f->frame_hdr->tiling.cols *
+ f->frame_hdr->tiling.rows + f->sbh);
+ atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
+ if (p == 2 && atomic_load(&f->task_thread.done[1])) {
+ assert(!atomic_load(&f->task_thread.task_counter));
+ dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ atomic_store(&f->task_thread.init_done, 1);
+ continue;
+ } else {
+ pthread_mutex_unlock(&ttd->lock);
+ }
+ }
+ }
+ atomic_store(&f->task_thread.init_done, 1);
+ pthread_mutex_lock(&ttd->lock);
+ } else {
+ pthread_mutex_lock(&ttd->lock);
+ abort_frame(f, res);
+ reset_task_cur(c, ttd, t->frame_idx);
+ atomic_store(&f->task_thread.init_done, 1);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_TILE_ENTROPY:
+ case DAV1D_TASK_TYPE_TILE_RECONSTRUCTION: {
+ const int p = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int tile_idx = (int)(t - f->task_thread.tile_tasks[p]);
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+
+ tc->ts = ts;
+ tc->by = sby << f->sb_shift;
+ const int uses_2pass = c->n_fc > 1;
+ tc->frame_thread.pass = !uses_2pass ? 0 :
+ 1 + (t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
+ if (!error) error = dav1d_decode_tile_sbrow(tc);
+ const int progress = error ? TILE_ERROR : 1 + sby;
+
+ // signal progress
+ atomic_fetch_or(&f->task_thread.error, error);
+ if (((sby + 1) << f->sb_shift) < ts->tiling.row_end) {
+ t->sby++;
+ t->deps_skip = 0;
+ if (!check_tile(t, f, uses_2pass)) {
+ atomic_store(&ts->progress[p], progress);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ goto found_unlocked;
+ }
+ atomic_store(&ts->progress[p], progress);
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ } else {
+ pthread_mutex_lock(&ttd->lock);
+ atomic_store(&ts->progress[p], progress);
+ reset_task_cur(c, ttd, t->frame_idx);
+ error = atomic_load(&f->task_thread.error);
+ if (f->frame_hdr->refresh_context &&
+ tc->frame_thread.pass <= 1 && f->task_thread.update_set &&
+ f->frame_hdr->tiling.update == tile_idx)
+ {
+ if (!error)
+ dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
+ &f->ts[f->frame_hdr->tiling.update].cdf);
+ if (c->n_fc > 1)
+ atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
+ }
+ if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 &&
+ atomic_load(&f->task_thread.done[0]) &&
+ (!uses_2pass || atomic_load(&f->task_thread.done[1])))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ assert(atomic_load(&f->task_thread.task_counter) >= 0);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_DEBLOCK_COLS:
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_deblock_cols(f, sby);
+ if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_DEBLOCK_ROWS,
+ &f->frame_thread.deblock_progress,
+ &t->deblock_progress)) continue;
+ // fall-through
+ case DAV1D_TASK_TYPE_DEBLOCK_ROWS:
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_deblock_rows(f, sby);
+ // signal deblock progress
+ if (f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1])
+ {
+ error = atomic_load(&f->task_thread.error);
+ atomic_store(&f->frame_thread.deblock_progress,
+ error ? TILE_ERROR : sby + 1);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ } else if (f->seq_hdr->cdef || f->lf.restore_planes) {
+ atomic_fetch_or(&f->frame_thread.copy_lpf_progress[sby >> 5],
+ 1U << (sby & 31));
+ // CDEF needs the top buffer to be saved by lr_copy_lpf of the
+ // previous sbrow
+ if (sby) {
+ int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
+ if (~prog & (1U << ((sby - 1) & 31))) {
+ t->type = DAV1D_TASK_TYPE_CDEF;
+ t->recon_progress = t->deblock_progress = 0;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ continue;
+ }
+ }
+ }
+ // fall-through
+ case DAV1D_TASK_TYPE_CDEF:
+ if (f->seq_hdr->cdef) {
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_cdef(tc, sby);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ }
+ // fall-through
+ case DAV1D_TASK_TYPE_SUPER_RESOLUTION:
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_resize(f, sby);
+ // fall-through
+ case DAV1D_TASK_TYPE_LOOP_RESTORATION:
+ if (!atomic_load(&f->task_thread.error) && f->lf.restore_planes)
+ f->bd_fn.filter_sbrow_lr(f, sby);
+ // fall-through
+ case DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS:
+            // dummy task to cover the case of no post-filters
+        case DAV1D_TASK_TYPE_ENTROPY_PROGRESS:
+            // dummy task to convert tile progress to frame progress
+ break;
+ default: abort();
+ }
+ // if task completed [typically LR], signal picture progress as per below
+ const int uses_2pass = c->n_fc > 1;
+ const int sbh = f->sbh;
+ const int sbsz = f->sb_step * 4;
+ if (t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
+ error = atomic_load(&f->task_thread.error);
+ const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
+ assert(c->n_fc > 1);
+            if (f->sr_cur.p.data[0] /* upon flush, this can already be freed */)
+ atomic_store(&f->sr_cur.progress[0], error ? FRAME_ERROR : y);
+ atomic_store(&f->frame_thread.entropy_progress,
+ error ? TILE_ERROR : sby + 1);
+ if (sby + 1 == sbh)
+ atomic_store(&f->task_thread.done[1], 1);
+ pthread_mutex_lock(&ttd->lock);
+ const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
+ if (sby + 1 < sbh && num_tasks) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
+ atomic_load(&f->task_thread.done[1]))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ // t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS
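+        // mark this sbrow as done in the bitmap, then derive the lowest
+        // contiguously completed sbrow to publish as frame progress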
+ atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
+ 1U << (sby & 31));
+ pthread_mutex_lock(&f->task_thread.lock);
+ sby = get_frame_progress(c, f);
+ error = atomic_load(&f->task_thread.error);
+ const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
+        if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can already be freed */)
+ atomic_store(&f->sr_cur.progress[1], error ? FRAME_ERROR : y);
+ pthread_mutex_unlock(&f->task_thread.lock);
+ if (sby + 1 == sbh)
+ atomic_store(&f->task_thread.done[0], 1);
+ pthread_mutex_lock(&ttd->lock);
+ const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
+ if (sby + 1 < sbh && num_tasks) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
+ (!uses_2pass || atomic_load(&f->task_thread.done[1])))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ reset_task_cur(c, ttd, t->frame_idx);
+ }
+ pthread_mutex_unlock(&ttd->lock);
+
+ return NULL;
+}