Diffstat (limited to 'src/utils')
-rw-r--r--   src/utils/dolbyvision.c |   63
-rw-r--r--   src/utils/frame_queue.c | 1030
-rw-r--r--   src/utils/upload.c      |  382
3 files changed, 1475 insertions, 0 deletions
diff --git a/src/utils/dolbyvision.c b/src/utils/dolbyvision.c
new file mode 100644
index 0000000..3798532
--- /dev/null
+++ b/src/utils/dolbyvision.c
@@ -0,0 +1,63 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include <libplacebo/utils/dolbyvision.h>
+
+#ifdef PL_HAVE_LIBDOVI
+#include <libplacebo/tone_mapping.h>
+#include <libdovi/rpu_parser.h>
+#endif
+
+void pl_hdr_metadata_from_dovi_rpu(struct pl_hdr_metadata *out,
+                                   const uint8_t *buf, size_t size)
+{
+#ifdef PL_HAVE_LIBDOVI
+    if (buf && size) {
+        DoviRpuOpaque *rpu =
+            dovi_parse_unspec62_nalu(buf, size);
+        const DoviRpuDataHeader *header = dovi_rpu_get_header(rpu);
+
+        if (header && header->vdr_dm_metadata_present_flag) {
+            // Profile 4 reshaping isn't done as it is a dual layer format.
+            // However there are still unknowns on its EOTF, so it cannot be enabled.
+            //
+            // For profile 7, the brightness metadata can still be used as most
+            // titles are going to have accurate metadata<->image brightness,
+            // with the exception of some titles that require the enhancement layer
+            // to be processed to restore the intended brightness, which would then
+            // match the metadata values.
+            if (header->guessed_profile == 4) {
+                goto done;
+            }
+
+            const DoviVdrDmData *vdr_dm_data = dovi_rpu_get_vdr_dm_data(rpu);
+            if (vdr_dm_data->dm_data.level1) {
+                const DoviExtMetadataBlockLevel1 *l1 = vdr_dm_data->dm_data.level1;
+                out->max_pq_y = l1->max_pq / 4095.0f;
+                out->avg_pq_y = l1->avg_pq / 4095.0f;
+            }
+
+            dovi_rpu_free_vdr_dm_data(vdr_dm_data);
+        }
+
+        done:
+        dovi_rpu_free_header(header);
+        dovi_rpu_free(rpu);
+    }
+#endif
+}
diff --git a/src/utils/frame_queue.c b/src/utils/frame_queue.c
new file mode 100644
index 0000000..0155983
--- /dev/null
+++ b/src/utils/frame_queue.c
@@ -0,0 +1,1030 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include <errno.h> +#include <math.h> + +#include "common.h" +#include "log.h" +#include "pl_thread.h" + +#include <libplacebo/utils/frame_queue.h> + +struct cache_entry { + pl_tex tex[4]; +}; + +struct entry { + pl_rc_t rc; + double pts; + struct cache_entry cache; + struct pl_source_frame src; + struct pl_frame frame; + uint64_t signature; + bool mapped; + bool ok; + + // for interlaced frames + enum pl_field field; + struct entry *primary; + struct entry *prev, *next; + bool dirty; +}; + +// Hard limits for vsync timing validity +#define MIN_FPS 10 +#define MAX_FPS 400 + +// Limits for FPS estimation state +#define MAX_SAMPLES 32 +#define MIN_SAMPLES 4 + +// Stickiness to prevent `interpolation_threshold` oscillation +#define THRESHOLD_MAX_RATIO 0.3 +#define THRESHOLD_FRAMES 5 + +// Maximum number of not-yet-mapped frames to allow queueing in advance +#define PREFETCH_FRAMES 2 + +struct pool { + float samples[MAX_SAMPLES]; + float estimate; + float sum; + int idx; + int num; + int total; +}; + +struct pl_queue_t { + pl_gpu gpu; + pl_log log; + + // For multi-threading, we use two locks. The `lock_weak` guards the queue + // state itself. The `lock_strong` has a bigger scope and should be held + // for the duration of any functions that expect the queue state to + // remain more or less valid (with the exception of adding new members). + // + // In particular, `pl_queue_reset` and `pl_queue_update` will take + // the strong lock, while `pl_queue_push_*` will only take the weak + // lock. + pl_mutex lock_strong; + pl_mutex lock_weak; + pl_cond wakeup; + + // Frame queue and state + PL_ARRAY(struct entry *) queue; + uint64_t signature; + int threshold_frames; + bool want_frame; + bool eof; + + // Average vsync/frame fps estimation state + struct pool vps, fps; + float reported_vps; + float reported_fps; + double prev_pts; + + // Storage for temporary arrays + PL_ARRAY(uint64_t) tmp_sig; + PL_ARRAY(float) tmp_ts; + PL_ARRAY(const struct pl_frame *) tmp_frame; + + // Queue of GPU objects to reuse + PL_ARRAY(struct cache_entry) cache; +}; + +pl_queue pl_queue_create(pl_gpu gpu) +{ + pl_queue p = pl_alloc_ptr(NULL, p); + *p = (struct pl_queue_t) { + .gpu = gpu, + .log = gpu->log, + }; + + pl_mutex_init(&p->lock_strong); + pl_mutex_init(&p->lock_weak); + int ret = pl_cond_init(&p->wakeup); + if (ret) { + PL_ERR(p, "Failed to init conditional variable: %d", ret); + return NULL; + } + return p; +} + +static void recycle_cache(pl_queue p, struct cache_entry *cache, bool recycle) +{ + bool has_textures = false; + for (int i = 0; i < PL_ARRAY_SIZE(cache->tex); i++) { + if (!cache->tex[i]) + continue; + + has_textures = true; + if (recycle) { + pl_tex_invalidate(p->gpu, cache->tex[i]); + } else { + pl_tex_destroy(p->gpu, &cache->tex[i]); + } + } + + if (recycle && has_textures) + PL_ARRAY_APPEND(p, p->cache, *cache); + + memset(cache, 0, sizeof(*cache)); // sanity +} + +static void entry_deref(pl_queue p, struct entry **pentry, bool recycle) +{ + struct entry *entry = *pentry; + *pentry = NULL; + if (!entry || !pl_rc_deref(&entry->rc)) + return; + + if (!entry->mapped && entry->src.discard) { + PL_TRACE(p, "Discarding unused frame id %"PRIu64" with PTS %f", + entry->signature, entry->src.pts); + entry->src.discard(&entry->src); + } + + if (entry->mapped && entry->ok && entry->src.unmap) { + PL_TRACE(p, "Unmapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->src.pts); + entry->src.unmap(p->gpu, &entry->frame, &entry->src); + } + + recycle_cache(p, &entry->cache, recycle); + 
pl_free(entry); +} + +static struct entry *entry_ref(struct entry *entry) +{ + pl_rc_ref(&entry->rc); + return entry; +} + +static void entry_cull(pl_queue p, struct entry *entry, bool recycle) +{ + // Forcibly clean up references to prev/next frames, even if `entry` has + // remaining refs pointing at it. This is to prevent cyclic references. + entry_deref(p, &entry->primary, recycle); + entry_deref(p, &entry->prev, recycle); + entry_deref(p, &entry->next, recycle); + entry_deref(p, &entry, recycle); +} + +void pl_queue_destroy(pl_queue *queue) +{ + pl_queue p = *queue; + if (!p) + return; + + for (int n = 0; n < p->queue.num; n++) + entry_cull(p, p->queue.elem[n], false); + for (int n = 0; n < p->cache.num; n++) { + for (int i = 0; i < PL_ARRAY_SIZE(p->cache.elem[n].tex); i++) + pl_tex_destroy(p->gpu, &p->cache.elem[n].tex[i]); + } + + pl_cond_destroy(&p->wakeup); + pl_mutex_destroy(&p->lock_weak); + pl_mutex_destroy(&p->lock_strong); + pl_free(p); + *queue = NULL; +} + +void pl_queue_reset(pl_queue p) +{ + pl_mutex_lock(&p->lock_strong); + pl_mutex_lock(&p->lock_weak); + + for (int i = 0; i < p->queue.num; i++) + entry_cull(p, p->queue.elem[i], false); + + *p = (struct pl_queue_t) { + .gpu = p->gpu, + .log = p->log, + + // Reuse lock objects + .lock_strong = p->lock_strong, + .lock_weak = p->lock_weak, + .wakeup = p->wakeup, + + // Explicitly preserve allocations + .queue.elem = p->queue.elem, + .tmp_sig.elem = p->tmp_sig.elem, + .tmp_ts.elem = p->tmp_ts.elem, + .tmp_frame.elem = p->tmp_frame.elem, + + // Reuse GPU object cache entirely + .cache = p->cache, + }; + + pl_cond_signal(&p->wakeup); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); +} + +static inline float delta(float old, float new) +{ + return fabsf((new - old) / PL_MIN(new, old)); +} + +static inline void default_estimate(struct pool *pool, float val) +{ + if (!pool->estimate && isnormal(val) && val > 0.0) + pool->estimate = val; +} + +static inline void update_estimate(struct pool *pool, float cur) +{ + if (pool->num) { + static const float max_delta = 0.3; + if (delta(pool->sum / pool->num, cur) > max_delta) { + pool->sum = 0.0; + pool->num = pool->idx = 0; + } + } + + if (pool->num++ == MAX_SAMPLES) { + pool->sum -= pool->samples[pool->idx]; + pool->num--; + } + + pool->sum += pool->samples[pool->idx] = cur; + pool->idx = (pool->idx + 1) % MAX_SAMPLES; + pool->total++; + + if (pool->total < MIN_SAMPLES || pool->num >= MIN_SAMPLES) + pool->estimate = pool->sum / pool->num; +} + +static void queue_push(pl_queue p, const struct pl_source_frame *src) +{ + if (p->eof && !src) + return; // ignore duplicate EOF + + if (p->eof && src) { + PL_INFO(p, "Received frame after EOF signaled... discarding frame!"); + if (src->discard) + src->discard(src); + return; + } + + pl_cond_signal(&p->wakeup); + + if (!src) { + PL_TRACE(p, "Received EOF, draining frame queue..."); + p->eof = true; + p->want_frame = false; + return; + } + + // Update FPS estimates if possible/reasonable + default_estimate(&p->fps, src->first_field ? 
src->duration / 2 : src->duration); + if (p->queue.num) { + double last_pts = p->queue.elem[p->queue.num - 1]->pts; + float delta = src->pts - last_pts; + if (delta <= 0.0f) { + PL_DEBUG(p, "Non monotonically increasing PTS %f -> %f", last_pts, src->pts); + } else if (p->fps.estimate && delta > 10.0 * p->fps.estimate) { + PL_DEBUG(p, "Discontinuous source PTS jump %f -> %f", last_pts, src->pts); + } else { + update_estimate(&p->fps, delta); + } + } else if (src->pts != 0) { + PL_DEBUG(p, "First frame received with non-zero PTS %f", src->pts); + } + + struct entry *entry = pl_alloc_ptr(NULL, entry); + *entry = (struct entry) { + .signature = p->signature++, + .pts = src->pts, + .src = *src, + }; + pl_rc_init(&entry->rc); + PL_ARRAY_POP(p->cache, &entry->cache); + PL_TRACE(p, "Added new frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + + // Insert new entry into the correct spot in the queue, sorted by PTS + for (int i = p->queue.num;; i--) { + if (i == 0 || p->queue.elem[i - 1]->pts <= entry->pts) { + if (src->first_field == PL_FIELD_NONE) { + // Progressive + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + break; + } else { + // Interlaced + struct entry *prev = i > 0 ? p->queue.elem[i - 1] : NULL; + struct entry *next = i < p->queue.num ? p->queue.elem[i] : NULL; + struct entry *entry2 = pl_zalloc_ptr(NULL, entry2); + pl_rc_init(&entry2->rc); + if (next) { + entry2->pts = (entry->pts + next->pts) / 2; + } else if (src->duration) { + entry2->pts = entry->pts + src->duration / 2; + } else if (p->fps.estimate) { + entry2->pts = entry->pts + p->fps.estimate; + } else { + PL_ERR(p, "Frame with PTS %f specified as interlaced, but " + "no FPS information known yet! Please specify a " + "valid `pl_source_frame.duration`. Treating as " + "progressive...", src->pts); + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + pl_free(entry2); + break; + } + + entry->field = src->first_field; + entry2->primary = entry_ref(entry); + entry2->field = pl_field_other(entry->field); + entry2->signature = p->signature++; + + PL_TRACE(p, "Added second field id %"PRIu64" with PTS %f", + entry2->signature, entry2->pts); + + // Link previous/next frames + if (prev) { + entry->prev = entry_ref(PL_DEF(prev->primary, prev)); + entry2->prev = entry_ref(PL_DEF(prev->primary, prev)); + // Retroactively re-link the previous frames that should + // be referencing this frame + for (int j = i - 1; j >= 0; --j) { + struct entry *e = p->queue.elem[j]; + if (e != prev && e != prev->primary) + break; + entry_deref(p, &e->next, true); + e->next = entry_ref(entry); + if (e->dirty) { // reset signature to signal change + e->signature = p->signature++; + e->dirty = false; + } + } + } + + if (next) { + entry->next = entry_ref(PL_DEF(next->primary, next)); + entry2->next = entry_ref(PL_DEF(next->primary, next)); + for (int j = i; j < p->queue.num; j++) { + struct entry *e = p->queue.elem[j]; + if (e != next && e != next->primary) + break; + entry_deref(p, &e->prev, true); + e->prev = entry_ref(entry); + if (e->dirty) { + e->signature = p->signature++; + e->dirty = false; + } + } + } + + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + PL_ARRAY_INSERT_AT(p, p->queue, i+1, entry2); + break; + } + } + } + + p->want_frame = false; +} + +void pl_queue_push(pl_queue p, const struct pl_source_frame *frame) +{ + pl_mutex_lock(&p->lock_weak); + queue_push(p, frame); + pl_mutex_unlock(&p->lock_weak); +} + +static inline bool entry_mapped(struct entry *entry) +{ + return entry->mapped || (entry->primary && entry->primary->mapped); +} + 
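/*
 * Illustrative sketch (editorial, not part of the patch above): a decoder
 * thread feeding this queue via pl_queue_push_block(), defined just below.
 * All `my_decoder_*` / `my_*` names are hypothetical placeholders for the
 * caller's own code; only pl_queue_push*() and struct pl_source_frame come
 * from <libplacebo/utils/frame_queue.h>.
 */
#include <stdint.h>
#include <libplacebo/utils/frame_queue.h>

// Hypothetical decoder glue (placeholders, not a real API):
void  *my_decoder_next_frame(void);
double my_decoder_pts(void *raw);
float  my_decoder_duration(void *raw);
bool   my_decoder_upload(pl_gpu gpu, pl_tex tex[4], void *raw, struct pl_frame *out);
void   my_decoder_release(void *raw);

static bool my_map(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src,
                   struct pl_frame *out_frame)
{
    // Interpret/upload the decoder frame into `out_frame`, reusing the
    // texture slots in `tex[]` (e.g. via pl_upload_plane from upload.c)
    return my_decoder_upload(gpu, tex, src->frame_data, out_frame);
}

static void my_unmap(pl_gpu gpu, struct pl_frame *frame,
                     const struct pl_source_frame *src)
{
    my_decoder_release(src->frame_data); // frame was shown, drop our reference
}

static void my_discard(const struct pl_source_frame *src)
{
    my_decoder_release(src->frame_data); // frame was culled without being mapped
}

static void my_decode_thread(pl_queue queue)
{
    void *raw;
    while ((raw = my_decoder_next_frame())) {
        pl_queue_push_block(queue, UINT64_MAX, &(struct pl_source_frame) {
            .pts        = my_decoder_pts(raw),
            .duration   = my_decoder_duration(raw), // seeds the FPS estimate
            .frame_data = raw,
            .map        = my_map,
            .unmap      = my_unmap,
            .discard    = my_discard,
        }); // struct is copied by the queue; blocks while the queue is full
    }
    pl_queue_push(queue, NULL); // NULL frame signals EOF / drains the queue
}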
+static bool queue_has_room(pl_queue p) +{ + if (p->want_frame) + return true; + + int wanted_frames = PREFETCH_FRAMES; + if (p->fps.estimate && p->vps.estimate && p->vps.estimate <= 1.0f / MIN_FPS) + wanted_frames += ceilf(p->vps.estimate / p->fps.estimate) - 1; + + // Examine the queue tail + for (int i = p->queue.num - 1; i >= 0; i--) { + if (entry_mapped(p->queue.elem[i])) + return true; + if (p->queue.num - i >= wanted_frames) + return false; + } + + return true; +} + +bool pl_queue_push_block(pl_queue p, uint64_t timeout, + const struct pl_source_frame *frame) +{ + pl_mutex_lock(&p->lock_weak); + if (!timeout || !frame || p->eof) + goto skip_blocking; + + while (!queue_has_room(p) && !p->eof) { + if (pl_cond_timedwait(&p->wakeup, &p->lock_weak, timeout) == ETIMEDOUT) { + pl_mutex_unlock(&p->lock_weak); + return false; + } + } + +skip_blocking: + + queue_push(p, frame); + pl_mutex_unlock(&p->lock_weak); + return true; +} + +static void report_estimates(pl_queue p) +{ + if (p->fps.total >= MIN_SAMPLES && p->vps.total >= MIN_SAMPLES) { + if (p->reported_fps && p->reported_vps) { + // Only re-report the estimates if they've changed considerably + // from the previously reported values + static const float report_delta = 0.3f; + float delta_fps = delta(p->reported_fps, p->fps.estimate); + float delta_vps = delta(p->reported_vps, p->vps.estimate); + if (delta_fps < report_delta && delta_vps < report_delta) + return; + } + + PL_INFO(p, "Estimated source FPS: %.3f, display FPS: %.3f", + 1.0 / p->fps.estimate, 1.0 / p->vps.estimate); + + p->reported_fps = p->fps.estimate; + p->reported_vps = p->vps.estimate; + } +} + +// note: may add more than one frame, since it releases the lock +static enum pl_queue_status get_frame(pl_queue p, const struct pl_queue_params *params) +{ + if (p->eof) + return PL_QUEUE_EOF; + + if (!params->get_frame) { + if (!params->timeout) + return PL_QUEUE_MORE; + + p->want_frame = true; + pl_cond_signal(&p->wakeup); + + while (p->want_frame) { + if (pl_cond_timedwait(&p->wakeup, &p->lock_weak, params->timeout) == ETIMEDOUT) + return PL_QUEUE_MORE; + } + + return p->eof ? PL_QUEUE_EOF : PL_QUEUE_OK; + } + + // Don't hold the weak mutex while calling into `get_frame`, to allow + // `pl_queue_push` to run concurrently while we're waiting for frames + pl_mutex_unlock(&p->lock_weak); + + struct pl_source_frame src; + enum pl_queue_status ret; + switch ((ret = params->get_frame(&src, params))) { + case PL_QUEUE_OK: + pl_queue_push(p, &src); + break; + case PL_QUEUE_EOF: + pl_queue_push(p, NULL); + break; + case PL_QUEUE_MORE: + case PL_QUEUE_ERR: + break; + } + + pl_mutex_lock(&p->lock_weak); + return ret; +} + +static inline bool map_frame(pl_queue p, struct entry *entry) +{ + if (!entry->mapped) { + PL_TRACE(p, "Mapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + entry->mapped = true; + entry->ok = entry->src.map(p->gpu, entry->cache.tex, + &entry->src, &entry->frame); + if (!entry->ok) + PL_ERR(p, "Failed mapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + } + + return entry->ok; +} + +static bool map_entry(pl_queue p, struct entry *entry) +{ + bool ok = map_frame(p, entry->primary ? 
entry->primary : entry); + if (entry->prev) + ok &= map_frame(p, entry->prev); + if (entry->next) + ok &= map_frame(p, entry->next); + if (!ok) + return false; + + if (entry->primary) + entry->frame = entry->primary->frame; + + if (entry->field) { + entry->frame.field = entry->field; + entry->frame.first_field = PL_DEF(entry->primary, entry)->src.first_field; + entry->frame.prev = entry->prev ? &entry->prev->frame : NULL; + entry->frame.next = entry->next ? &entry->next->frame : NULL; + entry->dirty = true; + } + + return true; +} + +static bool entry_complete(struct entry *entry) +{ + return entry->field ? !!entry->next : true; +} + +// Advance the queue as needed to make sure idx 0 is the last frame before +// `pts`, and idx 1 is the first frame after `pts` (unless this is the last). +// +// Returns PL_QUEUE_OK only if idx 0 is still legal under ZOH semantics. +static enum pl_queue_status advance(pl_queue p, double pts, + const struct pl_queue_params *params) +{ + // Cull all frames except the last frame before `pts` + int culled = 0; + for (int i = 1; i < p->queue.num; i++) { + if (p->queue.elem[i]->pts <= pts) { + entry_cull(p, p->queue.elem[i - 1], true); + culled++; + } + } + PL_ARRAY_REMOVE_RANGE(p->queue, 0, culled); + + // Keep adding new frames until we find one in the future, or EOF + enum pl_queue_status ret = PL_QUEUE_OK; + while (p->queue.num < 2) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_ERR: + return ret; + case PL_QUEUE_EOF: + if (!p->queue.num) + return ret; + goto done; + case PL_QUEUE_MORE: + case PL_QUEUE_OK: + while (p->queue.num > 1 && p->queue.elem[1]->pts <= pts) { + entry_cull(p, p->queue.elem[0], true); + PL_ARRAY_REMOVE_AT(p->queue, 0); + } + if (ret == PL_QUEUE_MORE) + return ret; + continue; + } + } + + if (!entry_complete(p->queue.elem[1])) { + switch (get_frame(p, params)) { + case PL_QUEUE_ERR: + return PL_QUEUE_ERR; + case PL_QUEUE_MORE: + ret = PL_QUEUE_MORE; + // fall through + case PL_QUEUE_EOF: + case PL_QUEUE_OK: + goto done; + } + } + +done: + if (p->eof && p->queue.num == 1) { + if (p->queue.elem[0]->pts == 0.0 || !p->fps.estimate) { + // If the last frame has PTS 0.0, or we have no FPS estimate, then + // this is probably a single-frame file, in which case we want to + // extend the ZOH to infinity, rather than returning. Not a perfect + // heuristic, but w/e + return PL_QUEUE_OK; + } + + // Last frame is held for an extra `p->fps.estimate` duration, + // afterwards this function just returns EOF. 
+ if (pts < p->queue.elem[0]->pts + p->fps.estimate) { + ret = PL_QUEUE_OK; + } else { + entry_cull(p, p->queue.elem[0], true); + p->queue.num = 0; + return PL_QUEUE_EOF; + } + } + + pl_assert(p->queue.num); + return ret; +} + +static inline enum pl_queue_status point(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + if (!p->queue.num) { + *mix = (struct pl_frame_mix) {0}; + return PL_QUEUE_MORE; + } + + // Find closest frame (nearest neighbour semantics) + struct entry *entry = p->queue.elem[0]; + if (entry->pts > params->pts) { // first frame not visible yet + *mix = (struct pl_frame_mix) {0}; + return PL_QUEUE_OK; + } + + double best = fabs(entry->pts - params->pts); + for (int i = 1; i < p->queue.num; i++) { + double dist = fabs(p->queue.elem[i]->pts - params->pts); + if (dist < best) { + entry = p->queue.elem[i]; + best = dist; + continue; + } else { + break; + } + } + + if (!map_entry(p, entry)) + return PL_QUEUE_ERR; + + // Return a mix containing only this single frame + p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + PL_ARRAY_APPEND(p, p->tmp_sig, entry->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entry->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, 0.0); + *mix = (struct pl_frame_mix) { + .num_frames = 1, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + .timestamps = p->tmp_ts.elem, + .vsync_duration = 1.0, + }; + + PL_TRACE(p, "Showing single frame id %"PRIu64" with PTS %f for target PTS %f", + entry->signature, entry->pts, params->pts); + + report_estimates(p); + return PL_QUEUE_OK; +} + +// Present a single frame as appropriate for `pts` +static enum pl_queue_status nearest(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + enum pl_queue_status ret; + switch ((ret = advance(p, params->pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_OK: + case PL_QUEUE_MORE: + if (mix && point(p, mix, params) == PL_QUEUE_ERR) + return PL_QUEUE_ERR; + return ret; + } + + pl_unreachable(); +} + +// Special case of `interpolate` for radius = 0, in which case we need exactly +// the previous frame and the following frame +static enum pl_queue_status oversample(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + enum pl_queue_status ret; + switch ((ret = advance(p, params->pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_OK: + break; + case PL_QUEUE_MORE: + if (!p->queue.num) { + if (mix) + *mix = (struct pl_frame_mix) {0}; + return ret; + } + break; + } + + if (!mix) + return PL_QUEUE_OK; + + // Can't oversample with only a single frame, fall back to point sampling + if (p->queue.num < 2 || p->queue.elem[0]->pts > params->pts) { + if (point(p, mix, params) != PL_QUEUE_OK) + return PL_QUEUE_ERR; + return ret; + } + + struct entry *entries[2] = { p->queue.elem[0], p->queue.elem[1] }; + pl_assert(entries[0]->pts <= params->pts); + pl_assert(entries[1]->pts >= params->pts); + + // Returning a mix containing both of these two frames + p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + for (int i = 0; i < 2; i++) { + if (!map_entry(p, entries[i])) + return PL_QUEUE_ERR; + float ts = (entries[i]->pts - params->pts) / p->fps.estimate; + PL_ARRAY_APPEND(p, p->tmp_sig, entries[i]->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entries[i]->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, ts); + } + + *mix = (struct pl_frame_mix) { + .num_frames = 2, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + 
.timestamps = p->tmp_ts.elem, + .vsync_duration = p->vps.estimate / p->fps.estimate, + }; + + PL_TRACE(p, "Oversampling 2 frames for target PTS %f:", params->pts); + for (int i = 0; i < mix->num_frames; i++) + PL_TRACE(p, " id %"PRIu64" ts %f", mix->signatures[i], mix->timestamps[i]); + + report_estimates(p); + return ret; +} + +// Present a mixture of frames, relative to the vsync ratio +static enum pl_queue_status interpolate(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + // No FPS estimate available, possibly source contains only a single frame, + // or this is the first frame to be rendered. Fall back to point sampling. + if (!p->fps.estimate) + return nearest(p, mix, params); + + // Silently disable interpolation if the ratio dips lower than the + // configured threshold + float ratio = fabs(p->fps.estimate / p->vps.estimate - 1.0); + if (ratio < params->interpolation_threshold) { + if (!p->threshold_frames) { + PL_INFO(p, "Detected fps ratio %.4f below threshold %.4f, " + "disabling interpolation", + ratio, params->interpolation_threshold); + } + + p->threshold_frames = THRESHOLD_FRAMES + 1; + return nearest(p, mix, params); + } else if (ratio < THRESHOLD_MAX_RATIO && p->threshold_frames > 1) { + p->threshold_frames--; + return nearest(p, mix, params); + } else { + if (p->threshold_frames) { + PL_INFO(p, "Detected fps ratio %.4f exceeds threshold %.4f, " + "re-enabling interpolation", + ratio, params->interpolation_threshold); + } + p->threshold_frames = 0; + } + + // No radius information, special case in which we only need the previous + // and next frames. + if (!params->radius) + return oversample(p, mix, params); + + pl_assert(p->fps.estimate && p->vps.estimate); + float radius = params->radius * fmaxf(1.0f, p->vps.estimate / p->fps.estimate); + double min_pts = params->pts - radius * p->fps.estimate, + max_pts = params->pts + radius * p->fps.estimate; + + enum pl_queue_status ret; + switch ((ret = advance(p, min_pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_MORE: + goto done; + case PL_QUEUE_OK: + break; + } + + // Keep adding new frames until we've covered the range we care about + pl_assert(p->queue.num); + while (p->queue.elem[p->queue.num - 1]->pts < max_pts) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_ERR: + return ret; + case PL_QUEUE_MORE: + goto done; + case PL_QUEUE_EOF:; + // Don't forward EOF until we've held the last frame for the + // desired ZOH hold duration + double last_pts = p->queue.elem[p->queue.num - 1]->pts; + if (last_pts && params->pts >= last_pts + p->fps.estimate) + return ret; + ret = PL_QUEUE_OK; + goto done; + case PL_QUEUE_OK: + continue; + } + } + + if (!entry_complete(p->queue.elem[p->queue.num - 1])) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_MORE: + case PL_QUEUE_OK: + break; + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + } + } + +done: ; + + if (!mix) + return PL_QUEUE_OK; + + // Construct a mix object representing the current queue state, starting at + // the last frame before `min_pts` to make sure there's a fallback frame + // available for ZOH semantics. 
+ p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + for (int i = 0; i < p->queue.num; i++) { + struct entry *entry = p->queue.elem[i]; + if (entry->pts > max_pts) + break; + if (!map_entry(p, entry)) + return PL_QUEUE_ERR; + float ts = (entry->pts - params->pts) / p->fps.estimate; + PL_ARRAY_APPEND(p, p->tmp_sig, entry->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entry->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, ts); + } + + *mix = (struct pl_frame_mix) { + .num_frames = p->tmp_frame.num, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + .timestamps = p->tmp_ts.elem, + .vsync_duration = p->vps.estimate / p->fps.estimate, + }; + + PL_TRACE(p, "Showing mix of %d frames for target PTS %f:", + mix->num_frames, params->pts); + for (int i = 0; i < mix->num_frames; i++) + PL_TRACE(p, " id %"PRIu64" ts %f", mix->signatures[i], mix->timestamps[i]); + + report_estimates(p); + return ret; +} + +static bool prefill(pl_queue p, const struct pl_queue_params *params) +{ + int min_frames = 2 * ceilf(params->radius); + if (p->fps.estimate && p->vps.estimate && p->vps.estimate <= 1.0f / MIN_FPS) + min_frames *= ceilf(p->vps.estimate / p->fps.estimate); + min_frames = PL_MAX(min_frames, PREFETCH_FRAMES); + + while (p->queue.num < min_frames) { + switch (get_frame(p, params)) { + case PL_QUEUE_ERR: + return false; + case PL_QUEUE_EOF: + case PL_QUEUE_MORE: + return true; + case PL_QUEUE_OK: + continue; + } + } + + // In the most likely case, the first few frames will all be required. So + // force-map them all to initialize GPU state on initial rendering. This is + // better than the alternative of missing the cache later, when timing is + // more relevant. + for (int i = 0; i < min_frames; i++) { + if (!map_entry(p, p->queue.elem[i])) + return false; + } + + return true; +} + +enum pl_queue_status pl_queue_update(pl_queue p, struct pl_frame_mix *out_mix, + const struct pl_queue_params *params) +{ + pl_mutex_lock(&p->lock_strong); + pl_mutex_lock(&p->lock_weak); + default_estimate(&p->vps, params->vsync_duration); + + float delta = params->pts - p->prev_pts; + if (delta < 0.0f) { + + // This is a backwards PTS jump. This is something we can handle + // semi-gracefully, but only if we haven't culled past the current + // frame yet. + if (p->queue.num && p->queue.elem[0]->pts > params->pts) { + PL_ERR(p, "Requested PTS %f is lower than the oldest frame " + "PTS %f. This is not supported, PTS must be monotonically " + "increasing! Please use `pl_queue_reset` to reset the frame " + "queue on discontinuous PTS jumps.", + params->pts, p->queue.elem[0]->pts); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return PL_QUEUE_ERR; + } + + } else if (delta > 1.0f) { + + // A jump of more than a second is probably the result of a + // discontinuous jump after a suspend. To prevent this from exploding + // the FPS estimate, treat this as a new frame. 
+ PL_TRACE(p, "Discontinuous target PTS jump %f -> %f, ignoring...", + p->prev_pts, params->pts); + + } else if (delta > 0) { + + update_estimate(&p->vps, params->pts - p->prev_pts); + + } + + p->prev_pts = params->pts; + + // As a special case, prefill the queue if this is the first frame + if (!params->pts && !p->queue.num) { + if (!prefill(p, params)) { + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return PL_QUEUE_ERR; + } + } + + // Ignore unrealistically high or low FPS, common near start of playback + static const float max_vsync = 1.0 / MIN_FPS; + static const float min_vsync = 1.0 / MAX_FPS; + bool estimation_ok = p->vps.estimate > min_vsync && p->vps.estimate < max_vsync; + enum pl_queue_status ret; + + if (estimation_ok || params->vsync_duration > 0) { + // We know the vsync duration, so construct an interpolation mix + ret = interpolate(p, out_mix, params); + } else { + // We don't know the vsync duration (yet), so just point-sample + ret = nearest(p, out_mix, params); + } + + pl_cond_signal(&p->wakeup); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return ret; +} + +float pl_queue_estimate_fps(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + float estimate = p->fps.estimate; + pl_mutex_unlock(&p->lock_weak); + return estimate ? 1.0f / estimate : 0.0f; +} + +float pl_queue_estimate_vps(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + float estimate = p->vps.estimate; + pl_mutex_unlock(&p->lock_weak); + return estimate ? 1.0f / estimate : 0.0f; +} + +int pl_queue_num_frames(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + int count = p->queue.num; + pl_mutex_unlock(&p->lock_weak); + return count; +} + +bool pl_queue_peek(pl_queue p, int idx, struct pl_source_frame *out) +{ + pl_mutex_lock(&p->lock_weak); + bool ok = idx >= 0 && idx < p->queue.num; + if (ok) + *out = p->queue.elem[idx]->src; + pl_mutex_unlock(&p->lock_weak); + return ok; +} diff --git a/src/utils/upload.c b/src/utils/upload.c new file mode 100644 index 0000000..75bd4bb --- /dev/null +++ b/src/utils/upload.c @@ -0,0 +1,382 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "log.h" +#include "common.h" +#include "gpu.h" + +#include <libplacebo/utils/upload.h> + +#define MAX_COMPS 4 + +struct comp { + int order; // e.g. 
0, 1, 2, 3 for RGBA + int size; // size in bits + int shift; // bit-shift / offset in bits +}; + +static int compare_comp(const void *pa, const void *pb) +{ + const struct comp *a = pa, *b = pb; + + // Move all of the components with a size of 0 to the end, so they can + // be ignored outright + if (a->size && !b->size) + return -1; + if (b->size && !a->size) + return 1; + + // Otherwise, just compare based on the shift + return PL_CMP(a->shift, b->shift); +} + +void pl_plane_data_from_comps(struct pl_plane_data *data, int size[4], + int shift[4]) +{ + struct comp comps[MAX_COMPS]; + for (int i = 0; i < PL_ARRAY_SIZE(comps); i++) { + comps[i].order = i; + comps[i].size = size[i]; + comps[i].shift = shift[i]; + } + + // Sort the components by shift + qsort(comps, MAX_COMPS, sizeof(struct comp), compare_comp); + + // Generate the resulting component size/pad/map + int offset = 0; + for (int i = 0; i < MAX_COMPS; i++) { + if (comps[i].size) { + assert(comps[i].shift >= offset); + data->component_size[i] = comps[i].size; + data->component_pad[i] = comps[i].shift - offset; + data->component_map[i] = comps[i].order; + offset += data->component_size[i] + data->component_pad[i]; + } else { + // Clear the superfluous entries for sanity + data->component_size[i] = 0; + data->component_pad[i] = 0; + data->component_map[i] = 0; + } + } +} + +void pl_plane_data_from_mask(struct pl_plane_data *data, uint64_t mask[4]) +{ + int size[4]; + int shift[4]; + + for (int i = 0; i < PL_ARRAY_SIZE(size); i++) { + size[i] = __builtin_popcountll(mask[i]); + shift[i] = PL_MAX(0, __builtin_ffsll(mask[i]) - 1); + + // Sanity checking + uint64_t mask_reconstructed = (1LLU << size[i]) - 1; + mask_reconstructed <<= shift[i]; + pl_assert(mask_reconstructed == mask[i]); + } + + pl_plane_data_from_comps(data, size, shift); +} + +bool pl_plane_data_align(struct pl_plane_data *data, struct pl_bit_encoding *out_bits) +{ + struct pl_plane_data aligned = *data; + struct pl_bit_encoding bits = {0}; + + int offset = 0; + +#define SET_TEST(var, value) \ + do { \ + if (offset == 0) { \ + (var) = (value); \ + } else if ((var) != (value)) { \ + goto misaligned; \ + } \ + } while (0) + + for (int i = 0; i < MAX_COMPS; i++) { + if (!aligned.component_size[i]) + break; + + // Can't meaningfully align alpha channel, so just skip it. This is a + // limitation of the fact that `pl_bit_encoding` only applies to the + // main color channels, and changing this would be very nontrivial. + if (aligned.component_map[i] == PL_CHANNEL_A) + continue; + + // Color depth is the original component size, before alignment + SET_TEST(bits.color_depth, aligned.component_size[i]); + + // Try consuming padding of the current component to align down. This + // corresponds to an extra bit shift to the left. + int comp_start = offset + aligned.component_pad[i]; + int left_delta = comp_start - PL_ALIGN2(comp_start - 7, 8); + left_delta = PL_MIN(left_delta, aligned.component_pad[i]); + aligned.component_pad[i] -= left_delta; + aligned.component_size[i] += left_delta; + SET_TEST(bits.bit_shift, left_delta); + + // Try consuming padding of the next component to align up. This + // corresponds to simply ignoring some extra 0s on the end. 
+ int comp_end = comp_start + aligned.component_size[i] - left_delta; + int right_delta = PL_ALIGN2(comp_end, 8) - comp_end; + if (i+1 == MAX_COMPS || !aligned.component_size[i+1]) { + // This is the last component, so we can be greedy + aligned.component_size[i] += right_delta; + } else { + right_delta = PL_MIN(right_delta, aligned.component_pad[i+1]); + aligned.component_pad[i+1] -= right_delta; + aligned.component_size[i] += right_delta; + } + + // Sample depth is the new total component size, including padding + SET_TEST(bits.sample_depth, aligned.component_size[i]); + + offset += aligned.component_pad[i] + aligned.component_size[i]; + } + + // Easy sanity check, to make sure that we don't exceed the known stride + if (aligned.pixel_stride && offset > aligned.pixel_stride * 8) + goto misaligned; + + *data = aligned; + if (out_bits) + *out_bits = bits; + return true; + +misaligned: + // Can't properly align anything, so just do a no-op + if (out_bits) + *out_bits = (struct pl_bit_encoding) {0}; + return false; +} + +pl_fmt pl_plane_find_fmt(pl_gpu gpu, int out_map[4], const struct pl_plane_data *data) +{ + int dummy[4] = {0}; + out_map = PL_DEF(out_map, dummy); + + // Endian swapping requires compute shaders (currently) + if (data->swapped && !gpu->limits.max_ssbo_size) + return NULL; + + // Count the number of components and initialize out_map + int num = 0; + for (int i = 0; i < PL_ARRAY_SIZE(data->component_size); i++) { + out_map[i] = -1; + if (data->component_size[i]) + num = i+1; + } + + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (fmt->opaque || fmt->num_components < num) + continue; + if (fmt->type != data->type || fmt->texel_size != data->pixel_stride) + continue; + if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE)) + continue; + + int idx = 0; + + // Try mapping all pl_plane_data components to texture components + for (int i = 0; i < num; i++) { + // If there's padding we have to map it to an unused physical + // component first + int pad = data->component_pad[i]; + if (pad && (idx >= 4 || fmt->host_bits[idx++] != pad)) + goto next_fmt; + + // Otherwise, try and match this component + int size = data->component_size[i]; + if (size && (idx >= 4 || fmt->host_bits[idx] != size)) + goto next_fmt; + out_map[idx++] = data->component_map[i]; + } + + // Reject misaligned formats, check this last to only log such errors + // if this is the only thing preventing a format from being used, as + // this is likely an issue in the API usage. + if (data->row_stride % fmt->texel_align) { + PL_WARN(gpu, "Rejecting texture format '%s' due to misalignment: " + "Row stride %zu is not a clean multiple of texel size %zu! " + "This is likely an API usage bug.", + fmt->name, data->row_stride, fmt->texel_align); + continue; + } + + return fmt; + +next_fmt: ; // acts as `continue` + } + + return NULL; +} + +bool pl_upload_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data) +{ + pl_assert(!data->buf ^ !data->pixels); // exactly one + + int out_map[4]; + pl_fmt fmt = pl_plane_find_fmt(gpu, out_map, data); + if (!fmt) { + PL_ERR(gpu, "Failed picking any compatible texture format for a plane!"); + return false; + + // TODO: try soft-converting to a supported format using e.g zimg? 
+ } + + bool ok = pl_tex_recreate(gpu, tex, pl_tex_params( + .w = data->width, + .h = data->height, + .format = fmt, + .sampleable = true, + .host_writable = true, + .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE, + )); + + if (!ok) { + PL_ERR(gpu, "Failed initializing plane texture!"); + return false; + } + + if (out_plane) { + out_plane->texture = *tex; + out_plane->components = 0; + for (int i = 0; i < PL_ARRAY_SIZE(out_map); i++) { + out_plane->component_mapping[i] = out_map[i]; + if (out_map[i] >= 0) + out_plane->components = i+1; + } + } + + struct pl_tex_transfer_params params = { + .tex = *tex, + .rc.x1 = data->width, // set these for `pl_tex_transfer_size` + .rc.y1 = data->height, + .rc.z1 = 1, + .row_pitch = PL_DEF(data->row_stride, data->width * fmt->texel_size), + .ptr = (void *) data->pixels, + .buf = data->buf, + .buf_offset = data->buf_offset, + .callback = data->callback, + .priv = data->priv, + }; + + pl_buf swapbuf = NULL; + if (data->swapped) { + const size_t aligned = PL_ALIGN2(pl_tex_transfer_size(¶ms), 4); + swapbuf = pl_buf_create(gpu, pl_buf_params( + .size = aligned, + .storable = true, + .initial_data = params.ptr, + + // Note: This may over-read from `ptr` if `ptr` is not aligned to a + // word boundary, but the extra texels will be ignored by + // `pl_tex_upload` so this UB should be a non-issue in practice. + )); + if (!swapbuf) { + PL_ERR(gpu, "Failed creating endian swapping buffer!"); + return false; + } + + struct pl_buf_copy_swap_params swap_params = { + .src = swapbuf, + .dst = swapbuf, + .size = aligned, + .wordsize = fmt->texel_size / fmt->num_components, + }; + + bool can_reuse = params.buf && params.buf->params.storable && + params.buf_offset % 4 == 0 && + params.buf_offset + aligned <= params.buf->params.size; + + if (params.ptr) { + // Data is already uploaded (no-op), can swap in-place + } else if (can_reuse) { + // We can sample directly from the source buffer + swap_params.src = params.buf; + swap_params.src_offset = params.buf_offset; + } else { + // We sadly need to do a second memcpy + assert(params.buf); + PL_TRACE(gpu, "Double-slow path! 
pl_buf_copy -> pl_buf_copy_swap..."); + pl_buf_copy(gpu, swapbuf, 0, params.buf, params.buf_offset, + PL_MIN(aligned, params.buf->params.size - params.buf_offset)); + } + + if (!pl_buf_copy_swap(gpu, &swap_params)) { + PL_ERR(gpu, "Failed swapping endianness!"); + pl_buf_destroy(gpu, &swapbuf); + return false; + } + + params.ptr = NULL; + params.buf = swapbuf; + params.buf_offset = 0; + } + + ok = pl_tex_upload(gpu, ¶ms); + pl_buf_destroy(gpu, &swapbuf); + return ok; +} + +bool pl_recreate_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data) +{ + if (data->swapped) { + PL_ERR(gpu, "Cannot call pl_recreate_plane on non-native endian plane " + "data, this is only supported for `pl_upload_plane`!"); + return false; + } + + int out_map[4]; + pl_fmt fmt = pl_plane_find_fmt(gpu, out_map, data); + if (!fmt) { + PL_ERR(gpu, "Failed picking any compatible texture format for a plane!"); + return false; + } + + bool ok = pl_tex_recreate(gpu, tex, pl_tex_params( + .w = data->width, + .h = data->height, + .format = fmt, + .renderable = true, + .host_readable = fmt->caps & PL_FMT_CAP_HOST_READABLE, + .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE, + .storable = fmt->caps & PL_FMT_CAP_STORABLE, + )); + + if (!ok) { + PL_ERR(gpu, "Failed initializing plane texture!"); + return false; + } + + if (out_plane) { + out_plane->texture = *tex; + out_plane->components = 0; + for (int i = 0; i < PL_ARRAY_SIZE(out_map); i++) { + out_plane->component_mapping[i] = out_map[i]; + if (out_map[i] >= 0) + out_plane->components = i+1; + } + } + + return true; +} |
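/*
 * Illustrative sketch (editorial, not part of the patch above): uploading a
 * packed 8-bit RGBA image with the helpers from upload.c. The `my_*` names
 * and the caller-provided `pixels`, `w`, `h`, `stride` are assumptions;
 * everything else is the API added in this file.
 */
#include <stdint.h>
#include <libplacebo/utils/upload.h>

static bool my_upload_rgba8(pl_gpu gpu, struct pl_plane *out_plane,
                            pl_tex *tex, const uint8_t *pixels,
                            int w, int h, size_t stride)
{
    struct pl_plane_data data = {
        .type         = PL_FMT_UNORM,
        .width        = w,
        .height       = h,
        .pixel_stride = 4,      // 4 bytes per RGBA pixel
        .row_stride   = stride, // may include per-row padding
        .pixels       = pixels,
    };

    // Describe the bit layout as one mask per channel (R, G, B, A); this
    // fills in component_size/component_pad/component_map accordingly
    pl_plane_data_from_mask(&data, (uint64_t[4]) {
        0x000000ffull, 0x0000ff00ull, 0x00ff0000ull, 0xff000000ull,
    });

    // Picks a compatible texture format, (re)creates `*tex` and uploads the
    // pixels; `out_plane` receives the resulting component mapping
    return pl_upload_plane(gpu, out_plane, tex, &data);
}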