summaryrefslogtreecommitdiffstats
path: root/demos/video-filtering.c
diff options
context:
space:
mode:
Diffstat (limited to 'demos/video-filtering.c')
-rw-r--r--demos/video-filtering.c871
1 file changed, 871 insertions, 0 deletions
diff --git a/demos/video-filtering.c b/demos/video-filtering.c
new file mode 100644
index 0000000..5881c28
--- /dev/null
+++ b/demos/video-filtering.c
@@ -0,0 +1,871 @@
+/* Presented are two hypothetical scenarios of how one might use libplacebo
+ * as something like an FFmpeg or mpv video filter. We examine two example
+ * APIs (loosely modeled after real video filtering APIs) and how each style
+ * would like to use libplacebo.
+ *
+ * For sake of a simple example, let's assume this is a debanding filter.
+ * For those of you too lazy to compile/run this file but still want to see
+ * results, these are from my machine (RX 5700 XT + 1950X, as of 2020-05-25):
+ *
+ * RADV+ACO:
+ * api1: 10000 frames in 16.328440 s => 1.632844 ms/frame (612.43 fps)
+ * render: 0.113524 ms, upload: 0.127551 ms, download: 0.146097 ms
+ * api2: 10000 frames in 5.335634 s => 0.533563 ms/frame (1874.19 fps)
+ * render: 0.064378 ms, upload: 0.000000 ms, download: 0.189719 ms
+ *
+ * AMDVLK:
+ * api1: 10000 frames in 14.921859 s => 1.492186 ms/frame (670.16 fps)
+ * render: 0.110603 ms, upload: 0.114412 ms, download: 0.115375 ms
+ * api2: 10000 frames in 4.667386 s => 0.466739 ms/frame (2142.53 fps)
+ * render: 0.030781 ms, upload: 0.000000 ms, download: 0.075237 ms
+ *
+ * You can see that AMDVLK is still better at doing texture streaming than
+ * RADV - this is because as of writing RADV still does not support
+ * asynchronous texture queues / DMA engine transfers. If we disable the
+ * `async_transfer` option with AMDVLK we get this:
+ *
+ * api1: 10000 frames in 16.087723 s => 1.608772 ms/frame (621.59 fps)
+ * render: 0.111154 ms, upload: 0.122476 ms, download: 0.133162 ms
+ * api2: 10000 frames in 6.344959 s => 0.634496 ms/frame (1576.05 fps)
+ * render: 0.031307 ms, upload: 0.000000 ms, download: 0.083520 ms
+ *
+ * License: CC0 / Public Domain
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common.h"
+#include "pl_clock.h"
+#include "pl_thread.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#include <libplacebo/dispatch.h>
+#include <libplacebo/shaders/sampling.h>
+#include <libplacebo/utils/upload.h>
+#include <libplacebo/vulkan.h>
+
+///////////////////////
+/// API definitions ///
+///////////////////////
+
+// Stuff that would be common to each API
+
+void *init(void);
+void uninit(void *priv);
+
struct format {
    // For simplicity let's make a few assumptions here, since configuring the
    // texture format is not the point of this example. (In practice you can
    // go nuts with the `utils/upload.h` helpers)
    //
    // - All formats contain unsigned integers only
    // - All components have the same size in bits
    // - All components are in the "canonical" order
    // - All formats have power of two sizes only (2 or 4 components, not 3)
    // - All plane strides are a multiple of the pixel size
    int num_comps; // number of components per pixel (1, 2 or 4, per above)
    int bitdepth;  // bits per component
};
+
struct plane {
    int subx, suby;    // subsampling shift (plane dims = image dims >> shift)
    struct format fmt; // pixel format of this plane
    size_t stride;     // bytes between the start of successive rows
    void *data;        // pointer to the first pixel
};
+
+#define MAX_PLANES 4
+
struct image {
    int width, height; // full (plane 0) dimensions, in pixels
    int num_planes;    // number of valid entries in `planes`
    struct plane planes[MAX_PLANES];

    // For API #2, the associated mapped buffer (if any)
    struct api2_buf *associated_buf;
};
+
+
+// Example API design #1: synchronous, blocking, double-copy (bad!)
+//
+// In this API, `api1_filter` must immediately return with the new data.
+// This prevents parallelism on the GPU and should be avoided if possible,
+// but sometimes that's what you have to work with. So this is what it
+// would look like.
+//
+// Also, let's assume this API design reconfigures the filter chain (using
+// a blank `proxy` image) every time the image format or dimensions change,
+// and doesn't expect us to fail due to format mismatches or resource
+// exhaustion afterwards.
+
+bool api1_reconfig(void *priv, const struct image *proxy);
+bool api1_filter(void *priv, struct image *dst, struct image *src);
+
+
+// Example API design #2: asynchronous, streaming, queued, zero-copy (good!)
+//
+// In this API, `api2_process` will be run by the calling code every so often
+// (e.g. when new data is available or expected). This function has access
+// to non-blocking functions `get_image` and `put_image` that interface
+// with the video filtering engine's internal queueing system.
+//
+// This API is also designed to feed multiple frames ahead of time, i.e.
+// it will feed us as many frames as it can while we're still returning
+// `API2_WANT_MORE`. To drain the filter chain, it would continue running
+// the process function until `API2_HAVE_MORE` is no longer present
+// in the output.
+//
+// This API is also designed to do zero-copy where possible. When it wants
+// to create a data buffer of a given size, it will call our function
+// `api2_alloc` which will return a buffer that we can process directly.
+// We can use this to do zero-copy uploading to the GPU, by creating
+// host-visible persistently mapped buffers. In order to prevent the video
+// filtering system from re-using our buffers while copies are happening, we
+// use special functions `image_lock` and `image_unlock` to increase a
+// refcount on the image's backing storage. (As is typical of such APIs)
+//
+// Finally, this API is designed to be fully dynamic: The image parameters
+// could change at any time, and we must be equipped to handle that.
+
+enum api2_status {
+ // Negative values are used to signal error conditions
+ API2_ERR_FMT = -2, // incompatible / unsupported format
+ API2_ERR_UNKNOWN = -1, // some other error happened
+ API2_OK = 0, // no error, no status - everything's good
+
+ // Positive values represent a mask of status conditions
+ API2_WANT_MORE = (1 << 0), // we want more frames, please feed some more!
+ API2_HAVE_MORE = (1 << 1), // we have more frames but they're not ready
+};
+
+enum api2_status api2_process(void *priv);
+
+// Functions for creating persistently mapped buffers
+struct api2_buf {
+ void *data;
+ size_t size;
+ void *priv;
+};
+
+bool api2_alloc(void *priv, size_t size, struct api2_buf *out);
+void api2_free(void *priv, const struct api2_buf *buf);
+
+// These functions are provided by the API. The exact details of how images
+// are enqueued, dequeued and locked are not really important here, so just
+// do something unrealistic but simple to demonstrate with.
+struct image *get_image(void);
+void put_image(struct image *img);
+void image_lock(struct image *img);
+void image_unlock(struct image *img);
+
+
+/////////////////////////////////
+/// libplacebo implementation ///
+/////////////////////////////////
+
+
+// For API #2:
+#define PARALLELISM 8
+
// One in-flight unit of work for API #2. Each entry owns its own GPU
// resources, so the GPU has no cross-entry dependencies.
struct entry {
    pl_buf buf; // to stream the download
    pl_tex tex_in[MAX_PLANES];
    pl_tex tex_out[MAX_PLANES];
    struct image image; // the output image handed to `put_image`

    // For entries that are associated with a held image, so we can unlock them
    // as soon as possible
    struct image *held_image;
    pl_buf held_buf;
};
+
// Shared filter state, used by both example APIs
struct priv {
    pl_log log;
    pl_vulkan vk;
    pl_gpu gpu; // convenience alias for vk->gpu
    pl_dispatch dp;
    pl_shader_obj dither_state; // persistent dither LUT shared by all passes

    // Timer objects, plus accumulated results used to print averages
    pl_timer render_timer;
    pl_timer upload_timer;
    pl_timer download_timer;
    uint64_t render_sum;
    uint64_t upload_sum;
    uint64_t download_sum;
    int render_count;
    int upload_count;
    int download_count;

    // API #1: A simple pair of input and output textures
    pl_tex tex_in[MAX_PLANES];
    pl_tex tex_out[MAX_PLANES];

    // API #2: A ring buffer of textures/buffers for streaming
    int idx_in;  // points to the next free entry
    int idx_out; // points to the first entry still in progress
    struct entry entries[PARALLELISM];
};
+
+void *init(void) {
+ struct priv *p = calloc(1, sizeof(struct priv));
+ if (!p)
+ return NULL;
+
+ p->log = pl_log_create(PL_API_VER, pl_log_params(
+ .log_cb = pl_log_simple,
+ .log_level = PL_LOG_WARN,
+ ));
+
+ p->vk = pl_vulkan_create(p->log, pl_vulkan_params(
+ // Note: This is for API #2. In API #1 you could just pass params=NULL
+ // and it wouldn't really matter much.
+ .async_transfer = true,
+ .async_compute = true,
+ .queue_count = PARALLELISM,
+ ));
+
+ if (!p->vk) {
+ fprintf(stderr, "Failed creating vulkan context\n");
+ goto error;
+ }
+
+ // Give this a shorter name for convenience
+ p->gpu = p->vk->gpu;
+
+ p->dp = pl_dispatch_create(p->log, p->gpu);
+ if (!p->dp) {
+ fprintf(stderr, "Failed creating shader dispatch object\n");
+ goto error;
+ }
+
+ p->render_timer = pl_timer_create(p->gpu);
+ p->upload_timer = pl_timer_create(p->gpu);
+ p->download_timer = pl_timer_create(p->gpu);
+
+ return p;
+
+error:
+ uninit(p);
+ return NULL;
+}
+
+void uninit(void *priv)
+{
+ struct priv *p = priv;
+
+ // API #1
+ for (int i = 0; i < MAX_PLANES; i++) {
+ pl_tex_destroy(p->gpu, &p->tex_in[i]);
+ pl_tex_destroy(p->gpu, &p->tex_out[i]);
+ }
+
+ // API #2
+ for (int i = 0; i < PARALLELISM; i++) {
+ pl_buf_destroy(p->gpu, &p->entries[i].buf);
+ for (int j = 0; j < MAX_PLANES; j++) {
+ pl_tex_destroy(p->gpu, &p->entries[i].tex_in[j]);
+ pl_tex_destroy(p->gpu, &p->entries[i].tex_out[j]);
+ }
+ if (p->entries[i].held_image)
+ image_unlock(p->entries[i].held_image);
+ }
+
+ pl_timer_destroy(p->gpu, &p->render_timer);
+ pl_timer_destroy(p->gpu, &p->upload_timer);
+ pl_timer_destroy(p->gpu, &p->download_timer);
+
+ pl_shader_obj_destroy(&p->dither_state);
+ pl_dispatch_destroy(&p->dp);
+ pl_vulkan_destroy(&p->vk);
+ pl_log_destroy(&p->log);
+
+ free(p);
+}
+
+// Helper function to set up the `pl_plane_data` struct from the image params
+static void setup_plane_data(const struct image *img,
+ struct pl_plane_data out[MAX_PLANES])
+{
+ for (int i = 0; i < img->num_planes; i++) {
+ const struct plane *plane = &img->planes[i];
+
+ out[i] = (struct pl_plane_data) {
+ .type = PL_FMT_UNORM,
+ .width = img->width >> plane->subx,
+ .height = img->height >> plane->suby,
+ .pixel_stride = plane->fmt.num_comps * plane->fmt.bitdepth / 8,
+ .row_stride = plane->stride,
+ .pixels = plane->data,
+ };
+
+ // For API 2 (direct rendering)
+ if (img->associated_buf) {
+ pl_buf buf = img->associated_buf->priv;
+ out[i].pixels = NULL;
+ out[i].buf = buf;
+ out[i].buf_offset = (uintptr_t) plane->data - (uintptr_t) buf->data;
+ }
+
+ for (int c = 0; c < plane->fmt.num_comps; c++) {
+ out[i].component_size[c] = plane->fmt.bitdepth;
+ out[i].component_pad[c] = 0;
+ out[i].component_map[c] = c;
+ }
+ }
+}
+
+static bool do_plane(struct priv *p, pl_tex dst, pl_tex src)
+{
+ int new_depth = dst->params.format->component_depth[0];
+
+ // Do some debanding, and then also make sure to dither to the new depth
+ // so that our debanded gradients are actually preserved well
+ pl_shader sh = pl_dispatch_begin(p->dp);
+ pl_shader_deband(sh, pl_sample_src( .tex = src ), NULL);
+ pl_shader_dither(sh, new_depth, &p->dither_state, NULL);
+ return pl_dispatch_finish(p->dp, pl_dispatch_params(
+ .shader = &sh,
+ .target = dst,
+ .timer = p->render_timer,
+ ));
+}
+
+static void check_timers(struct priv *p)
+{
+ uint64_t ret;
+
+ while ((ret = pl_timer_query(p->gpu, p->render_timer))) {
+ p->render_sum += ret;
+ p->render_count++;
+ }
+
+ while ((ret = pl_timer_query(p->gpu, p->upload_timer))) {
+ p->upload_sum += ret;
+ p->upload_count++;
+ }
+
+ while ((ret = pl_timer_query(p->gpu, p->download_timer))) {
+ p->download_sum += ret;
+ p->download_count++;
+ }
+}
+
+// API #1 implementation:
+//
+// In this design, we will create all GPU resources inside `reconfig`, based on
+// the texture format configured from the proxy image. This will avoid failing
+// later on due to e.g. resource exhaustion or texture format mismatch, and
+// thereby falls within the intended semantics of this style of API.
+
+bool api1_reconfig(void *priv, const struct image *proxy)
+{
+ struct priv *p = priv;
+ struct pl_plane_data data[MAX_PLANES];
+ setup_plane_data(proxy, data);
+
+ for (int i = 0; i < proxy->num_planes; i++) {
+ pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
+ if (!fmt) {
+ fprintf(stderr, "Failed configuring filter: no good texture format!\n");
+ return false;
+ }
+
+ bool ok = true;
+ ok &= pl_tex_recreate(p->gpu, &p->tex_in[i], pl_tex_params(
+ .w = data[i].width,
+ .h = data[i].height,
+ .format = fmt,
+ .sampleable = true,
+ .host_writable = true,
+ ));
+
+ ok &= pl_tex_recreate(p->gpu, &p->tex_out[i], pl_tex_params(
+ .w = data[i].width,
+ .h = data[i].height,
+ .format = fmt,
+ .renderable = true,
+ .host_readable = true,
+ ));
+
+ if (!ok) {
+ fprintf(stderr, "Failed creating GPU textures!\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool api1_filter(void *priv, struct image *dst, struct image *src)
+{
+ struct priv *p = priv;
+ struct pl_plane_data data[MAX_PLANES];
+ setup_plane_data(src, data);
+
+ // Upload planes
+ for (int i = 0; i < src->num_planes; i++) {
+ bool ok = pl_tex_upload(p->gpu, pl_tex_transfer_params(
+ .tex = p->tex_in[i],
+ .row_pitch = data[i].row_stride,
+ .ptr = src->planes[i].data,
+ .timer = p->upload_timer,
+ ));
+
+ if (!ok) {
+ fprintf(stderr, "Failed uploading data to the GPU!\n");
+ return false;
+ }
+ }
+
+ // Process planes
+ for (int i = 0; i < src->num_planes; i++) {
+ if (!do_plane(p, p->tex_out[i], p->tex_in[i])) {
+ fprintf(stderr, "Failed processing planes!\n");
+ return false;
+ }
+ }
+
+ // Download planes
+ for (int i = 0; i < src->num_planes; i++) {
+ bool ok = pl_tex_download(p->gpu, pl_tex_transfer_params(
+ .tex = p->tex_out[i],
+ .row_pitch = dst->planes[i].stride,
+ .ptr = dst->planes[i].data,
+ .timer = p->download_timer,
+ ));
+
+ if (!ok) {
+ fprintf(stderr, "Failed downloading data from the GPU!\n");
+ return false;
+ }
+ }
+
+ check_timers(p);
+ return true;
+}
+
+
+// API #2 implementation:
+//
+// In this implementation we maintain a queue (implemented as ring buffer)
+// of "work entries", which are isolated structs that hold independent GPU
+// resources - so that the GPU has no cross-entry dependencies on any of the
+// textures or other resources. (Side note: It still has a dependency on the
+// dither state, but this is just a shared LUT anyway)
+
+// Align up to the nearest multiple of a power of two
+#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
+
// Submit one image's worth of work into a free ring buffer entry: upload
// the planes, dispatch the debanding shaders, and kick off an asynchronous
// download into a host-mapped buffer. Returns API2_OK, or a negative
// api2_status error code on failure.
static enum api2_status submit_work(struct priv *p, struct entry *e,
                                    struct image *img)
{
    // If the image comes from a mapped buffer, we have to take a lock
    // while our upload is in progress
    if (img->associated_buf) {
        assert(!e->held_image);
        image_lock(img);
        e->held_image = img;
        e->held_buf = img->associated_buf->priv;
    }

    // Upload this image's data
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(img, data);

    for (int i = 0; i < img->num_planes; i++) {
        pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
        if (!fmt)
            return API2_ERR_FMT;

        // FIXME: can we plumb a `pl_timer` in here somehow?
        if (!pl_upload_plane(p->gpu, NULL, &e->tex_in[i], &data[i]))
            return API2_ERR_UNKNOWN;

        // Re-create the target FBO as well with this format if necessary
        bool ok = pl_tex_recreate(p->gpu, &e->tex_out[i], pl_tex_params(
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .renderable = true,
            .host_readable = true,
        ));
        if (!ok)
            return API2_ERR_UNKNOWN;
    }

    // Dispatch the work for this image
    for (int i = 0; i < img->num_planes; i++) {
        if (!do_plane(p, e->tex_out[i], e->tex_in[i]))
            return API2_ERR_UNKNOWN;
    }

    // Set up the resulting `struct image` that will hold our target
    // data. We just copy the format etc. from the source image
    memcpy(&e->image, img, sizeof(struct image));

    // Compute the plane layout (offset + stride) inside the single shared
    // download buffer
    size_t offset[MAX_PLANES], stride[MAX_PLANES], total_size = 0;
    for (int i = 0; i < img->num_planes; i++) {
        // For performance, we want to make sure we align the stride
        // to a multiple of the GPU's preferred texture transfer stride
        // (This is entirely optional)
        stride[i] = ALIGN2(img->planes[i].stride,
                           p->gpu->limits.align_tex_xfer_pitch);
        int height = img->height >> img->planes[i].suby;

        // Round up the offset to the nearest multiple of the optimal
        // transfer alignment. (This is also entirely optional)
        offset[i] = ALIGN2(total_size, p->gpu->limits.align_tex_xfer_offset);
        total_size = offset[i] + stride[i] * height;
    }

    // Dispatch the asynchronous download into a mapped buffer
    bool ok = pl_buf_recreate(p->gpu, &e->buf, pl_buf_params(
        .size = total_size,
        .host_mapped = true,
    ));
    if (!ok)
        return API2_ERR_UNKNOWN;

    for (int i = 0; i < img->num_planes; i++) {
        ok = pl_tex_download(p->gpu, pl_tex_transfer_params(
            .tex = e->tex_out[i],
            .row_pitch = stride[i],
            .buf = e->buf,
            .buf_offset = offset[i],
            .timer = p->download_timer,
        ));
        if (!ok)
            return API2_ERR_UNKNOWN;

        // Update the output fields to point into the mapped buffer, where
        // the downloaded results will eventually appear
        e->image.planes[i].data = e->buf->data + offset[i];
        e->image.planes[i].stride = stride[i];
    }

    // Make sure this work starts processing in the background, and especially
    // so we can move on to the next queue on the GPU
    pl_gpu_flush(p->gpu);
    return API2_OK;
}
+
// Main entry point for API #2: runs one iteration of the event loop.
// Releases finished uploads, dequeues completed frames in FIFO order, and
// fills remaining ring buffer slots with new work from `get_image`.
enum api2_status api2_process(void *priv)
{
    struct priv *p = priv;
    enum api2_status ret = 0;

    // Opportunistically release any held images. We do this across the ring
    // buffer, rather than doing this as part of the following loop, because
    // we want to release images ahead-of-time (no FIFO constraints)
    for (int i = 0; i < PARALLELISM; i++) {
        struct entry *e = &p->entries[i];
        if (e->held_image && !pl_buf_poll(p->gpu, e->held_buf, 0)) {
            // upload buffer is no longer in use, release it
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }
    }

    // Poll the status of existing entries and dequeue the ones that are done
    while (p->idx_out != p->idx_in) {
        struct entry *e = &p->entries[p->idx_out];
        // pl_buf_poll returns true while the buffer is still in use; the
        // ring is FIFO, so stop at the first entry still in flight
        if (pl_buf_poll(p->gpu, e->buf, 0))
            break;

        if (e->held_image) {
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }

        // download buffer is no longer busy, dequeue the frame
        put_image(&e->image);
        p->idx_out = (p->idx_out + 1) % PARALLELISM;
    }

    // Fill up the queue with more work. One slot is always left unused so
    // that `idx_in == idx_out` unambiguously means "queue empty"
    int last_free_idx = (p->idx_out ? p->idx_out : PARALLELISM) - 1;
    while (p->idx_in != last_free_idx) {
        struct image *img = get_image();
        if (!img) {
            ret |= API2_WANT_MORE;
            break;
        }

        enum api2_status err = submit_work(p, &p->entries[p->idx_in], img);
        if (err < 0)
            return err;

        p->idx_in = (p->idx_in + 1) % PARALLELISM;
    }

    if (p->idx_out != p->idx_in)
        ret |= API2_HAVE_MORE;

    return ret;
}
+
+bool api2_alloc(void *priv, size_t size, struct api2_buf *out)
+{
+ struct priv *p = priv;
+ if (!p->gpu->limits.buf_transfer || size > p->gpu->limits.max_mapped_size)
+ return false;
+
+ pl_buf buf = pl_buf_create(p->gpu, pl_buf_params(
+ .size = size,
+ .host_mapped = true,
+ ));
+
+ if (!buf)
+ return false;
+
+ *out = (struct api2_buf) {
+ .data = buf->data,
+ .size = size,
+ .priv = (void *) buf,
+ };
+ return true;
+}
+
+void api2_free(void *priv, const struct api2_buf *buf)
+{
+ struct priv *p = priv;
+ pl_buf plbuf = buf->priv;
+ pl_buf_destroy(p->gpu, &plbuf);
+}
+
+
+////////////////////////////////////
+/// Proof of Concept / Benchmark ///
+////////////////////////////////////
+
+#define FRAMES 10000
+
+// Let's say we're processing a 1920x1080 4:2:0 8-bit NV12 video, arbitrarily
+// with a stride aligned to 256 bytes. (For no particular reason)
+#define TEXELSZ sizeof(uint8_t)
+#define WIDTH 1920
+#define HEIGHT 1080
+#define STRIDE (ALIGN2(WIDTH, 256) * TEXELSZ)
+// Subsampled planes
+#define SWIDTH (WIDTH >> 1)
+#define SHEIGHT (HEIGHT >> 1)
+#define SSTRIDE (ALIGN2(SWIDTH, 256) * TEXELSZ)
+// Plane offsets / sizes
+#define SIZE0 (HEIGHT * STRIDE)
+#define SIZE1 (2 * SHEIGHT * SSTRIDE)
+#define OFFSET0 0
+#define OFFSET1 SIZE0
+#define BUFSIZE (OFFSET1 + SIZE1)
+
// Skeleton of an example image (NV12: full-res luma plane followed by an
// interleaved, subsampled chroma plane; `data` pointers filled in later)
static const struct image example_image = {
    .width = WIDTH,
    .height = HEIGHT,
    .num_planes = 2,
    .planes = {
        {
            // Y (luma) plane, full resolution, one component
            .subx = 0,
            .suby = 0,
            .stride = STRIDE,
            .fmt = {
                .num_comps = 1,
                .bitdepth = 8 * TEXELSZ,
            },
        }, {
            // CbCr (chroma) plane, 2x2 subsampled, two interleaved components
            .subx = 1,
            .suby = 1,
            .stride = SSTRIDE * 2,
            .fmt = {
                .num_comps = 2,
                .bitdepth = 8 * TEXELSZ,
            },
        },
    },
};
+
+// API #1: Nice and simple (but slow)
+static void api1_example(void)
+{
+ struct priv *vf = init();
+ if (!vf)
+ return;
+
+ if (!api1_reconfig(vf, &example_image)) {
+ fprintf(stderr, "api1: Failed configuring video filter!\n");
+ return;
+ }
+
+ // Allocate two buffers to hold the example data, and fill the source
+ // buffer arbitrarily with a "simple" pattern. (Decoding the data into
+ // the buffer is not meant to be part of this benchmark)
+ uint8_t *srcbuf = malloc(BUFSIZE),
+ *dstbuf = malloc(BUFSIZE);
+ if (!srcbuf || !dstbuf)
+ goto done;
+
+ for (size_t i = 0; i < BUFSIZE; i++)
+ srcbuf[i] = i;
+
+ struct image src = example_image, dst = example_image;
+ src.planes[0].data = srcbuf + OFFSET0;
+ src.planes[1].data = srcbuf + OFFSET1;
+ dst.planes[0].data = dstbuf + OFFSET0;
+ dst.planes[1].data = dstbuf + OFFSET1;
+
+ const pl_clock_t start = pl_clock_now();
+
+ // Process this dummy frame a bunch of times
+ unsigned frames = 0;
+ for (frames = 0; frames < FRAMES; frames++) {
+ if (!api1_filter(vf, &dst, &src)) {
+ fprintf(stderr, "api1: Failed filtering frame... aborting\n");
+ break;
+ }
+ }
+
+ const pl_clock_t stop = pl_clock_now();
+ const float secs = pl_clock_diff(stop, start);
+
+ printf("api1: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
+ frames, secs, 1000 * secs / frames, frames / secs);
+
+ if (vf->render_count) {
+ printf(" render: %f ms, upload: %f ms, download: %f ms\n",
+ 1e-6 * vf->render_sum / vf->render_count,
+ vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
+ vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
+ }
+
+done:
+ free(srcbuf);
+ free(dstbuf);
+ uninit(vf);
+}
+
+
+// API #2: Pretend we have some fancy pool of images.
+#define POOLSIZE (PARALLELISM + 1)
+
+static struct api2_buf buffers[POOLSIZE] = {0};
+static struct image images[POOLSIZE] = {0};
+static int refcount[POOLSIZE] = {0};
+static unsigned api2_frames_in = 0;
+static unsigned api2_frames_out = 0;
+
+static void api2_example(void)
+{
+ struct priv *vf = init();
+ if (!vf)
+ return;
+
+ // Set up a bunch of dummy images
+ for (int i = 0; i < POOLSIZE; i++) {
+ uint8_t *data;
+ images[i] = example_image;
+ if (api2_alloc(vf, BUFSIZE, &buffers[i])) {
+ data = buffers[i].data;
+ images[i].associated_buf = &buffers[i];
+ } else {
+ // Fall back in case mapped buffers are unsupported
+ fprintf(stderr, "warning: falling back to malloc, may be slow\n");
+ data = malloc(BUFSIZE);
+ }
+ // Fill with some "data" (like in API #1)
+ for (size_t n = 0; n < BUFSIZE; n++)
+ data[i] = n;
+ images[i].planes[0].data = data + OFFSET0;
+ images[i].planes[1].data = data + OFFSET1;
+ }
+
+ const pl_clock_t start = pl_clock_now();
+
+ // Just keep driving the event loop regardless of the return status
+ // until we reach the critical number of frames. (Good enough for this PoC)
+ while (api2_frames_out < FRAMES) {
+ enum api2_status ret = api2_process(vf);
+ if (ret < 0) {
+ fprintf(stderr, "api2: Failed processing... aborting\n");
+ break;
+ }
+
+ // Sleep a short time (100us) to prevent busy waiting the CPU
+ pl_thread_sleep(1e-4);
+ check_timers(vf);
+ }
+
+ const pl_clock_t stop = pl_clock_now();
+ const float secs = pl_clock_diff(stop, start);
+ printf("api2: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
+ api2_frames_out, secs, 1000 * secs / api2_frames_out,
+ api2_frames_out / secs);
+
+ if (vf->render_count) {
+ printf(" render: %f ms, upload: %f ms, download: %f ms\n",
+ 1e-6 * vf->render_sum / vf->render_count,
+ vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
+ vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
+ }
+
+ for (int i = 0; i < POOLSIZE; i++) {
+ if (images[i].associated_buf) {
+ api2_free(vf, images[i].associated_buf);
+ } else {
+ // This is what we originally malloc'd
+ free(images[i].planes[0].data);
+ }
+ }
+
+ uninit(vf);
+}
+
+struct image *get_image(void)
+{
+ if (api2_frames_in == FRAMES)
+ return NULL; // simulate EOF, to avoid queueing up "extra" work
+
+ // if we can find a free (unlocked) image, give it that
+ for (int i = 0; i < POOLSIZE; i++) {
+ if (refcount[i] == 0) {
+ api2_frames_in++;
+ return &images[i];
+ }
+ }
+
+ return NULL; // no free image available
+}
+
// Called when a processed frame is complete. This PoC doesn't consume the
// output data; it just counts the frame towards the benchmark total.
void put_image(struct image *img)
{
    (void)img;
    api2_frames_out++;
}
+
// Increment a pooled image's refcount, preventing `get_image` from handing
// it out again until the matching `image_unlock`
void image_lock(struct image *img)
{
    int index = img - images; // cheat, for lack of having actual image management
    refcount[index]++;
}
+
// Drop a reference taken by `image_lock`, returning the image to the pool
// once the count reaches zero
void image_unlock(struct image *img)
{
    int index = img - images; // assumes `img` points into the static pool
    refcount[index]--;
}
+
// Entry point: run both benchmark scenarios back-to-back
int main(void)
{
    printf("Running benchmarks...\n");
    api1_example();
    api2_example();
    return 0;
}