1 files changed, 1741 insertions, 0 deletions
diff --git a/src/tests/gpu_tests.h b/src/tests/gpu_tests.h
new file mode 100644
index 0000000..f14f260
--- /dev/null
+++ b/src/tests/gpu_tests.h
@@ -0,0 +1,1741 @@
+#include "tests.h"
+#include "shaders.h"
+
+#include <libplacebo/renderer.h>
+#include <libplacebo/utils/frame_queue.h>
+#include <libplacebo/utils/upload.h>
+
+//#define PRINT_OUTPUT
+
+static void pl_buffer_tests(pl_gpu gpu)
+{
+    const size_t buf_size = 1024;
+    if (buf_size > gpu->limits.max_buf_size)
+        return;
+
+    uint8_t *test_src = malloc(buf_size * 2);
+    uint8_t *test_dst = test_src + buf_size;
+    assert(test_src && test_dst);
+    memset(test_dst, 0, buf_size);
+    for (int i = 0; i < buf_size; i++)
+        test_src[i] = RANDOM_U8;
+
+    pl_buf buf = NULL, tbuf = NULL;
+
+    printf("test buffer static creation and readback\n");
+    buf = pl_buf_create(gpu, pl_buf_params(
+        .size = buf_size,
+        .host_readable = true,
+        .initial_data = test_src,
+    ));
+
+    REQUIRE(buf);
+    REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size));
+    REQUIRE_MEMEQ(test_src, test_dst, buf_size);
+    pl_buf_destroy(gpu, &buf);
+
+    printf("test buffer empty creation, update and readback\n");
+    memset(test_dst, 0, buf_size);
+    buf = pl_buf_create(gpu, pl_buf_params(
+        .size = buf_size,
+        .host_writable = true,
+        .host_readable = true,
+    ));
+
+    REQUIRE(buf);
+    pl_buf_write(gpu, buf, 0, test_src, buf_size);
+    REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size));
+    REQUIRE_MEMEQ(test_src, test_dst, buf_size);
+    pl_buf_destroy(gpu, &buf);
+
+    printf("test buffer-buffer copy and readback\n");
+    memset(test_dst, 0, buf_size);
+    buf = pl_buf_create(gpu, pl_buf_params(
+        .size = buf_size,
+        .initial_data = test_src,
+    ));
+
+    tbuf = pl_buf_create(gpu, pl_buf_params(
+        .size = buf_size,
+        .host_readable = true,
+    ));
+
+    REQUIRE(buf && tbuf);
+    pl_buf_copy(gpu, tbuf, 0, buf, 0, buf_size);
+    REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
+    REQUIRE_MEMEQ(test_src, test_dst, buf_size);
+    pl_buf_destroy(gpu, &buf);
+    pl_buf_destroy(gpu, &tbuf);
+
+    if (buf_size <= gpu->limits.max_mapped_size) {
+        printf("test host mapped buffer readback\n");
+        buf = pl_buf_create(gpu, pl_buf_params(
+            .size = buf_size,
+            .host_mapped = true,
+            .initial_data = test_src,
+        ));
+
+        REQUIRE(buf);
+        REQUIRE(!pl_buf_poll(gpu, buf, 0));
+        REQUIRE_MEMEQ(test_src, buf->data, buf_size);
+        pl_buf_destroy(gpu, &buf);
+    }
+
+    // `compute_queues` check is to exclude dummy GPUs here
+    if (buf_size <= gpu->limits.max_ssbo_size && gpu->limits.compute_queues)
+    {
+        printf("test endian swapping\n");
+        buf = pl_buf_create(gpu, pl_buf_params(
+            .size = buf_size,
+            .storable = true,
+            .initial_data = test_src,
+        ));
+
+        tbuf = pl_buf_create(gpu, pl_buf_params(
+            .size = buf_size,
+            .storable = true,
+            .host_readable = true,
+        ));
+
+        REQUIRE(buf && tbuf);
+        REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) {
+            .src = buf,
+            .dst = tbuf,
+            .size = buf_size,
+            .wordsize = 2,
+        }));
+        REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
+        for (int i = 0; i < buf_size / 2; i++) {
+            REQUIRE_CMP(test_src[2 * i + 0], ==, test_dst[2 * i + 1], PRIu8);
+            REQUIRE_CMP(test_src[2 * i + 1], ==, test_dst[2 * i + 0], PRIu8);
+        }
+        // test endian swap in-place
+        REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) {
+            .src = tbuf,
+            .dst = tbuf,
+            .size = buf_size,
+            .wordsize = 4,
+        }));
+        REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
+        for (int i = 0; i < buf_size / 4; i++) {
+            REQUIRE_CMP(test_src[4 * i + 0], ==, test_dst[4 * i + 2], PRIu8);
+            REQUIRE_CMP(test_src[4 * i + 1], ==, test_dst[4 * i + 3], PRIu8);
+            REQUIRE_CMP(test_src[4 * i + 2], ==, test_dst[4 * i + 0], PRIu8);
+            REQUIRE_CMP(test_src[4 * i + 3], ==, test_dst[4 * i + 1], PRIu8);
+        }
+        pl_buf_destroy(gpu, &buf);
+        pl_buf_destroy(gpu, &tbuf);
+    }
+
+    free(test_src);
+}
+
+static void test_cb(void *priv)
+{
+    bool *flag = priv;
+    *flag = true;
+}
+
+static void pl_test_roundtrip(pl_gpu gpu, pl_tex tex[2],
+                              uint8_t *src, uint8_t *dst)
+{
+    if (!tex[0] || !tex[1]) {
+        printf("failed creating test textures... skipping this test\n");
+        return;
+    }
+
+    int texels = tex[0]->params.w;
+    texels *= tex[0]->params.h ? tex[0]->params.h : 1;
+    texels *= tex[0]->params.d ? tex[0]->params.d : 1;
+
+    pl_fmt fmt = tex[0]->params.format;
+    size_t bytes = texels * fmt->texel_size;
+    memset(src, 0, bytes);
+    memset(dst, 0, bytes);
+
+    for (size_t i = 0; i < bytes; i++)
+        src[i] = RANDOM_U8;
+
+    pl_timer ul, dl;
+    ul = pl_timer_create(gpu);
+    dl = pl_timer_create(gpu);
+
+    bool ran_ul = false, ran_dl = false;
+
+    REQUIRE(pl_tex_upload(gpu, &(struct pl_tex_transfer_params){
+        .tex = tex[0],
+        .ptr = src,
+        .timer = ul,
+        .callback = gpu->limits.callbacks ? test_cb : NULL,
+        .priv = &ran_ul,
+    }));
+
+    // Test blitting, if possible for this format
+    pl_tex dst_tex = tex[0];
+    if (tex[0]->params.blit_src && tex[1]->params.blit_dst) {
+        pl_tex_clear_ex(gpu, tex[1], (union pl_clear_color){0}); // for testing
+        pl_tex_blit(gpu, &(struct pl_tex_blit_params) {
+            .src = tex[0],
+            .dst = tex[1],
+        });
+        dst_tex = tex[1];
+    }
+
+    REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params){
+        .tex = dst_tex,
+        .ptr = dst,
+        .timer = dl,
+        .callback = gpu->limits.callbacks ? test_cb : NULL,
+        .priv = &ran_dl,
+    }));
+
+    pl_gpu_finish(gpu);
+    if (gpu->limits.callbacks)
+        REQUIRE(ran_ul && ran_dl);
+
+    if (fmt->emulated && fmt->type == PL_FMT_FLOAT) {
+        // TODO: can't memcmp here because bits might be lost due to the
+        // emulated 16/32 bit upload paths, figure out a better way to
+        // generate data and verify the roundtrip!
+    } else {
+        REQUIRE_MEMEQ(src, dst, bytes);
+    }
+
+    // Report timer results
+    printf("upload time: %"PRIu64", download time: %"PRIu64"\n",
+           pl_timer_query(gpu, ul), pl_timer_query(gpu, dl));
+
+    pl_timer_destroy(gpu, &ul);
+    pl_timer_destroy(gpu, &dl);
+}
+
+static void pl_texture_tests(pl_gpu gpu)
+{
+    const size_t max_size = 16*16*16 * 4 *sizeof(double);
+    uint8_t *test_src = malloc(max_size * 2);
+    uint8_t *test_dst = test_src + max_size;
+
+    for (int f = 0; f < gpu->num_formats; f++) {
+        pl_fmt fmt = gpu->formats[f];
+        if (fmt->opaque || !(fmt->caps & PL_FMT_CAP_HOST_READABLE))
+            continue;
+
+        printf("testing texture roundtrip for format %s\n", fmt->name);
+        assert(fmt->texel_size <= 4 * sizeof(double));
+
+        struct pl_tex_params ref_params = {
+            .format        = fmt,
+            .blit_src      = (fmt->caps & PL_FMT_CAP_BLITTABLE),
+            .blit_dst      = (fmt->caps & PL_FMT_CAP_BLITTABLE),
+            .host_writable = true,
+            .host_readable = true,
+            .debug_tag     = PL_DEBUG_TAG,
+        };
+
+        pl_tex tex[2];
+
+        if (gpu->limits.max_tex_1d_dim >= 16) {
+            printf("... 1D\n");
+            struct pl_tex_params params = ref_params;
+            params.w = 16;
+            if (!gpu->limits.blittable_1d_3d)
+                params.blit_src = params.blit_dst = false;
+            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+                tex[i] = pl_tex_create(gpu, &params);
+            pl_test_roundtrip(gpu, tex, test_src, test_dst);
+            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+                pl_tex_destroy(gpu, &tex[i]);
+        }
+
+        if (gpu->limits.max_tex_2d_dim >= 16) {
+            printf("... 2D\n");
+            struct pl_tex_params params = ref_params;
+            params.w = params.h = 16;
+            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+                tex[i] = pl_tex_create(gpu, &params);
+            pl_test_roundtrip(gpu, tex, test_src, test_dst);
+            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+                pl_tex_destroy(gpu, &tex[i]);
+        }
+
+        if (gpu->limits.max_tex_3d_dim >= 16) {
+            printf("... 3D\n");
+            struct pl_tex_params params = ref_params;
+            params.w = params.h = params.d = 16;
+            if (!gpu->limits.blittable_1d_3d)
+                params.blit_src = params.blit_dst = false;
+            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+                tex[i] = pl_tex_create(gpu, &params);
+            pl_test_roundtrip(gpu, tex, test_src, test_dst);
+            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+                pl_tex_destroy(gpu, &tex[i]);
+        }
+    }
+
+    free(test_src);
+}
+
+static void pl_planar_tests(pl_gpu gpu)
+{
+    pl_fmt fmt = pl_find_named_fmt(gpu, "g8_b8_r8_420");
+    if (!fmt)
+        return;
+    REQUIRE_CMP(fmt->num_planes, ==, 3, "d");
+
+    const int width = 64, height = 32;
+    pl_tex tex = pl_tex_create(gpu, pl_tex_params(
+        .w              = width,
+        .h              = height,
+        .format         = fmt,
+        .blit_dst       = true,
+        .host_readable  = true,
+    ));
+    if (!tex)
+        return;
+    for (int i = 0; i < fmt->num_planes; i++)
+        REQUIRE(tex->planes[i]);
+
+    pl_tex plane = tex->planes[1];
+    uint8_t data[(width * height) >> 2];
+    REQUIRE_CMP(plane->params.w * plane->params.h, ==, PL_ARRAY_SIZE(data), "d");
+
+    pl_tex_clear(gpu, plane, (float[]){ (float) 0x80 / 0xFF, 0.0, 0.0, 1.0 });
+    REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
+        .tex = plane,
+        .ptr = data,
+    )));
+
+    uint8_t ref[PL_ARRAY_SIZE(data)];
+    memset(ref, 0x80, sizeof(ref));
+    REQUIRE_MEMEQ(data, ref, PL_ARRAY_SIZE(data));
+
+    pl_tex_destroy(gpu, &tex);
+}
+
+static void pl_shader_tests(pl_gpu gpu)
+{
+    if (gpu->glsl.version < 410)
+        return;
+
+    const char *vert_shader =
+        "#version 410                               \n"
+        "layout(location=0) in vec2 vertex_pos;     \n"
+        "layout(location=1) in vec3 vertex_color;   \n"
+        "layout(location=0) out vec3 frag_color;    \n"
+        "void main() {                              \n"
+        "    gl_Position = vec4(vertex_pos, 0, 1);  \n"
+        "    frag_color = vertex_color;             \n"
+        "}";
+
+    const char *frag_shader =
+        "#version 410                               \n"
+        "layout(location=0) in vec3 frag_color;     \n"
+        "layout(location=0) out vec4 out_color;     \n"
+        "void main() {                              \n"
+        "    out_color = vec4(frag_color, 1.0);     \n"
+        "}";
+
+    pl_fmt fbo_fmt;
+    enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE |
+                            PL_FMT_CAP_LINEAR;
+
+    fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 4, 16, 32, caps);
+    if (!fbo_fmt)
+        return;
+
+#define FBO_W 16
+#define FBO_H 16
+
+    pl_tex fbo;
+    fbo = pl_tex_create(gpu, &(struct pl_tex_params) {
+        .format         = fbo_fmt,
+        .w              = FBO_W,
+        .h              = FBO_H,
+        .renderable     = true,
+        .storable       = !!(fbo_fmt->caps & PL_FMT_CAP_STORABLE),
+        .host_readable  = true,
+        .blit_dst       = true,
+    });
+    REQUIRE(fbo);
+
+    pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0});
+
+    pl_fmt vert_fmt;
+    vert_fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3);
+    REQUIRE(vert_fmt);
+
+    static const struct vertex { float pos[2]; float color[3]; } vertices[] = {
+        {{-1.0, -1.0}, {0, 0, 0}},
+        {{ 1.0, -1.0}, {1, 0, 0}},
+        {{-1.0,  1.0}, {0, 1, 0}},
+        {{ 1.0,  1.0}, {1, 1, 0}},
+    };
+
+    pl_pass pass;
+    pass = pl_pass_create(gpu, &(struct pl_pass_params) {
+        .type           = PL_PASS_RASTER,
+        .target_format  = fbo_fmt,
+        .vertex_shader  = vert_shader,
+        .glsl_shader    = frag_shader,
+
+        .vertex_type    = PL_PRIM_TRIANGLE_STRIP,
+        .vertex_stride  = sizeof(struct vertex),
+        .num_vertex_attribs = 2,
+        .vertex_attribs = (struct pl_vertex_attrib[]) {{
+            .name     = "vertex_pos",
+            .fmt      = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
+            .location = 0,
+            .offset   = offsetof(struct vertex, pos),
+        }, {
+            .name     = "vertex_color",
+            .fmt      = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3),
+            .location = 1,
+            .offset   = offsetof(struct vertex, color),
+        }},
+    });
+    REQUIRE(pass);
+    if (pass->params.cached_program || pass->params.cached_program_len) {
+        // Ensure both are set if either one is set
+        REQUIRE(pass->params.cached_program);
+        REQUIRE(pass->params.cached_program_len);
+    }
+
+    pl_timer timer = pl_timer_create(gpu);
+    pl_pass_run(gpu, &(struct pl_pass_run_params) {
+        .pass           = pass,
+        .target         = fbo,
+        .vertex_count   = PL_ARRAY_SIZE(vertices),
+        .vertex_data    = vertices,
+        .timer          = timer,
+    });
+
+    // Wait until this pass is complete and report the timer result
+    pl_gpu_finish(gpu);
+    printf("timer query result: %"PRIu64"\n", pl_timer_query(gpu, timer));
+    pl_timer_destroy(gpu, &timer);
+
+    static float test_data[FBO_H * FBO_W * 4] = {0};
+
+    // Test against the known pattern of `src`, only useful for roundtrip tests
+#define TEST_FBO_PATTERN(eps, fmt, ...)                                     \
+    do {                                                                    \
+        printf("testing pattern of " fmt "\n", __VA_ARGS__);                \
+        REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) {     \
+            .tex = fbo,                                                     \
+            .ptr = test_data,                                               \
+        }));                                                                \
+                                                                            \
+        for (int y = 0; y < FBO_H; y++) {                                   \
+            for (int x = 0; x < FBO_W; x++) {                               \
+                float *color = &test_data[(y * FBO_W + x) * 4];             \
+                REQUIRE_FEQ(color[0], (x + 0.5) / FBO_W, eps);              \
+                REQUIRE_FEQ(color[1], (y + 0.5) / FBO_H, eps);              \
+                REQUIRE_FEQ(color[2], 0.0, eps);                            \
+                REQUIRE_FEQ(color[3], 1.0, eps);                            \
+            }                                                               \
+        }                                                                   \
+    } while (0)
+
+    TEST_FBO_PATTERN(1e-6, "%s", "initial rendering");
+
+    if (sizeof(vertices) <= gpu->limits.max_vbo_size) {
+        // Test the use of an explicit vertex buffer
+        pl_buf vert = pl_buf_create(gpu, &(struct pl_buf_params) {
+            .size = sizeof(vertices),
+            .initial_data = vertices,
+            .drawable = true,
+        });
+
+        REQUIRE(vert);
+        pl_pass_run(gpu, &(struct pl_pass_run_params) {
+            .pass           = pass,
+            .target         = fbo,
+            .vertex_count   = sizeof(vertices) / sizeof(struct vertex),
+            .vertex_buf     = vert,
+            .buf_offset     = 0,
+        });
+
+        pl_buf_destroy(gpu, &vert);
+        TEST_FBO_PATTERN(1e-6, "%s", "using vertex buffer");
+    }
+
+    // Test the use of index buffers
+    static const uint16_t indices[] = { 3, 2, 1, 0 };
+    pl_pass_run(gpu, &(struct pl_pass_run_params) {
+        .pass           = pass,
+        .target         = fbo,
+        .vertex_count   = PL_ARRAY_SIZE(indices),
+        .vertex_data    = vertices,
+        .index_data     = indices,
+    });
+
+    pl_pass_destroy(gpu, &pass);
+    TEST_FBO_PATTERN(1e-6, "%s", "using indexed rendering");
+
+    // Test the use of pl_dispatch
+    pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
+    pl_shader sh = pl_dispatch_begin(dp);
+    REQUIRE(pl_shader_custom(sh, &(struct pl_custom_shader) {
+        .body       = "color = vec4(col, 1.0);",
+        .input      = PL_SHADER_SIG_NONE,
+        .output     = PL_SHADER_SIG_COLOR,
+    }));
+
+    REQUIRE(pl_dispatch_vertex(dp, &(struct pl_dispatch_vertex_params) {
+        .shader         = &sh,
+        .target         = fbo,
+        .vertex_stride  = sizeof(struct vertex),
+        .vertex_position_idx = 0,
+        .num_vertex_attribs = 2,
+        .vertex_attribs = (struct pl_vertex_attrib[]) {{
+            .name   = "pos",
+            .fmt    = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
+            .offset = offsetof(struct vertex, pos),
+        }, {
+            .name   = "col",
+            .fmt    = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3),
+            .offset = offsetof(struct vertex, color),
+        }},
+
+        .vertex_type    = PL_PRIM_TRIANGLE_STRIP,
+        .vertex_coords  = PL_COORDS_NORMALIZED,
+        .vertex_count   = PL_ARRAY_SIZE(vertices),
+        .vertex_data    = vertices,
+    }));
+
+    TEST_FBO_PATTERN(1e-6, "%s", "using custom vertices");
+
+    static float src_data[FBO_H * FBO_W * 4] = {0};
+    memcpy(src_data, test_data, sizeof(src_data));
+
+    pl_tex src;
+    src = pl_tex_create(gpu, &(struct pl_tex_params) {
+        .format         = fbo_fmt,
+        .w              = FBO_W,
+        .h              = FBO_H,
+        .storable       = fbo->params.storable,
+        .sampleable     = true,
+        .initial_data   = src_data,
+    });
+
+    if (fbo->params.storable) {
+        // Test 1x1 blit, to make sure the scaling code runs
+        REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) {
+            .src = src,
+            .dst = fbo,
+            .src_rc = {0, 0, 0, 1, 1, 1},
+            .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1},
+            .sample_mode = PL_TEX_SAMPLE_NEAREST,
+        }));
+
+        // Test non-resizing blit, which uses the efficient imageLoad path
+        REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) {
+            .src = src,
+            .dst = fbo,
+            .src_rc = {0, 0, 0, FBO_W, FBO_H, 1},
+            .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1},
+            .sample_mode = PL_TEX_SAMPLE_NEAREST,
+        }));
+
+        TEST_FBO_PATTERN(1e-6, "%s", "pl_tex_blit_compute");
+    }
+
+    // Test encoding/decoding of all gamma functions, color spaces, etc.
+    for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) {
+        struct pl_color_space test_csp = {
+            .transfer = trc,
+            .hdr.min_luma = PL_COLOR_HDR_BLACK,
+        };
+        sh = pl_dispatch_begin(dp);
+        pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));
+        pl_shader_delinearize(sh, &test_csp);
+        pl_shader_linearize(sh, &test_csp);
+        REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params(
+            .shader = &sh,
+            .target = fbo,
+        )));
+
+        float epsilon = pl_color_transfer_is_hdr(trc) ? 1e-4 : 1e-6;
+        TEST_FBO_PATTERN(epsilon, "transfer function %d", (int) trc);
+    }
+
+    for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) {
+        if (sys == PL_COLOR_SYSTEM_DOLBYVISION)
+            continue; // requires metadata
+        sh = pl_dispatch_begin(dp);
+        pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));
+        pl_shader_encode_color(sh, &(struct pl_color_repr) { .sys = sys });
+        pl_shader_decode_color(sh, &(struct pl_color_repr) { .sys = sys }, NULL);
+        REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+            .shader = &sh,
+            .target = fbo,
+        }));
+
+        float epsilon;
+        switch (sys) {
+        case PL_COLOR_SYSTEM_BT_2020_C:
+        case PL_COLOR_SYSTEM_XYZ:
+            epsilon = 1e-5;
+            break;
+
+        case PL_COLOR_SYSTEM_BT_2100_PQ:
+        case PL_COLOR_SYSTEM_BT_2100_HLG:
+            // These seem to be horrifically noisy and prone to breaking on
+            // edge cases for some reason
+            // TODO: figure out why!
+            continue;
+
+        default: epsilon = 1e-6; break;
+        }
+
+        TEST_FBO_PATTERN(epsilon, "color system %d", (int) sys);
+    }
+
+    // Repeat this a few times to test the caching
+    pl_cache cache = pl_cache_create(pl_cache_params( .log = gpu->log ));
+    pl_gpu_set_cache(gpu, cache);
+    for (int i = 0; i < 10; i++) {
+        if (i == 5) {
+            printf("Recreating pl_dispatch to test the caching\n");
+            size_t size = pl_dispatch_save(dp, NULL);
+            REQUIRE(size);
+            uint8_t *cache_data = malloc(size);
+            REQUIRE(cache_data);
+            REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu");
+
+            pl_dispatch_destroy(&dp);
+            dp = pl_dispatch_create(gpu->log, gpu);
+            pl_dispatch_load(dp, cache_data);
+
+            // Test to make sure the pass regenerates the same cache
+            uint64_t hash = pl_str_hash((pl_str) { cache_data, size });
+            REQUIRE_CMP(pl_dispatch_save(dp, NULL), ==, size, "zu");
+            REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu");
+            REQUIRE_CMP(pl_str_hash((pl_str) { cache_data, size }), ==, hash, PRIu64);
+            free(cache_data);
+        }
+
+        sh = pl_dispatch_begin(dp);
+
+        // For testing, force the use of CS if possible
+        if (gpu->glsl.compute) {
+            sh->type = SH_COMPUTE;
+            sh->group_size[0] = 8;
+            sh->group_size[1] = 8;
+        }
+
+        pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params(
+            .iterations     = 0,
+            .grain          = 0.0,
+        ));
+
+        REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+            .shader = &sh,
+            .target = fbo,
+        }));
+        TEST_FBO_PATTERN(1e-6, "deband iter %d", i);
+    }
+
+    pl_gpu_set_cache(gpu, NULL);
+    pl_cache_destroy(&cache);
+
+    // Test peak detection and readback if possible
+    sh = pl_dispatch_begin(dp);
+    pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));
+
+    pl_shader_obj peak_state = NULL;
+    struct pl_color_space csp_gamma22 = { .transfer = PL_COLOR_TRC_GAMMA22 };
+    struct pl_peak_detect_params peak_params = { .minimum_peak = 0.01 };
+    if (pl_shader_detect_peak(sh, csp_gamma22, &peak_state, &peak_params)) {
+        REQUIRE(pl_dispatch_compute(dp, &(struct pl_dispatch_compute_params) {
+            .shader = &sh,
+            .width = fbo->params.w,
+            .height = fbo->params.h,
+        }));
+
+        float peak, avg;
+        REQUIRE(pl_get_detected_peak(peak_state, &peak, &avg));
+
+        float real_peak = 0, real_avg = 0;
+        for (int y = 0; y < FBO_H; y++) {
+            for (int x = 0; x < FBO_W; x++) {
+                float *color = &src_data[(y * FBO_W + x) * 4];
+                float luma = 0.212639f * powf(color[0], 2.2f) +
+                             0.715169f * powf(color[1], 2.2f) +
+                             0.072192f * powf(color[2], 2.2f);
+                luma = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, luma);
+                real_peak = PL_MAX(real_peak, luma);
+                real_avg += luma;
+            }
+        }
+        real_avg = real_avg / (FBO_W * FBO_H);
+
+        real_avg  = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_avg);
+        real_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_peak);
+        REQUIRE_FEQ(peak, real_peak, 1e-3);
+        REQUIRE_FEQ(avg, real_avg, 1e-2);
+    }
+
+    pl_dispatch_abort(dp, &sh);
+    pl_shader_obj_destroy(&peak_state);
+
+    // Test film grain synthesis
+    pl_shader_obj grain = NULL;
+    struct pl_film_grain_params grain_params = {
+        .tex = src,
+        .components = 3,
+        .component_mapping = { 0, 1, 2},
+        .repr = &(struct pl_color_repr) {
+            .sys = PL_COLOR_SYSTEM_BT_709,
+            .levels = PL_COLOR_LEVELS_LIMITED,
+            .bits = { .color_depth = 10, .sample_depth = 10 },
+        },
+    };
+
+    for (int i = 0; i < 2; i++) {
+        grain_params.data.type = PL_FILM_GRAIN_AV1;
+        grain_params.data.params.av1 = av1_grain_data;
+        grain_params.data.params.av1.overlap = !!i;
+        grain_params.data.seed = rand();
+
+        sh = pl_dispatch_begin(dp);
+        pl_shader_film_grain(sh, &grain, &grain_params);
+        REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+            .shader = &sh,
+            .target = fbo,
+        }));
+    }
+
+    if (gpu->glsl.compute) {
+        grain_params.data.type = PL_FILM_GRAIN_H274;
+        grain_params.data.params.h274 = h274_grain_data;
+        grain_params.data.seed = rand();
+
+        sh = pl_dispatch_begin(dp);
+        pl_shader_film_grain(sh, &grain, &grain_params);
+        REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+            .shader = &sh,
+            .target = fbo,
+        }));
+    }
+    pl_shader_obj_destroy(&grain);
+
+    // Test custom shaders
+    struct pl_custom_shader custom = {
+        .header =
+            "vec3 invert(vec3 color)            \n"
+            "{                                  \n"
+            "    return vec3(1.0) - color;      \n"
+            "}                                  \n",
+
+        .body =
+            "color = vec4(gl_FragCoord.xy, 0.0, 1.0);   \n"
+            "color.rgb = invert(color.rgb) + offset;    \n",
+
+        .input = PL_SHADER_SIG_NONE,
+        .output = PL_SHADER_SIG_COLOR,
+
+        .num_variables = 1,
+        .variables = &(struct pl_shader_var) {
+            .var = pl_var_float("offset"),
+            .data = &(float) { 0.1 },
+        },
+    };
+
+    sh = pl_dispatch_begin(dp);
+    REQUIRE(pl_shader_custom(sh, &custom));
+    REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+        .shader = &sh,
+        .target = fbo,
+    }));
+
+    // Test dolbyvision
+    struct pl_color_repr repr = {
+        .sys = PL_COLOR_SYSTEM_DOLBYVISION,
+        .dovi = &dovi_meta,
+    };
+
+    sh = pl_dispatch_begin(dp);
+    pl_shader_sample_direct(sh, pl_sample_src( .tex = src ));
+    pl_shader_decode_color(sh, &repr, NULL);
+    REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+        .shader = &sh,
+        .target = fbo,
+    }));
+
+    // Test deinterlacing
+    sh = pl_dispatch_begin(dp);
+    pl_shader_deinterlace(sh, pl_deinterlace_source( .cur = pl_field_pair(src) ), NULL);
+    REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params(
+        .shader = &sh,
+        .target = fbo,
+    )));
+
+    // Test error diffusion
+    if (fbo->params.storable) {
+        for (int i = 0; i < pl_num_error_diffusion_kernels; i++) {
+            const struct pl_error_diffusion_kernel *k = pl_error_diffusion_kernels[i];
+            printf("testing error diffusion kernel '%s'\n", k->name);
+            sh = pl_dispatch_begin(dp);
+            bool ok = pl_shader_error_diffusion(sh, pl_error_diffusion_params(
+                .input_tex  = src,
+                .output_tex = fbo,
+                .new_depth  = 8,
+                .kernel     = k,
+            ));
+
+            if (!ok) {
+                fprintf(stderr, "kernel '%s' exceeds GPU limits, skipping...\n", k->name);
+                continue;
+            }
+
+            REQUIRE(pl_dispatch_compute(dp, pl_dispatch_compute_params(
+                .shader = &sh,
+                .dispatch_size = {1, 1, 1},
+            )));
+        }
+    }
+
+    pl_dispatch_destroy(&dp);
+    pl_tex_destroy(gpu, &src);
+    pl_tex_destroy(gpu, &fbo);
+}
+
+static void pl_scaler_tests(pl_gpu gpu)
+{
+    pl_fmt src_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_LINEAR);
+    pl_fmt fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_RENDERABLE);
+    if (!src_fmt || !fbo_fmt)
+        return;
+
+    float *fbo_data = NULL;
+    pl_shader_obj lut = NULL;
+
+    static float data_5x5[5][5] = {
+        { 0, 0, 0, 0, 0 },
+        { 0, 0, 0, 0, 0 },
+        { 0, 0, 1, 0, 0 },
+        { 0, 0, 0, 0, 0 },
+        { 0, 0, 0, 0, 0 },
+    };
+
+    pl_tex dot5x5 = pl_tex_create(gpu, &(struct pl_tex_params) {
+        .w              = 5,
+        .h              = 5,
+        .format         = src_fmt,
+        .sampleable     = true,
+        .initial_data   = &data_5x5[0][0],
+    });
+
+    struct pl_tex_params fbo_params = {
+        .w              = 100,
+        .h              = 100,
+        .format         = fbo_fmt,
+        .renderable     = true,
+        .storable       = fbo_fmt->caps & PL_FMT_CAP_STORABLE,
+        .host_readable  = fbo_fmt->caps & PL_FMT_CAP_HOST_READABLE,
+    };
+
+    pl_tex fbo = pl_tex_create(gpu, &fbo_params);
+    pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
+    if (!dot5x5 || !fbo || !dp)
+        goto error;
+
+    pl_shader sh = pl_dispatch_begin(dp);
+    REQUIRE(pl_shader_sample_polar(sh,
+        pl_sample_src(
+            .tex        = dot5x5,
+            .new_w      = fbo->params.w,
+            .new_h      = fbo->params.h,
+        ),
+        pl_sample_filter_params(
+            .filter     = pl_filter_ewa_lanczos,
+            .lut        = &lut,
+            .no_compute = !fbo->params.storable,
+        )
+    ));
+    REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+        .shader = &sh,
+        .target = fbo,
+    }));
+
+    if (fbo->params.host_readable) {
+        fbo_data = malloc(fbo->params.w * fbo->params.h * sizeof(float));
+        REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) {
+            .tex            = fbo,
+            .ptr            = fbo_data,
+        }));
+
+#ifdef PRINT_OUTPUT
+        int max = 255;
+        printf("P2\n%d %d\n%d\n", fbo->params.w, fbo->params.h, max);
+        for (int y = 0; y < fbo->params.h; y++) {
+            for (int x = 0; x < fbo->params.w; x++) {
+                float v = fbo_data[y * fbo->params.h + x];
+                printf("%d ", (int) round(fmin(fmax(v, 0.0), 1.0) * max));
+            }
+            printf("\n");
+        }
+#endif
+    }
+
+error:
+    free(fbo_data);
+    pl_shader_obj_destroy(&lut);
+    pl_dispatch_destroy(&dp);
+    pl_tex_destroy(gpu, &dot5x5);
+    pl_tex_destroy(gpu, &fbo);
+}
+
+static const char *user_shader_tests[] = {
+    // Test hooking, saving and loading
+    "// Example of a comment at the beginning                               \n"
+    "                                                                       \n"
+    "//!HOOK NATIVE                                                         \n"
+    "//!DESC upscale image                                                  \n"
+    "//!BIND HOOKED                                                         \n"
+    "//!WIDTH HOOKED.w 10 *                                                 \n"
+    "//!HEIGHT HOOKED.h 10 *                                                \n"
+    "//!SAVE NATIVEBIG                                                      \n"
+    "//!WHEN NATIVE.w 500 <                                                 \n"
+    "                                                                       \n"
+    "vec4 hook()                                                            \n"
+    "{                                                                      \n"
+    "    return HOOKED_texOff(0);                                           \n"
+    "}                                                                      \n"
+    "                                                                       \n"
+    "//!HOOK MAIN                                                           \n"
+    "//!DESC downscale bigger image                                         \n"
+    "//!WHEN NATIVE.w 500 <                                                 \n"
+    "//!BIND NATIVEBIG                                                      \n"
+    "                                                                       \n"
+    "vec4 hook()                                                            \n"
+    "{                                                                      \n"
+    "    return NATIVEBIG_texOff(0);                                        \n"
+    "}                                                                      \n",
+
+    // Test use of textures
+    "//!HOOK MAIN                                                           \n"
+    "//!DESC turn everything into colorful pixels                           \n"
+    "//!BIND HOOKED                                                         \n"
+    "//!BIND DISCO                                                          \n"
+    "//!COMPONENTS 3                                                        \n"
+    "                                                                       \n"
+    "vec4 hook()                                                            \n"
+    "{                                                                      \n"
+    "    return vec4(DISCO_tex(HOOKED_pos * 10.0).rgb, 1);                  \n"
+    "}                                                                      \n"
+    "                                                                       \n"
+    "//!TEXTURE DISCO                                                       \n"
+    "//!SIZE 3 3                                                            \n"
+    "//!FORMAT rgba8                                                        \n"
+    "//!FILTER NEAREST                                                      \n"
+    "//!BORDER REPEAT                                                       \n"
+    "ff0000ff00ff00ff0000ffff00ffffffff00ffffffff00ff4c4c4cff999999ffffffffff\n"
+
+    // Test custom parameters
+    "//!PARAM test                                                          \n"
+    "//!DESC test parameter                                                 \n"
+    "//!TYPE DYNAMIC float                                                  \n"
+    "//!MINIMUM 0.0                                                         \n"
+    "//!MAXIMUM 100.0                                                       \n"
+    "1.0                                                                    \n"
+    "                                                                       \n"
+    "//!PARAM testconst                                                     \n"
+    "//!TYPE CONSTANT uint                                                  \n"
+    "//!MAXIMUM 16                                                          \n"
+    "3                                                                      \n"
+    "                                                                       \n"
+    "//!PARAM testdefine                                                    \n"
+    "//!TYPE DEFINE                                                         \n"
+    "100                                                                    \n"
+    "                                                                       \n"
+    "//!PARAM testenum                                                      \n"
+    "//!TYPE ENUM DEFINE                                                    \n"
+    "FOO                                                                    \n"
+    "BAR                                                                    \n"
+    "                                                                       \n"
+    "//!HOOK MAIN                                                           \n"
+    "//!WHEN testconst 30 >                                                 \n"
+    "#error should not be run                                               \n"
+    "                                                                       \n"
+    "//!HOOK MAIN                                                           \n"
+    "//!WHEN testenum FOO =                                                 \n"
+    "#if testenum == BAR                                                    \n"
+    " #error bad                                                            \n"
+    "#endif                                                                 \n"
+    "vec4 hook() { return vec4(0.0); }                                      \n"
+};
+
+static const char *compute_shader_tests[] = {
+    // Test use of storage/buffer resources
+    "//!HOOK MAIN                                                           \n"
+    "//!DESC attach some storage objects                                    \n"
+    "//!BIND tex_storage                                                    \n"
+    "//!BIND buf_uniform                                                    \n"
+    "//!BIND buf_storage                                                    \n"
+    "//!COMPONENTS 4                                                        \n"
+    "                                                                       \n"
+    "vec4 hook()                                                            \n"
+    "{                                                                      \n"
+    "    return vec4(foo, bar, bat);                                        \n"
+    "}                                                                      \n"
+    "                                                                       \n"
+    "//!TEXTURE tex_storage                                                 \n"
+    "//!SIZE 100 100                                                        \n"
+    "//!FORMAT r32f                                                         \n"
+    "//!STORAGE                                                             \n"
+    "                                                                       \n"
+    "//!BUFFER buf_uniform                                                  \n"
+    "//!VAR float foo                                                       \n"
+    "//!VAR float bar                                                       \n"
+    "0000000000000000                                                       \n"
+    "                                                                       \n"
+    "//!BUFFER buf_storage                                                  \n"
+    "//!VAR vec2 bat                                                        \n"
+    "//!VAR int big[32];                                                    \n"
+    "//!STORAGE                                                             \n",
+
+};
+
+static const char *test_luts[] = {
+
+    "TITLE \"1D identity\"  \n"
+    "LUT_1D_SIZE 2          \n"
+    "0.0 0.0 0.0            \n"
+    "1.0 1.0 1.0            \n",
+
+    "TITLE \"3D identity\"  \n"
+    "LUT_3D_SIZE 2          \n"
+    "0.0 0.0 0.0            \n"
+    "1.0 0.0 0.0            \n"
+    "0.0 1.0 0.0            \n"
+    "1.0 1.0 0.0            \n"
+    "0.0 0.0 1.0            \n"
+    "1.0 0.0 1.0            \n"
+    "0.0 1.0 1.0            \n"
+    "1.0 1.0 1.0            \n"
+
+};
+
+static bool frame_passthrough(pl_gpu gpu, pl_tex *tex,
+                              const struct pl_source_frame *src, struct pl_frame *out_frame)
+{
+    const struct pl_frame *frame = src->frame_data;
+    *out_frame = *frame;
+    return true;
+}
+
+static enum pl_queue_status get_frame_ptr(struct pl_source_frame *out_frame,
+                                          const struct pl_queue_params *qparams)
+{
+    const struct pl_source_frame **pframe = qparams->priv;
+    if (!(*pframe)->frame_data)
+        return PL_QUEUE_EOF;
+
+    *out_frame = *(*pframe)++;
+    return PL_QUEUE_OK;
+}
+
+static void render_info_cb(void *priv, const struct pl_render_info *info)
+{
+    printf("{%d} Executed shader: %s\n", info->index,
+           info->pass->shader->description);
+}
+
+static void pl_render_tests(pl_gpu gpu)
+{
+    pl_tex img_tex = NULL, fbo = NULL;
+    pl_renderer rr = NULL;
+
+    enum { width = 50, height = 50 };
+    static float data[width][height];
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++)
+            data[y][x] = RANDOM;
+    }
+
+    struct pl_plane img_plane = {0};
+    struct pl_plane_data plane_data = {
+        .type = PL_FMT_FLOAT,
+        .width = width,
+        .height = height,
+        .component_size = { 8 * sizeof(float) },
+        .component_map  = { 0 },
+        .pixel_stride = sizeof(float),
+        .pixels = data,
+    };
+
+    if (!pl_recreate_plane(gpu, NULL, &fbo, &plane_data))
+        return;
+
+    if (!pl_upload_plane(gpu, &img_plane, &img_tex, &plane_data))
+        goto error;
+
+    rr = pl_renderer_create(gpu->log, gpu);
+    pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0});
+
+    struct pl_frame image = {
+        .num_planes     = 1,
+        .planes         = { img_plane },
+        .repr = {
+            .sys        = PL_COLOR_SYSTEM_BT_709,
+            .levels     = PL_COLOR_LEVELS_FULL,
+        },
+        .color          = pl_color_space_srgb,
+    };
+
+    struct pl_frame target = {
+        .num_planes     = 1,
+        .planes         = {{
+            .texture            = fbo,
+            .components         = 3,
+            .component_mapping  = {0, 1, 2},
+        }},
+        .repr = {
+            .sys        = PL_COLOR_SYSTEM_RGB,
+            .levels     = PL_COLOR_LEVELS_FULL,
+            .bits.color_depth = 32,
+        },
+        .color          = pl_color_space_srgb,
+    };
+
+    REQUIRE(pl_render_image(rr, &image, &target, NULL));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+    // TODO: embed a reference texture and ensure it matches
+
+    // Test a bunch of different params
+#define TEST(SNAME, STYPE, DEFAULT, FIELD, LIMIT)                       \
+    do {                                                                \
+        for (int i = 0; i <= LIMIT; i++) {                              \
+            printf("testing `" #STYPE "." #FIELD " = %d`\n", i);        \
+            struct pl_render_params params = pl_render_default_params;  \
+            params.force_dither = true;                                 \
+            struct STYPE tmp = DEFAULT;                                 \
+            tmp.FIELD = i;                                              \
+            params.SNAME = &tmp;                                        \
+            REQUIRE(pl_render_image(rr, &image, &target, &params));     \
+            pl_gpu_flush(gpu);                                          \
+            REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); \
+        }                                                               \
+    } while (0)
+
+#define TEST_PARAMS(NAME, FIELD, LIMIT) \
+    TEST(NAME##_params, pl_##NAME##_params, pl_##NAME##_default_params, FIELD, LIMIT)
+
+    image.crop.x1 = width / 2.0;
+    image.crop.y1 = height / 2.0;
+    for (int i = 0; i < pl_num_scale_filters; i++) {
+        struct pl_render_params params = pl_render_default_params;
+        params.upscaler = pl_scale_filters[i].filter;
+        printf("testing `params.upscaler = /* %s */`\n", pl_scale_filters[i].name);
+        REQUIRE(pl_render_image(rr, &image, &target, &params));
+        pl_gpu_flush(gpu);
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    }
+    image.crop.x1 = image.crop.y1 = 0;
+
+    target.crop.x1 = width / 2.0;
+    target.crop.y1 = height / 2.0;
+    for (int i = 0; i < pl_num_scale_filters; i++) {
+        struct pl_render_params params = pl_render_default_params;
+        params.downscaler = pl_scale_filters[i].filter;
+        printf("testing `params.downscaler = /* %s */`\n", pl_scale_filters[i].name);
+        REQUIRE(pl_render_image(rr, &image, &target, &params));
+        pl_gpu_flush(gpu);
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    }
+    target.crop.x1 = target.crop.y1 = 0;
+
+    TEST_PARAMS(deband, iterations, 3);
+    TEST_PARAMS(sigmoid, center, 1);
+    TEST_PARAMS(color_map, intent, PL_INTENT_ABSOLUTE_COLORIMETRIC);
+    TEST_PARAMS(dither, method, PL_DITHER_WHITE_NOISE);
+    TEST_PARAMS(dither, temporal, true);
+    TEST_PARAMS(distort, alpha_mode, PL_ALPHA_INDEPENDENT);
+    TEST_PARAMS(distort, constrain, true);
+    TEST_PARAMS(distort, bicubic, true);
+    TEST(cone_params, pl_cone_params, pl_vision_deuteranomaly, strength, 0);
+
+    // Test gamma-correct dithering
+    target.repr.bits.color_depth = 2;
+    TEST_PARAMS(dither, transfer, PL_COLOR_TRC_GAMMA22);
+    target.repr.bits.color_depth = 32;
+
+    // Test HDR tone mapping
+    image.color = pl_color_space_hdr10;
+    TEST_PARAMS(color_map, visualize_lut, true);
+    if (gpu->limits.max_ssbo_size)
+        TEST_PARAMS(peak_detect, allow_delayed, true);
+
+    // Test inverse tone-mapping and pure BPC
+    image.color.hdr.max_luma = 1000;
+    target.color.hdr.max_luma = 4000;
+    target.color.hdr.min_luma = 0.02;
+    TEST_PARAMS(color_map, inverse_tone_mapping, true);
+
+    image.color = pl_color_space_srgb;
+    target.color = pl_color_space_srgb;
+
+    // Test some misc stuff
+    struct pl_render_params params = pl_render_default_params;
+    params.color_adjustment = &(struct pl_color_adjustment) {
+        .brightness = 0.1,
+        .contrast = 0.9,
+        .saturation = 1.5,
+        .gamma = 0.8,
+        .temperature = 0.3,
+    };
+    REQUIRE(pl_render_image(rr, &image, &target, &params));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    params = pl_render_default_params;
+
+    struct pl_frame inferred_image = image, inferred_target = target;
+    pl_frames_infer(rr, &inferred_image, &inferred_target);
+    REQUIRE(pl_render_image(rr, &inferred_image, &inferred_target, &params));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+    // Test background blending and alpha transparency
+    params.blend_against_tiles = true;
+    params.corner_rounding = 0.25f;
+    REQUIRE(pl_render_image(rr, &image, &target, &params));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    params = pl_render_default_params;
+
+    // Test film grain synthesis
+    image.film_grain.type = PL_FILM_GRAIN_AV1;
+    image.film_grain.params.av1 = av1_grain_data;
+    REQUIRE(pl_render_image(rr, &image, &target, &params));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+    image.film_grain.type = PL_FILM_GRAIN_H274;
+    image.film_grain.params.h274 = h274_grain_data;
+    REQUIRE(pl_render_image(rr, &image, &target, &params));
+    // H.274 film grain synthesis requires compute shaders
+    if (gpu->glsl.compute) {
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    } else {
+        const struct pl_render_errors rr_err = pl_renderer_get_errors(rr);
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_FILM_GRAIN);
+        pl_renderer_reset_errors(rr, &rr_err);
+    }
+    image.film_grain = (struct pl_film_grain_data) {0};
+
+    // Test mpv-style custom shaders
+    for (int i = 0; i < PL_ARRAY_SIZE(user_shader_tests); i++) {
+        printf("testing user shader:\n\n%s\n", user_shader_tests[i]);
+        const struct pl_hook *hook;
+        hook = pl_mpv_user_shader_parse(gpu, user_shader_tests[i],
+                                        strlen(user_shader_tests[i]));
+        REQUIRE(hook);
+
+        params.hooks = &hook;
+        params.num_hooks = 1;
+        REQUIRE(pl_render_image(rr, &image, &target, &params));
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+        pl_mpv_user_shader_destroy(&hook);
+    }
+
+    if (gpu->glsl.compute && gpu->limits.max_ssbo_size) {
+        for (int i = 0; i < PL_ARRAY_SIZE(compute_shader_tests); i++) {
+            printf("testing user shader:\n\n%s\n", compute_shader_tests[i]);
+            const struct pl_hook *hook;
+            hook = pl_mpv_user_shader_parse(gpu, compute_shader_tests[i],
+                                            strlen(compute_shader_tests[i]));
+            REQUIRE(hook);
+
+            params.hooks = &hook;
+            params.num_hooks = 1;
+            REQUIRE(pl_render_image(rr, &image, &target, &params));
+            REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+            pl_mpv_user_shader_destroy(&hook);
+        }
+    }
+    params = pl_render_default_params;
+
+    // Test custom LUTs
+    for (int i = 0; i < PL_ARRAY_SIZE(test_luts); i++) {
+        printf("testing custom lut %d\n", i);
+        struct pl_custom_lut *lut;
+        lut = pl_lut_parse_cube(gpu->log, test_luts[i], strlen(test_luts[i]));
+        REQUIRE(lut);
+
+        bool has_3dlut = gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100;
+        if (lut->size[2] && !has_3dlut) {
+            pl_lut_free(&lut);
+            continue;
+        }
+
+        // Test all three at the same time to reduce the number of tests
+        image.lut = target.lut = params.lut = lut;
+
+        for (enum pl_lut_type t = PL_LUT_UNKNOWN; t <= PL_LUT_CONVERSION; t++) {
+            printf("testing LUT method %d\n", t);
+            image.lut_type = target.lut_type = params.lut_type = t;
+            REQUIRE(pl_render_image(rr, &image, &target, &params));
+            REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+        }
+
+        image.lut = target.lut = params.lut = NULL;
+        pl_lut_free(&lut);
+    }
+
+#ifdef PL_HAVE_LCMS
+
+    // It doesn't fit without use of 3D textures on GLES2
+    if (gpu->glsl.version > 100) {
+        // Test ICC profiles
+        image.profile = TEST_PROFILE(sRGB_v2_nano_icc);
+        REQUIRE(pl_render_image(rr, &image, &target, &params));
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+        image.profile = (struct pl_icc_profile) {0};
+
+        target.profile = TEST_PROFILE(sRGB_v2_nano_icc);
+        REQUIRE(pl_render_image(rr, &image, &target, &params));
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+        target.profile = (struct pl_icc_profile) {0};
+
+        image.profile = TEST_PROFILE(sRGB_v2_nano_icc);
+        target.profile = image.profile;
+        REQUIRE(pl_render_image(rr, &image, &target, &params));
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+        image.profile = (struct pl_icc_profile) {0};
+        target.profile = (struct pl_icc_profile) {0};
+    }
+
+#endif
+
+    // Test overlays
+    image.num_overlays = 1;
+    image.overlays = &(struct pl_overlay) {
+        .tex = img_plane.texture,
+        .mode = PL_OVERLAY_NORMAL,
+        .num_parts = 2,
+        .parts = (struct pl_overlay_part[]) {{
+            .src = {0, 0, 2, 2},
+            .dst = {30, 100, 40, 200},
+        }, {
+            .src = {2, 2, 5, 5},
+            .dst = {1000, -1, 3, 5},
+        }},
+    };
+    REQUIRE(pl_render_image(rr, &image, &target, &params));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    params.disable_fbos = true;
+    REQUIRE(pl_render_image(rr, &image, &target, &params));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    image.num_overlays = 0;
+    params = pl_render_default_params;
+
+    target.num_overlays = 1;
+    target.overlays = &(struct pl_overlay) {
+        .tex = img_plane.texture,
+        .mode = PL_OVERLAY_MONOCHROME,
+        .num_parts = 1,
+        .parts = &(struct pl_overlay_part) {
+            .src = {5, 5, 15, 15},
+            .dst = {5, 5, 15, 15},
+            .color = {1.0, 0.5, 0.0},
+        },
+    };
+    REQUIRE(pl_render_image(rr, &image, &target, &params));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    REQUIRE(pl_render_image(rr, NULL, &target, &params));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    target.num_overlays = 0;
+
+    // Test rotation
+    for (pl_rotation rot = 0; rot < PL_ROTATION_360; rot += PL_ROTATION_90) {
+        image.rotation = rot;
+        REQUIRE(pl_render_image(rr, &image, &target, &params));
+        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+    }
+
+    // Attempt frame mixing, using the mixer queue helper
+    printf("testing frame mixing \n");
+    struct pl_render_params mix_params = {
+        .frame_mixer = &pl_filter_mitchell_clamp,
+        .info_callback = render_info_cb,
+    };
+
+    struct pl_queue_params qparams = {
+        .radius = pl_frame_mix_radius(&mix_params),
+        .vsync_duration = 1.0 / 60.0,
+    };
+
+    // Test large PTS jumps in frame mix
+    struct pl_frame_mix mix = (struct pl_frame_mix) {
+        .num_frames = 2,
+        .frames = (const struct pl_frame *[]) { &image, &image },
+        .signatures = (uint64_t[]) { 0xFFF1, 0xFFF2 },
+        .timestamps = (float[]) { -100, 100 },
+        .vsync_duration = 1.6,
+    };
+    REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+    // Test inferring frame mix
+    inferred_target = target;
+    pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image);
+    REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+    // Test empty frame mix
+    mix = (struct pl_frame_mix) {0};
+    REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+    // Test inferring empty frame mix
+    inferred_target = target;
+    pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image);
+    REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+    // Test mixer queue
+#define NUM_MIX_FRAMES 20
+    const float frame_duration = 1.0 / 24.0;
+    struct pl_source_frame srcframes[NUM_MIX_FRAMES+1];
+    srcframes[NUM_MIX_FRAMES] = (struct pl_source_frame) {0};
+    for (int i = 0; i < NUM_MIX_FRAMES; i++) {
+        srcframes[i] = (struct pl_source_frame) {
+            .pts = i * frame_duration,
+            .duration = frame_duration,
+            .map = frame_passthrough,
+            .frame_data = &image,
+        };
+    }
+
+    pl_queue queue = pl_queue_create(gpu);
+    enum pl_queue_status ret;
+
+    // Test pre-pushing all frames, with delayed EOF.
+    for (int i = 0; i < NUM_MIX_FRAMES; i++) {
+        const struct pl_source_frame *src = &srcframes[i];
+        if (i > 10) // test pushing in reverse order
+            src = &srcframes[NUM_MIX_FRAMES + 10 - i];
+        if (!pl_queue_push_block(queue, 1, src)) // mini-sleep
+            pl_queue_push(queue, src); // push it anyway, for testing
+    }
+
+    while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
+        if (ret == PL_QUEUE_MORE) {
+            REQUIRE_CMP(qparams.pts, >, 0.0f, "f");
+            pl_queue_push(queue, NULL); // push delayed EOF
+            continue;
+        }
+
+        REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
+        REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+        // Simulate advancing vsync
+        qparams.pts += qparams.vsync_duration;
+    }
+
+    // Test dynamically pulling all frames, with oversample mixer
+    const struct pl_source_frame *frame_ptr = &srcframes[0];
+    mix_params.frame_mixer = &pl_oversample_frame_mixer;
+
+    qparams = (struct pl_queue_params) {
+        .radius = pl_frame_mix_radius(&mix_params),
+        .vsync_duration = qparams.vsync_duration,
+        .get_frame = get_frame_ptr,
+        .priv = &frame_ptr,
+    };
+
+    pl_queue_reset(queue);
+    while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
+        REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
+        REQUIRE_CMP(mix.num_frames, <=, 2, "d");
+        REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+        qparams.pts += qparams.vsync_duration;
+    }
+
+    // Test large PTS jump
+    pl_queue_reset(queue);
+    REQUIRE(pl_queue_update(queue, &mix, &qparams) == PL_QUEUE_EOF);
+
+    // Test deinterlacing
+    pl_queue_reset(queue);
+    printf("testing deinterlacing \n");
+    for (int i = 0; i < NUM_MIX_FRAMES; i++) {
+        struct pl_source_frame *src = &srcframes[i];
+        if (i > 10)
+            src = &srcframes[NUM_MIX_FRAMES + 10 - i];
+        src->first_field = PL_FIELD_EVEN;
+        pl_queue_push(queue, src);
+    }
+    pl_queue_push(queue, NULL);
+
+    qparams.pts = 0;
+    qparams.get_frame = NULL;
+    while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
+        REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
+        REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+        qparams.pts += qparams.vsync_duration;
+    }
+
+    pl_queue_destroy(&queue);
+
+error:
+    pl_renderer_destroy(&rr);
+    pl_tex_destroy(gpu, &img_tex);
+    pl_tex_destroy(gpu, &fbo);
+}
+
+static struct pl_hook_res noop_hook(void *priv, const struct pl_hook_params *params)
+{
+    return (struct pl_hook_res) {0};
+}
+
+static void pl_ycbcr_tests(pl_gpu gpu)
+{
+    struct pl_plane_data data[3];
+    for (int i = 0; i < 3; i++) {
+        const int sub = i > 0 ? 1 : 0;
+        const int width = (323 + sub) >> sub;
+        const int height = (255 + sub) >> sub;
+
+        data[i] = (struct pl_plane_data) {
+            .type = PL_FMT_UNORM,
+            .width = width,
+            .height = height,
+            .component_size = {16},
+            .component_map = {i},
+            .pixel_stride = sizeof(uint16_t),
+            .row_stride = PL_ALIGN2(width * sizeof(uint16_t),
+                                    gpu->limits.align_tex_xfer_pitch),
+        };
+    }
+
+    pl_fmt fmt = pl_plane_find_fmt(gpu, NULL, &data[0]);
+    enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_HOST_READABLE;
+    if (!fmt || (fmt->caps & caps) != caps)
+        return;
+
+    pl_renderer rr = pl_renderer_create(gpu->log, gpu);
+    if (!rr)
+        return;
+
+    pl_tex src_tex[3] = {0};
+    pl_tex dst_tex[3] = {0};
+    struct pl_frame img = {
+        .num_planes = 3,
+        .repr = pl_color_repr_hdtv,
+        .color = pl_color_space_bt709,
+    };
+
+    struct pl_frame target = {
+        .num_planes = 3,
+        .repr = pl_color_repr_hdtv,
+        .color = pl_color_space_bt709,
+    };
+
+    uint8_t *src_buffer[3] = {0};
+    uint8_t *dst_buffer = NULL;
+    for (int i = 0; i < 3; i++) {
+        // Generate some arbitrary data for the buffer
+        src_buffer[i] = malloc(data[i].height * data[i].row_stride);
+        if (!src_buffer[i])
+            goto error;
+
+        data[i].pixels = src_buffer[i];
+        for (int y = 0; y < data[i].height; y++) {
+            for (int x = 0; x < data[i].width; x++) {
+                size_t off = y * data[i].row_stride + x * data[i].pixel_stride;
+                uint16_t *pixel = (uint16_t *) &src_buffer[i][off];
+                int gx = 200 + 100 * i, gy = 300 + 150 * i;
+                *pixel = (gx * x) ^ (gy * y); // whatever
+            }
+        }
+
+        REQUIRE(pl_upload_plane(gpu, &img.planes[i], &src_tex[i], &data[i]));
+    }
+
+    // This co-sites chroma pixels with pixels in the RGB image, meaning we
+    // get an exact round-trip when sampling both ways. This makes it useful
+    // as a test case, even though it's not common in the real world.
+    pl_frame_set_chroma_location(&img, PL_CHROMA_TOP_LEFT);
+
+    for (int i = 0; i < 3; i++) {
+        dst_tex[i] = pl_tex_create(gpu, &(struct pl_tex_params) {
+            .format = fmt,
+            .w = data[i].width,
+            .h = data[i].height,
+            .renderable = true,
+            .host_readable = true,
+            .storable = fmt->caps & PL_FMT_CAP_STORABLE,
+            .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE,
+        });
+
+        if (!dst_tex[i])
+            goto error;
+
+        target.planes[i] = img.planes[i];
+        target.planes[i].texture = dst_tex[i];
+    }
+
+    REQUIRE(pl_render_image(rr, &img, &target, &(struct pl_render_params) {
+        .num_hooks = 1,
+        .hooks = &(const struct pl_hook *){&(struct pl_hook) {
+            // Forces chroma merging, to test the chroma merging code
+            .stages = PL_HOOK_CHROMA_INPUT,
+            .hook = noop_hook,
+        }},
+    }));
+    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+    size_t buf_size = data[0].height * data[0].row_stride;
+    dst_buffer = malloc(buf_size);
+    if (!dst_buffer)
+        goto error;
+
+    for (int i = 0; i < 3; i++) {
+        memset(dst_buffer, 0xAA, buf_size);
+        REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) {
+            .tex = dst_tex[i],
+            .ptr = dst_buffer,
+            .row_pitch = data[i].row_stride,
+        }));
+
+        for (int y = 0; y < data[i].height; y++) {
+            for (int x = 0; x < data[i].width; x++) {
+                size_t off = y * data[i].row_stride + x * data[i].pixel_stride;
+                uint16_t *src_pixel = (uint16_t *) &src_buffer[i][off];
+                uint16_t *dst_pixel = (uint16_t *) &dst_buffer[off];
+                int diff = abs((int) *src_pixel - (int) *dst_pixel);
+                REQUIRE_CMP(diff, <=, 50, "d"); // a little under 0.1%
+            }
+        }
+    }
+
+error:
+    pl_renderer_destroy(&rr);
+    free(dst_buffer);
+    for (int i = 0; i < 3; i++) {
+        free(src_buffer[i]);
+        pl_tex_destroy(gpu, &src_tex[i]);
+        pl_tex_destroy(gpu, &dst_tex[i]);
+    }
+}
+
+static void pl_test_export_import(pl_gpu gpu,
+                                  enum pl_handle_type handle_type)
+{
+    // Test texture roundtrip
+
+    if (!(gpu->export_caps.tex & handle_type) ||
+        !(gpu->import_caps.tex & handle_type))
+        goto skip_tex;
+
+    pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 0, 0, PL_FMT_CAP_BLITTABLE);
+    if (!fmt)
+        goto skip_tex;
+
+    printf("testing texture import/export with fmt %s\n", fmt->name);
+
+    pl_tex export = pl_tex_create(gpu, &(struct pl_tex_params) {
+        .w = 32,
+        .h = 32,
+        .format = fmt,
+        .export_handle = handle_type,
+    });
+    REQUIRE(export);
+    REQUIRE_HANDLE(export->shared_mem, handle_type);
+
+    pl_tex import = pl_tex_create(gpu, &(struct pl_tex_params) {
+        .w = export->params.w,
+        .h = export->params.h,
+        .format = fmt,
+        .import_handle = handle_type,
+        .shared_mem = export->shared_mem,
+    });
+    REQUIRE(import);
+
+    pl_tex_destroy(gpu, &import);
+    pl_tex_destroy(gpu, &export);
+
+skip_tex: ;
+
+    // Test buffer roundtrip
+
+    if (!(gpu->export_caps.buf & handle_type) ||
+        !(gpu->import_caps.buf & handle_type))
+        return;
+
+    printf("testing buffer import/export\n");
+
+    pl_buf exp_buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+        .size = 32,
+        .export_handle = handle_type,
+    });
+    REQUIRE(exp_buf);
+    REQUIRE_HANDLE(exp_buf->shared_mem, handle_type);
+
+    pl_buf imp_buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+        .size = 32,
+        .import_handle = handle_type,
+        .shared_mem = exp_buf->shared_mem,
+    });
+    REQUIRE(imp_buf);
+
+    pl_buf_destroy(gpu, &imp_buf);
+    pl_buf_destroy(gpu, &exp_buf);
+}
+
+static void pl_test_host_ptr(pl_gpu gpu)
+{
+    if (!(gpu->import_caps.buf & PL_HANDLE_HOST_PTR))
+        return;
+
+#ifdef __unix__
+
+    printf("testing host ptr\n");
+    REQUIRE(gpu->limits.max_mapped_size);
+
+    const size_t size = 2 << 20;
+    const size_t offset = 2 << 10;
+    const size_t slice = 2 << 16;
+
+    uint8_t *data = aligned_alloc(0x1000, size);
+    for (int i = 0; i < size; i++)
+        data[i] = (uint8_t) i;
+
+    pl_buf buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+        .size = slice,
+        .import_handle = PL_HANDLE_HOST_PTR,
+        .shared_mem = {
+            .handle.ptr = data,
+            .size = size,
+            .offset = offset,
+        },
+        .host_mapped = true,
+    });
+
+    REQUIRE(buf);
+    REQUIRE_MEMEQ(data + offset, buf->data, slice);
+
+    pl_buf_destroy(gpu, &buf);
+    free(data);
+
+#endif // unix
+}
+
+static void gpu_shader_tests(pl_gpu gpu)
+{
+    pl_buffer_tests(gpu);
+    pl_texture_tests(gpu);
+    pl_planar_tests(gpu);
+    pl_shader_tests(gpu);
+    pl_scaler_tests(gpu);
+    pl_render_tests(gpu);
+    pl_ycbcr_tests(gpu);
+
+    REQUIRE(!pl_gpu_is_failed(gpu));
+}
+
+static void gpu_interop_tests(pl_gpu gpu)
+{
+    pl_test_export_import(gpu, PL_HANDLE_DMA_BUF);
+    pl_test_host_ptr(gpu);
+
+    REQUIRE(!pl_gpu_is_failed(gpu));
+}