#include "tests.h"
#include "shaders.h"

#include <libplacebo/renderer.h>
#include <libplacebo/utils/frame_queue.h>
#include <libplacebo/utils/upload.h>

//#define PRINT_OUTPUT

static void pl_buffer_tests(pl_gpu gpu)
{
    const size_t buf_size = 1024;
    if (buf_size > gpu->limits.max_buf_size)
        return;

    uint8_t *test_src = malloc(buf_size * 2);
    uint8_t *test_dst = test_src + buf_size;
    assert(test_src && test_dst);
    memset(test_dst, 0, buf_size);
    for (int i = 0; i < buf_size; i++)
        test_src[i] = RANDOM_U8;

    pl_buf buf = NULL, tbuf = NULL;

    printf("test buffer static creation and readback\n");
    buf = pl_buf_create(gpu, pl_buf_params(
        .size = buf_size,
        .host_readable = true,
        .initial_data = test_src,
    ));

    REQUIRE(buf);
    REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size));
    REQUIRE_MEMEQ(test_src, test_dst, buf_size);
    pl_buf_destroy(gpu, &buf);

    printf("test buffer empty creation, update and readback\n");
    memset(test_dst, 0, buf_size);
    buf = pl_buf_create(gpu, pl_buf_params(
        .size = buf_size,
        .host_writable = true,
        .host_readable = true,
    ));

    REQUIRE(buf);
    pl_buf_write(gpu, buf, 0, test_src, buf_size);
    REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size));
    REQUIRE_MEMEQ(test_src, test_dst, buf_size);
    pl_buf_destroy(gpu, &buf);

    printf("test buffer-buffer copy and readback\n");
    memset(test_dst, 0, buf_size);
    buf = pl_buf_create(gpu, pl_buf_params(
        .size = buf_size,
        .initial_data = test_src,
    ));

    tbuf = pl_buf_create(gpu, pl_buf_params(
        .size = buf_size,
        .host_readable = true,
    ));

    REQUIRE(buf && tbuf);
    pl_buf_copy(gpu, tbuf, 0, buf, 0, buf_size);
    REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
    REQUIRE_MEMEQ(test_src, test_dst, buf_size);
    pl_buf_destroy(gpu, &buf);
    pl_buf_destroy(gpu, &tbuf);

    if (buf_size <= gpu->limits.max_mapped_size) {
        printf("test host mapped buffer readback\n");
        buf = pl_buf_create(gpu, pl_buf_params(
            .size = buf_size,
            .host_mapped = true,
            .initial_data = test_src,
        ));

        REQUIRE(buf);
        REQUIRE(!pl_buf_poll(gpu, buf, 0));
        REQUIRE_MEMEQ(test_src, buf->data, buf_size);
        pl_buf_destroy(gpu, &buf);
    }

    // `compute_queues` check is to exclude dummy GPUs here
    if (buf_size <= gpu->limits.max_ssbo_size && gpu->limits.compute_queues)
    {
        printf("test endian swapping\n");
        buf = pl_buf_create(gpu, pl_buf_params(
            .size = buf_size,
            .storable = true,
            .initial_data = test_src,
        ));

        tbuf = pl_buf_create(gpu, pl_buf_params(
            .size = buf_size,
            .storable = true,
            .host_readable = true,
        ));

        REQUIRE(buf && tbuf);
        REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) {
            .src = buf,
            .dst = tbuf,
            .size = buf_size,
            .wordsize = 2,
        }));
        REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
        for (int i = 0; i < buf_size / 2; i++) {
            REQUIRE_CMP(test_src[2 * i + 0], ==, test_dst[2 * i + 1], PRIu8);
            REQUIRE_CMP(test_src[2 * i + 1], ==, test_dst[2 * i + 0], PRIu8);
        }
        // test endian swap in-place
        REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) {
            .src = tbuf,
            .dst = tbuf,
            .size = buf_size,
            .wordsize = 4,
        }));
        REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
        for (int i = 0; i < buf_size / 4; i++) {
            REQUIRE_CMP(test_src[4 * i + 0], ==, test_dst[4 * i + 2], PRIu8);
            REQUIRE_CMP(test_src[4 * i + 1], ==, test_dst[4 * i + 3], PRIu8);
            REQUIRE_CMP(test_src[4 * i + 2], ==, test_dst[4 * i + 0], PRIu8);
            REQUIRE_CMP(test_src[4 * i + 3], ==, test_dst[4 * i + 1], PRIu8);
        }
        pl_buf_destroy(gpu, &buf);
        pl_buf_destroy(gpu, &tbuf);
    }

    free(test_src);
}

static void test_cb(void *priv)
{
    bool *flag = priv;
    *flag = true;
}

static void pl_test_roundtrip(pl_gpu gpu, pl_tex tex[2],
                              uint8_t *src, uint8_t *dst)
{
    if (!tex[0] || !tex[1]) {
        printf("failed creating test textures... skipping this test\n");
        return;
    }

    int texels = tex[0]->params.w;
    texels *= tex[0]->params.h ? tex[0]->params.h : 1;
    texels *= tex[0]->params.d ? tex[0]->params.d : 1;

    pl_fmt fmt = tex[0]->params.format;
    size_t bytes = texels * fmt->texel_size;
    memset(src, 0, bytes);
    memset(dst, 0, bytes);

    for (size_t i = 0; i < bytes; i++)
        src[i] = RANDOM_U8;

    pl_timer ul, dl;
    ul = pl_timer_create(gpu);
    dl = pl_timer_create(gpu);

    bool ran_ul = false, ran_dl = false;

    REQUIRE(pl_tex_upload(gpu, &(struct pl_tex_transfer_params){
        .tex = tex[0],
        .ptr = src,
        .timer = ul,
        .callback = gpu->limits.callbacks ? test_cb : NULL,
        .priv = &ran_ul,
    }));

    // Test blitting, if possible for this format
    pl_tex dst_tex = tex[0];
    if (tex[0]->params.blit_src && tex[1]->params.blit_dst) {
        pl_tex_clear_ex(gpu, tex[1], (union pl_clear_color){0}); // for testing
        pl_tex_blit(gpu, &(struct pl_tex_blit_params) {
            .src = tex[0],
            .dst = tex[1],
        });
        dst_tex = tex[1];
    }

    REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params){
        .tex = dst_tex,
        .ptr = dst,
        .timer = dl,
        .callback = gpu->limits.callbacks ? test_cb : NULL,
        .priv = &ran_dl,
    }));

    pl_gpu_finish(gpu);
    if (gpu->limits.callbacks)
        REQUIRE(ran_ul && ran_dl);

    if (fmt->emulated && fmt->type == PL_FMT_FLOAT) {
        // TODO: can't memcmp here because bits might be lost due to the
        // emulated 16/32 bit upload paths, figure out a better way to
        // generate data and verify the roundtrip!
    } else {
        REQUIRE_MEMEQ(src, dst, bytes);
    }

    // Report timer results
    printf("upload time: %"PRIu64", download time: %"PRIu64"\n",
           pl_timer_query(gpu, ul), pl_timer_query(gpu, dl));

    pl_timer_destroy(gpu, &ul);
    pl_timer_destroy(gpu, &dl);
}

static void pl_texture_tests(pl_gpu gpu)
{
    const size_t max_size = 16*16*16 * 4 *sizeof(double);
    uint8_t *test_src = malloc(max_size * 2);
    uint8_t *test_dst = test_src + max_size;

    for (int f = 0; f < gpu->num_formats; f++) {
        pl_fmt fmt = gpu->formats[f];
        if (fmt->opaque || !(fmt->caps & PL_FMT_CAP_HOST_READABLE))
            continue;

        printf("testing texture roundtrip for format %s\n", fmt->name);
        assert(fmt->texel_size <= 4 * sizeof(double));

        struct pl_tex_params ref_params = {
            .format        = fmt,
            .blit_src      = (fmt->caps & PL_FMT_CAP_BLITTABLE),
            .blit_dst      = (fmt->caps & PL_FMT_CAP_BLITTABLE),
            .host_writable = true,
            .host_readable = true,
            .debug_tag     = PL_DEBUG_TAG,
        };

        pl_tex tex[2];

        if (gpu->limits.max_tex_1d_dim >= 16) {
            printf("... 1D\n");
            struct pl_tex_params params = ref_params;
            params.w = 16;
            if (!gpu->limits.blittable_1d_3d)
                params.blit_src = params.blit_dst = false;
            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
                tex[i] = pl_tex_create(gpu, &params);
            pl_test_roundtrip(gpu, tex, test_src, test_dst);
            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
                pl_tex_destroy(gpu, &tex[i]);
        }

        if (gpu->limits.max_tex_2d_dim >= 16) {
            printf("... 2D\n");
            struct pl_tex_params params = ref_params;
            params.w = params.h = 16;
            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
                tex[i] = pl_tex_create(gpu, &params);
            pl_test_roundtrip(gpu, tex, test_src, test_dst);
            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
                pl_tex_destroy(gpu, &tex[i]);
        }

        if (gpu->limits.max_tex_3d_dim >= 16) {
            printf("... 3D\n");
            struct pl_tex_params params = ref_params;
            params.w = params.h = params.d = 16;
            if (!gpu->limits.blittable_1d_3d)
                params.blit_src = params.blit_dst = false;
            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
                tex[i] = pl_tex_create(gpu, &params);
            pl_test_roundtrip(gpu, tex, test_src, test_dst);
            for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
                pl_tex_destroy(gpu, &tex[i]);
        }
    }

    free(test_src);
}

static void pl_planar_tests(pl_gpu gpu)
{
    pl_fmt fmt = pl_find_named_fmt(gpu, "g8_b8_r8_420");
    if (!fmt)
        return;
    REQUIRE_CMP(fmt->num_planes, ==, 3, "d");

    const int width = 64, height = 32;
    pl_tex tex = pl_tex_create(gpu, pl_tex_params(
        .w              = width,
        .h              = height,
        .format         = fmt,
        .blit_dst       = true,
        .host_readable  = true,
    ));
    if (!tex)
        return;
    for (int i = 0; i < fmt->num_planes; i++)
        REQUIRE(tex->planes[i]);

    pl_tex plane = tex->planes[1];
    uint8_t data[(width * height) >> 2];
    REQUIRE_CMP(plane->params.w * plane->params.h, ==, PL_ARRAY_SIZE(data), "d");

    pl_tex_clear(gpu, plane, (float[]){ (float) 0x80 / 0xFF, 0.0, 0.0, 1.0 });
    REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
        .tex = plane,
        .ptr = data,
    )));

    uint8_t ref[PL_ARRAY_SIZE(data)];
    memset(ref, 0x80, sizeof(ref));
    REQUIRE_MEMEQ(data, ref, PL_ARRAY_SIZE(data));

    pl_tex_destroy(gpu, &tex);
}

static void pl_shader_tests(pl_gpu gpu)
{
    if (gpu->glsl.version < 410)
        return;

    const char *vert_shader =
        "#version 410                               \n"
        "layout(location=0) in vec2 vertex_pos;     \n"
        "layout(location=1) in vec3 vertex_color;   \n"
        "layout(location=0) out vec3 frag_color;    \n"
        "void main() {                              \n"
        "    gl_Position = vec4(vertex_pos, 0, 1);  \n"
        "    frag_color = vertex_color;             \n"
        "}";

    const char *frag_shader =
        "#version 410                               \n"
        "layout(location=0) in vec3 frag_color;     \n"
        "layout(location=0) out vec4 out_color;     \n"
        "void main() {                              \n"
        "    out_color = vec4(frag_color, 1.0);     \n"
        "}";

    pl_fmt fbo_fmt;
    enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE |
                            PL_FMT_CAP_LINEAR;

    fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 4, 16, 32, caps);
    if (!fbo_fmt)
        return;

#define FBO_W 16
#define FBO_H 16

    pl_tex fbo;
    fbo = pl_tex_create(gpu, &(struct pl_tex_params) {
        .format         = fbo_fmt,
        .w              = FBO_W,
        .h              = FBO_H,
        .renderable     = true,
        .storable       = !!(fbo_fmt->caps & PL_FMT_CAP_STORABLE),
        .host_readable  = true,
        .blit_dst       = true,
    });
    REQUIRE(fbo);

    pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0});

    pl_fmt vert_fmt;
    vert_fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3);
    REQUIRE(vert_fmt);

    static const struct vertex { float pos[2]; float color[3]; } vertices[] = {
        {{-1.0, -1.0}, {0, 0, 0}},
        {{ 1.0, -1.0}, {1, 0, 0}},
        {{-1.0,  1.0}, {0, 1, 0}},
        {{ 1.0,  1.0}, {1, 1, 0}},
    };

    pl_pass pass;
    pass = pl_pass_create(gpu, &(struct pl_pass_params) {
        .type           = PL_PASS_RASTER,
        .target_format  = fbo_fmt,
        .vertex_shader  = vert_shader,
        .glsl_shader    = frag_shader,

        .vertex_type    = PL_PRIM_TRIANGLE_STRIP,
        .vertex_stride  = sizeof(struct vertex),
        .num_vertex_attribs = 2,
        .vertex_attribs = (struct pl_vertex_attrib[]) {{
            .name     = "vertex_pos",
            .fmt      = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
            .location = 0,
            .offset   = offsetof(struct vertex, pos),
        }, {
            .name     = "vertex_color",
            .fmt      = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3),
            .location = 1,
            .offset   = offsetof(struct vertex, color),
        }},
    });
    REQUIRE(pass);
    if (pass->params.cached_program || pass->params.cached_program_len) {
        // Ensure both are set if either one is set
        REQUIRE(pass->params.cached_program);
        REQUIRE(pass->params.cached_program_len);
    }

    pl_timer timer = pl_timer_create(gpu);
    pl_pass_run(gpu, &(struct pl_pass_run_params) {
        .pass           = pass,
        .target         = fbo,
        .vertex_count   = PL_ARRAY_SIZE(vertices),
        .vertex_data    = vertices,
        .timer          = timer,
    });

    // Wait until this pass is complete and report the timer result
    pl_gpu_finish(gpu);
    printf("timer query result: %"PRIu64"\n", pl_timer_query(gpu, timer));
    pl_timer_destroy(gpu, &timer);

    static float test_data[FBO_H * FBO_W * 4] = {0};

    // Test against the known pattern of `src`, only useful for roundtrip tests
#define TEST_FBO_PATTERN(eps, fmt, ...)                                     \
    do {                                                                    \
        printf("testing pattern of " fmt "\n", __VA_ARGS__);                \
        REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) {     \
            .tex = fbo,                                                     \
            .ptr = test_data,                                               \
        }));                                                                \
                                                                            \
        for (int y = 0; y < FBO_H; y++) {                                   \
            for (int x = 0; x < FBO_W; x++) {                               \
                float *color = &test_data[(y * FBO_W + x) * 4];             \
                REQUIRE_FEQ(color[0], (x + 0.5) / FBO_W, eps);              \
                REQUIRE_FEQ(color[1], (y + 0.5) / FBO_H, eps);              \
                REQUIRE_FEQ(color[2], 0.0, eps);                            \
                REQUIRE_FEQ(color[3], 1.0, eps);                            \
            }                                                               \
        }                                                                   \
    } while (0)

    TEST_FBO_PATTERN(1e-6, "%s", "initial rendering");

    if (sizeof(vertices) <= gpu->limits.max_vbo_size) {
        // Test the use of an explicit vertex buffer
        pl_buf vert = pl_buf_create(gpu, &(struct pl_buf_params) {
            .size = sizeof(vertices),
            .initial_data = vertices,
            .drawable = true,
        });

        REQUIRE(vert);
        pl_pass_run(gpu, &(struct pl_pass_run_params) {
            .pass           = pass,
            .target         = fbo,
            .vertex_count   = sizeof(vertices) / sizeof(struct vertex),
            .vertex_buf     = vert,
            .buf_offset     = 0,
        });

        pl_buf_destroy(gpu, &vert);
        TEST_FBO_PATTERN(1e-6, "%s", "using vertex buffer");
    }

    // Test the use of index buffers
    static const uint16_t indices[] = { 3, 2, 1, 0 };
    pl_pass_run(gpu, &(struct pl_pass_run_params) {
        .pass           = pass,
        .target         = fbo,
        .vertex_count   = PL_ARRAY_SIZE(indices),
        .vertex_data    = vertices,
        .index_data     = indices,
    });

    pl_pass_destroy(gpu, &pass);
    TEST_FBO_PATTERN(1e-6, "%s", "using indexed rendering");

    // Test the use of pl_dispatch
    pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
    pl_shader sh = pl_dispatch_begin(dp);
    REQUIRE(pl_shader_custom(sh, &(struct pl_custom_shader) {
        .body       = "color = vec4(col, 1.0);",
        .input      = PL_SHADER_SIG_NONE,
        .output     = PL_SHADER_SIG_COLOR,
    }));

    REQUIRE(pl_dispatch_vertex(dp, &(struct pl_dispatch_vertex_params) {
        .shader         = &sh,
        .target         = fbo,
        .vertex_stride  = sizeof(struct vertex),
        .vertex_position_idx = 0,
        .num_vertex_attribs = 2,
        .vertex_attribs = (struct pl_vertex_attrib[]) {{
            .name   = "pos",
            .fmt    = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
            .offset = offsetof(struct vertex, pos),
        }, {
            .name   = "col",
            .fmt    = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3),
            .offset = offsetof(struct vertex, color),
        }},

        .vertex_type    = PL_PRIM_TRIANGLE_STRIP,
        .vertex_coords  = PL_COORDS_NORMALIZED,
        .vertex_count   = PL_ARRAY_SIZE(vertices),
        .vertex_data    = vertices,
    }));

    TEST_FBO_PATTERN(1e-6, "%s", "using custom vertices");

    static float src_data[FBO_H * FBO_W * 4] = {0};
    memcpy(src_data, test_data, sizeof(src_data));

    pl_tex src;
    src = pl_tex_create(gpu, &(struct pl_tex_params) {
        .format         = fbo_fmt,
        .w              = FBO_W,
        .h              = FBO_H,
        .storable       = fbo->params.storable,
        .sampleable     = true,
        .initial_data   = src_data,
    });

    if (fbo->params.storable) {
        // Test 1x1 blit, to make sure the scaling code runs
        REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) {
            .src = src,
            .dst = fbo,
            .src_rc = {0, 0, 0, 1, 1, 1},
            .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1},
            .sample_mode = PL_TEX_SAMPLE_NEAREST,
        }));

        // Test non-resizing blit, which uses the efficient imageLoad path
        REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) {
            .src = src,
            .dst = fbo,
            .src_rc = {0, 0, 0, FBO_W, FBO_H, 1},
            .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1},
            .sample_mode = PL_TEX_SAMPLE_NEAREST,
        }));

        TEST_FBO_PATTERN(1e-6, "%s", "pl_tex_blit_compute");
    }

    // Test encoding/decoding of all gamma functions, color spaces, etc.
    for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) {
        struct pl_color_space test_csp = {
            .transfer = trc,
            .hdr.min_luma = PL_COLOR_HDR_BLACK,
        };
        sh = pl_dispatch_begin(dp);
        pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));
        pl_shader_delinearize(sh, &test_csp);
        pl_shader_linearize(sh, &test_csp);
        REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params(
            .shader = &sh,
            .target = fbo,
        )));

        float epsilon = pl_color_transfer_is_hdr(trc) ? 1e-4 : 1e-6;
        TEST_FBO_PATTERN(epsilon, "transfer function %d", (int) trc);
    }

    for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) {
        if (sys == PL_COLOR_SYSTEM_DOLBYVISION)
            continue; // requires metadata
        sh = pl_dispatch_begin(dp);
        pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));
        pl_shader_encode_color(sh, &(struct pl_color_repr) { .sys = sys });
        pl_shader_decode_color(sh, &(struct pl_color_repr) { .sys = sys }, NULL);
        REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
            .shader = &sh,
            .target = fbo,
        }));

        float epsilon;
        switch (sys) {
        case PL_COLOR_SYSTEM_BT_2020_C:
        case PL_COLOR_SYSTEM_XYZ:
            epsilon = 1e-5;
            break;

        case PL_COLOR_SYSTEM_BT_2100_PQ:
        case PL_COLOR_SYSTEM_BT_2100_HLG:
            // These seem to be horrifically noisy and prone to breaking on
            // edge cases for some reason
            // TODO: figure out why!
            continue;

        default: epsilon = 1e-6; break;
        }

        TEST_FBO_PATTERN(epsilon, "color system %d", (int) sys);
    }

    // Repeat this a few times to test the caching
    pl_cache cache = pl_cache_create(pl_cache_params( .log = gpu->log ));
    pl_gpu_set_cache(gpu, cache);
    for (int i = 0; i < 10; i++) {
        if (i == 5) {
            printf("Recreating pl_dispatch to test the caching\n");
            size_t size = pl_dispatch_save(dp, NULL);
            REQUIRE(size);
            uint8_t *cache_data = malloc(size);
            REQUIRE(cache_data);
            REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu");

            pl_dispatch_destroy(&dp);
            dp = pl_dispatch_create(gpu->log, gpu);
            pl_dispatch_load(dp, cache_data);

            // Test to make sure the pass regenerates the same cache
            uint64_t hash = pl_str_hash((pl_str) { cache_data, size });
            REQUIRE_CMP(pl_dispatch_save(dp, NULL), ==, size, "zu");
            REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu");
            REQUIRE_CMP(pl_str_hash((pl_str) { cache_data, size }), ==, hash, PRIu64);
            free(cache_data);
        }

        sh = pl_dispatch_begin(dp);

        // For testing, force the use of CS if possible
        if (gpu->glsl.compute) {
            sh->type = SH_COMPUTE;
            sh->group_size[0] = 8;
            sh->group_size[1] = 8;
        }

        pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params(
            .iterations     = 0,
            .grain          = 0.0,
        ));

        REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
            .shader = &sh,
            .target = fbo,
        }));
        TEST_FBO_PATTERN(1e-6, "deband iter %d", i);
    }

    pl_gpu_set_cache(gpu, NULL);
    pl_cache_destroy(&cache);

    // Test peak detection and readback if possible
    sh = pl_dispatch_begin(dp);
    pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));

    pl_shader_obj peak_state = NULL;
    struct pl_color_space csp_gamma22 = { .transfer = PL_COLOR_TRC_GAMMA22 };
    struct pl_peak_detect_params peak_params = { .minimum_peak = 0.01 };
    if (pl_shader_detect_peak(sh, csp_gamma22, &peak_state, &peak_params)) {
        REQUIRE(pl_dispatch_compute(dp, &(struct pl_dispatch_compute_params) {
            .shader = &sh,
            .width = fbo->params.w,
            .height = fbo->params.h,
        }));

        float peak, avg;
        REQUIRE(pl_get_detected_peak(peak_state, &peak, &avg));

        float real_peak = 0, real_avg = 0;
        for (int y = 0; y < FBO_H; y++) {
            for (int x = 0; x < FBO_W; x++) {
                float *color = &src_data[(y * FBO_W + x) * 4];
                float luma = 0.212639f * powf(color[0], 2.2f) +
                             0.715169f * powf(color[1], 2.2f) +
                             0.072192f * powf(color[2], 2.2f);
                luma = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, luma);
                real_peak = PL_MAX(real_peak, luma);
                real_avg += luma;
            }
        }
        real_avg = real_avg / (FBO_W * FBO_H);

        real_avg  = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_avg);
        real_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_peak);
        REQUIRE_FEQ(peak, real_peak, 1e-3);
        REQUIRE_FEQ(avg, real_avg, 1e-2);
    }

    pl_dispatch_abort(dp, &sh);
    pl_shader_obj_destroy(&peak_state);

    // Test film grain synthesis
    pl_shader_obj grain = NULL;
    struct pl_film_grain_params grain_params = {
        .tex = src,
        .components = 3,
        .component_mapping = { 0, 1, 2},
        .repr = &(struct pl_color_repr) {
            .sys = PL_COLOR_SYSTEM_BT_709,
            .levels = PL_COLOR_LEVELS_LIMITED,
            .bits = { .color_depth = 10, .sample_depth = 10 },
        },
    };

    for (int i = 0; i < 2; i++) {
        grain_params.data.type = PL_FILM_GRAIN_AV1;
        grain_params.data.params.av1 = av1_grain_data;
        grain_params.data.params.av1.overlap = !!i;
        grain_params.data.seed = rand();

        sh = pl_dispatch_begin(dp);
        pl_shader_film_grain(sh, &grain, &grain_params);
        REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
            .shader = &sh,
            .target = fbo,
        }));
    }

    if (gpu->glsl.compute) {
        grain_params.data.type = PL_FILM_GRAIN_H274;
        grain_params.data.params.h274 = h274_grain_data;
        grain_params.data.seed = rand();

        sh = pl_dispatch_begin(dp);
        pl_shader_film_grain(sh, &grain, &grain_params);
        REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
            .shader = &sh,
            .target = fbo,
        }));
    }
    pl_shader_obj_destroy(&grain);

    // Test custom shaders
    struct pl_custom_shader custom = {
        .header =
            "vec3 invert(vec3 color)            \n"
            "{                                  \n"
            "    return vec3(1.0) - color;      \n"
            "}                                  \n",

        .body =
            "color = vec4(gl_FragCoord.xy, 0.0, 1.0);   \n"
            "color.rgb = invert(color.rgb) + offset;    \n",

        .input = PL_SHADER_SIG_NONE,
        .output = PL_SHADER_SIG_COLOR,

        .num_variables = 1,
        .variables = &(struct pl_shader_var) {
            .var = pl_var_float("offset"),
            .data = &(float) { 0.1 },
        },
    };

    sh = pl_dispatch_begin(dp);
    REQUIRE(pl_shader_custom(sh, &custom));
    REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
        .shader = &sh,
        .target = fbo,
    }));

    // Test dolbyvision
    struct pl_color_repr repr = {
        .sys = PL_COLOR_SYSTEM_DOLBYVISION,
        .dovi = &dovi_meta,
    };

    sh = pl_dispatch_begin(dp);
    pl_shader_sample_direct(sh, pl_sample_src( .tex = src ));
    pl_shader_decode_color(sh, &repr, NULL);
    REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
        .shader = &sh,
        .target = fbo,
    }));

    // Test deinterlacing
    sh = pl_dispatch_begin(dp);
    pl_shader_deinterlace(sh, pl_deinterlace_source( .cur = pl_field_pair(src) ), NULL);
    REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params(
        .shader = &sh,
        .target = fbo,
    )));

    // Test error diffusion
    if (fbo->params.storable) {
        for (int i = 0; i < pl_num_error_diffusion_kernels; i++) {
            const struct pl_error_diffusion_kernel *k = pl_error_diffusion_kernels[i];
            printf("testing error diffusion kernel '%s'\n", k->name);
            sh = pl_dispatch_begin(dp);
            bool ok = pl_shader_error_diffusion(sh, pl_error_diffusion_params(
                .input_tex  = src,
                .output_tex = fbo,
                .new_depth  = 8,
                .kernel     = k,
            ));

            if (!ok) {
                fprintf(stderr, "kernel '%s' exceeds GPU limits, skipping...\n", k->name);
                continue;
            }

            REQUIRE(pl_dispatch_compute(dp, pl_dispatch_compute_params(
                .shader = &sh,
                .dispatch_size = {1, 1, 1},
            )));
        }
    }

    pl_dispatch_destroy(&dp);
    pl_tex_destroy(gpu, &src);
    pl_tex_destroy(gpu, &fbo);
}

static void pl_scaler_tests(pl_gpu gpu)
{
    pl_fmt src_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_LINEAR);
    pl_fmt fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_RENDERABLE);
    if (!src_fmt || !fbo_fmt)
        return;

    float *fbo_data = NULL;
    pl_shader_obj lut = NULL;

    static float data_5x5[5][5] = {
        { 0, 0, 0, 0, 0 },
        { 0, 0, 0, 0, 0 },
        { 0, 0, 1, 0, 0 },
        { 0, 0, 0, 0, 0 },
        { 0, 0, 0, 0, 0 },
    };

    pl_tex dot5x5 = pl_tex_create(gpu, &(struct pl_tex_params) {
        .w              = 5,
        .h              = 5,
        .format         = src_fmt,
        .sampleable     = true,
        .initial_data   = &data_5x5[0][0],
    });

    struct pl_tex_params fbo_params = {
        .w              = 100,
        .h              = 100,
        .format         = fbo_fmt,
        .renderable     = true,
        .storable       = fbo_fmt->caps & PL_FMT_CAP_STORABLE,
        .host_readable  = fbo_fmt->caps & PL_FMT_CAP_HOST_READABLE,
    };

    pl_tex fbo = pl_tex_create(gpu, &fbo_params);
    pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
    if (!dot5x5 || !fbo || !dp)
        goto error;

    pl_shader sh = pl_dispatch_begin(dp);
    REQUIRE(pl_shader_sample_polar(sh,
        pl_sample_src(
            .tex        = dot5x5,
            .new_w      = fbo->params.w,
            .new_h      = fbo->params.h,
        ),
        pl_sample_filter_params(
            .filter     = pl_filter_ewa_lanczos,
            .lut        = &lut,
            .no_compute = !fbo->params.storable,
        )
    ));
    REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
        .shader = &sh,
        .target = fbo,
    }));

    if (fbo->params.host_readable) {
        fbo_data = malloc(fbo->params.w * fbo->params.h * sizeof(float));
        REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) {
            .tex            = fbo,
            .ptr            = fbo_data,
        }));

#ifdef PRINT_OUTPUT
        int max = 255;
        printf("P2\n%d %d\n%d\n", fbo->params.w, fbo->params.h, max);
        for (int y = 0; y < fbo->params.h; y++) {
            for (int x = 0; x < fbo->params.w; x++) {
                float v = fbo_data[y * fbo->params.h + x];
                printf("%d ", (int) round(fmin(fmax(v, 0.0), 1.0) * max));
            }
            printf("\n");
        }
#endif
    }

error:
    free(fbo_data);
    pl_shader_obj_destroy(&lut);
    pl_dispatch_destroy(&dp);
    pl_tex_destroy(gpu, &dot5x5);
    pl_tex_destroy(gpu, &fbo);
}

static const char *user_shader_tests[] = {
    // Test hooking, saving and loading
    "// Example of a comment at the beginning                               \n"
    "                                                                       \n"
    "//!HOOK NATIVE                                                         \n"
    "//!DESC upscale image                                                  \n"
    "//!BIND HOOKED                                                         \n"
    "//!WIDTH HOOKED.w 10 *                                                 \n"
    "//!HEIGHT HOOKED.h 10 *                                                \n"
    "//!SAVE NATIVEBIG                                                      \n"
    "//!WHEN NATIVE.w 500 <                                                 \n"
    "                                                                       \n"
    "vec4 hook()                                                            \n"
    "{                                                                      \n"
    "    return HOOKED_texOff(0);                                           \n"
    "}                                                                      \n"
    "                                                                       \n"
    "//!HOOK MAIN                                                           \n"
    "//!DESC downscale bigger image                                         \n"
    "//!WHEN NATIVE.w 500 <                                                 \n"
    "//!BIND NATIVEBIG                                                      \n"
    "                                                                       \n"
    "vec4 hook()                                                            \n"
    "{                                                                      \n"
    "    return NATIVEBIG_texOff(0);                                        \n"
    "}                                                                      \n",

    // Test use of textures
    "//!HOOK MAIN                                                           \n"
    "//!DESC turn everything into colorful pixels                           \n"
    "//!BIND HOOKED                                                         \n"
    "//!BIND DISCO                                                          \n"
    "//!COMPONENTS 3                                                        \n"
    "                                                                       \n"
    "vec4 hook()                                                            \n"
    "{                                                                      \n"
    "    return vec4(DISCO_tex(HOOKED_pos * 10.0).rgb, 1);                  \n"
    "}                                                                      \n"
    "                                                                       \n"
    "//!TEXTURE DISCO                                                       \n"
    "//!SIZE 3 3                                                            \n"
    "//!FORMAT rgba8                                                        \n"
    "//!FILTER NEAREST                                                      \n"
    "//!BORDER REPEAT                                                       \n"
    "ff0000ff00ff00ff0000ffff00ffffffff00ffffffff00ff4c4c4cff999999ffffffffff\n"

    // Test custom parameters
    "//!PARAM test                                                          \n"
    "//!DESC test parameter                                                 \n"
    "//!TYPE DYNAMIC float                                                  \n"
    "//!MINIMUM 0.0                                                         \n"
    "//!MAXIMUM 100.0                                                       \n"
    "1.0                                                                    \n"
    "                                                                       \n"
    "//!PARAM testconst                                                     \n"
    "//!TYPE CONSTANT uint                                                  \n"
    "//!MAXIMUM 16                                                          \n"
    "3                                                                      \n"
    "                                                                       \n"
    "//!PARAM testdefine                                                    \n"
    "//!TYPE DEFINE                                                         \n"
    "100                                                                    \n"
    "                                                                       \n"
    "//!PARAM testenum                                                      \n"
    "//!TYPE ENUM DEFINE                                                    \n"
    "FOO                                                                    \n"
    "BAR                                                                    \n"
    "                                                                       \n"
    "//!HOOK MAIN                                                           \n"
    "//!WHEN testconst 30 >                                                 \n"
    "#error should not be run                                               \n"
    "                                                                       \n"
    "//!HOOK MAIN                                                           \n"
    "//!WHEN testenum FOO =                                                 \n"
    "#if testenum == BAR                                                    \n"
    " #error bad                                                            \n"
    "#endif                                                                 \n"
    "vec4 hook() { return vec4(0.0); }                                      \n"
};

static const char *compute_shader_tests[] = {
    // Test use of storage/buffer resources
    "//!HOOK MAIN                                                           \n"
    "//!DESC attach some storage objects                                    \n"
    "//!BIND tex_storage                                                    \n"
    "//!BIND buf_uniform                                                    \n"
    "//!BIND buf_storage                                                    \n"
    "//!COMPONENTS 4                                                        \n"
    "                                                                       \n"
    "vec4 hook()                                                            \n"
    "{                                                                      \n"
    "    return vec4(foo, bar, bat);                                        \n"
    "}                                                                      \n"
    "                                                                       \n"
    "//!TEXTURE tex_storage                                                 \n"
    "//!SIZE 100 100                                                        \n"
    "//!FORMAT r32f                                                         \n"
    "//!STORAGE                                                             \n"
    "                                                                       \n"
    "//!BUFFER buf_uniform                                                  \n"
    "//!VAR float foo                                                       \n"
    "//!VAR float bar                                                       \n"
    "0000000000000000                                                       \n"
    "                                                                       \n"
    "//!BUFFER buf_storage                                                  \n"
    "//!VAR vec2 bat                                                        \n"
    "//!VAR int big[32];                                                    \n"
    "//!STORAGE                                                             \n",

};

static const char *test_luts[] = {

    "TITLE \"1D identity\"  \n"
    "LUT_1D_SIZE 2          \n"
    "0.0 0.0 0.0            \n"
    "1.0 1.0 1.0            \n",

    "TITLE \"3D identity\"  \n"
    "LUT_3D_SIZE 2          \n"
    "0.0 0.0 0.0            \n"
    "1.0 0.0 0.0            \n"
    "0.0 1.0 0.0            \n"
    "1.0 1.0 0.0            \n"
    "0.0 0.0 1.0            \n"
    "1.0 0.0 1.0            \n"
    "0.0 1.0 1.0            \n"
    "1.0 1.0 1.0            \n"

};

static bool frame_passthrough(pl_gpu gpu, pl_tex *tex,
                              const struct pl_source_frame *src, struct pl_frame *out_frame)
{
    const struct pl_frame *frame = src->frame_data;
    *out_frame = *frame;
    return true;
}

static enum pl_queue_status get_frame_ptr(struct pl_source_frame *out_frame,
                                          const struct pl_queue_params *qparams)
{
    const struct pl_source_frame **pframe = qparams->priv;
    if (!(*pframe)->frame_data)
        return PL_QUEUE_EOF;

    *out_frame = *(*pframe)++;
    return PL_QUEUE_OK;
}

static void render_info_cb(void *priv, const struct pl_render_info *info)
{
    printf("{%d} Executed shader: %s\n", info->index,
           info->pass->shader->description);
}

static void pl_render_tests(pl_gpu gpu)
{
    pl_tex img_tex = NULL, fbo = NULL;
    pl_renderer rr = NULL;

    enum { width = 50, height = 50 };
    static float data[width][height];
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            data[y][x] = RANDOM;
    }

    struct pl_plane img_plane = {0};
    struct pl_plane_data plane_data = {
        .type = PL_FMT_FLOAT,
        .width = width,
        .height = height,
        .component_size = { 8 * sizeof(float) },
        .component_map  = { 0 },
        .pixel_stride = sizeof(float),
        .pixels = data,
    };

    if (!pl_recreate_plane(gpu, NULL, &fbo, &plane_data))
        return;

    if (!pl_upload_plane(gpu, &img_plane, &img_tex, &plane_data))
        goto error;

    rr = pl_renderer_create(gpu->log, gpu);
    pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0});

    struct pl_frame image = {
        .num_planes     = 1,
        .planes         = { img_plane },
        .repr = {
            .sys        = PL_COLOR_SYSTEM_BT_709,
            .levels     = PL_COLOR_LEVELS_FULL,
        },
        .color          = pl_color_space_srgb,
    };

    struct pl_frame target = {
        .num_planes     = 1,
        .planes         = {{
            .texture            = fbo,
            .components         = 3,
            .component_mapping  = {0, 1, 2},
        }},
        .repr = {
            .sys        = PL_COLOR_SYSTEM_RGB,
            .levels     = PL_COLOR_LEVELS_FULL,
            .bits.color_depth = 32,
        },
        .color          = pl_color_space_srgb,
    };

    REQUIRE(pl_render_image(rr, &image, &target, NULL));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);

    // TODO: embed a reference texture and ensure it matches

    // Test a bunch of different params
#define TEST(SNAME, STYPE, DEFAULT, FIELD, LIMIT)                       \
    do {                                                                \
        for (int i = 0; i <= LIMIT; i++) {                              \
            printf("testing `" #STYPE "." #FIELD " = %d`\n", i);        \
            struct pl_render_params params = pl_render_default_params;  \
            params.force_dither = true;                                 \
            struct STYPE tmp = DEFAULT;                                 \
            tmp.FIELD = i;                                              \
            params.SNAME = &tmp;                                        \
            REQUIRE(pl_render_image(rr, &image, &target, &params));     \
            pl_gpu_flush(gpu);                                          \
            REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); \
        }                                                               \
    } while (0)

#define TEST_PARAMS(NAME, FIELD, LIMIT) \
    TEST(NAME##_params, pl_##NAME##_params, pl_##NAME##_default_params, FIELD, LIMIT)

    image.crop.x1 = width / 2.0;
    image.crop.y1 = height / 2.0;
    for (int i = 0; i < pl_num_scale_filters; i++) {
        struct pl_render_params params = pl_render_default_params;
        params.upscaler = pl_scale_filters[i].filter;
        printf("testing `params.upscaler = /* %s */`\n", pl_scale_filters[i].name);
        REQUIRE(pl_render_image(rr, &image, &target, &params));
        pl_gpu_flush(gpu);
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    }
    image.crop.x1 = image.crop.y1 = 0;

    target.crop.x1 = width / 2.0;
    target.crop.y1 = height / 2.0;
    for (int i = 0; i < pl_num_scale_filters; i++) {
        struct pl_render_params params = pl_render_default_params;
        params.downscaler = pl_scale_filters[i].filter;
        printf("testing `params.downscaler = /* %s */`\n", pl_scale_filters[i].name);
        REQUIRE(pl_render_image(rr, &image, &target, &params));
        pl_gpu_flush(gpu);
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    }
    target.crop.x1 = target.crop.y1 = 0;

    TEST_PARAMS(deband, iterations, 3);
    TEST_PARAMS(sigmoid, center, 1);
    TEST_PARAMS(color_map, intent, PL_INTENT_ABSOLUTE_COLORIMETRIC);
    TEST_PARAMS(dither, method, PL_DITHER_WHITE_NOISE);
    TEST_PARAMS(dither, temporal, true);
    TEST_PARAMS(distort, alpha_mode, PL_ALPHA_INDEPENDENT);
    TEST_PARAMS(distort, constrain, true);
    TEST_PARAMS(distort, bicubic, true);
    TEST(cone_params, pl_cone_params, pl_vision_deuteranomaly, strength, 0);

    // Test gamma-correct dithering
    target.repr.bits.color_depth = 2;
    TEST_PARAMS(dither, transfer, PL_COLOR_TRC_GAMMA22);
    target.repr.bits.color_depth = 32;

    // Test HDR tone mapping
    image.color = pl_color_space_hdr10;
    TEST_PARAMS(color_map, visualize_lut, true);
    if (gpu->limits.max_ssbo_size)
        TEST_PARAMS(peak_detect, allow_delayed, true);

    // Test inverse tone-mapping and pure BPC
    image.color.hdr.max_luma = 1000;
    target.color.hdr.max_luma = 4000;
    target.color.hdr.min_luma = 0.02;
    TEST_PARAMS(color_map, inverse_tone_mapping, true);

    image.color = pl_color_space_srgb;
    target.color = pl_color_space_srgb;

    // Test some misc stuff
    struct pl_render_params params = pl_render_default_params;
    params.color_adjustment = &(struct pl_color_adjustment) {
        .brightness = 0.1,
        .contrast = 0.9,
        .saturation = 1.5,
        .gamma = 0.8,
        .temperature = 0.3,
    };
    REQUIRE(pl_render_image(rr, &image, &target, &params));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    params = pl_render_default_params;

    struct pl_frame inferred_image = image, inferred_target = target;
    pl_frames_infer(rr, &inferred_image, &inferred_target);
    REQUIRE(pl_render_image(rr, &inferred_image, &inferred_target, &params));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);

    // Test background blending and alpha transparency
    params.blend_against_tiles = true;
    params.corner_rounding = 0.25f;
    REQUIRE(pl_render_image(rr, &image, &target, &params));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    params = pl_render_default_params;

    // Test film grain synthesis
    image.film_grain.type = PL_FILM_GRAIN_AV1;
    image.film_grain.params.av1 = av1_grain_data;
    REQUIRE(pl_render_image(rr, &image, &target, &params));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);

    image.film_grain.type = PL_FILM_GRAIN_H274;
    image.film_grain.params.h274 = h274_grain_data;
    REQUIRE(pl_render_image(rr, &image, &target, &params));
    // H.274 film grain synthesis requires compute shaders
    if (gpu->glsl.compute) {
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    } else {
        const struct pl_render_errors rr_err = pl_renderer_get_errors(rr);
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_FILM_GRAIN);
        pl_renderer_reset_errors(rr, &rr_err);
    }
    image.film_grain = (struct pl_film_grain_data) {0};

    // Test mpv-style custom shaders
    for (int i = 0; i < PL_ARRAY_SIZE(user_shader_tests); i++) {
        printf("testing user shader:\n\n%s\n", user_shader_tests[i]);
        const struct pl_hook *hook;
        hook = pl_mpv_user_shader_parse(gpu, user_shader_tests[i],
                                        strlen(user_shader_tests[i]));
        REQUIRE(hook);

        params.hooks = &hook;
        params.num_hooks = 1;
        REQUIRE(pl_render_image(rr, &image, &target, &params));
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);

        pl_mpv_user_shader_destroy(&hook);
    }

    if (gpu->glsl.compute && gpu->limits.max_ssbo_size) {
        for (int i = 0; i < PL_ARRAY_SIZE(compute_shader_tests); i++) {
            printf("testing user shader:\n\n%s\n", compute_shader_tests[i]);
            const struct pl_hook *hook;
            hook = pl_mpv_user_shader_parse(gpu, compute_shader_tests[i],
                                            strlen(compute_shader_tests[i]));
            REQUIRE(hook);

            params.hooks = &hook;
            params.num_hooks = 1;
            REQUIRE(pl_render_image(rr, &image, &target, &params));
            REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);

            pl_mpv_user_shader_destroy(&hook);
        }
    }
    params = pl_render_default_params;

    // Test custom LUTs
    for (int i = 0; i < PL_ARRAY_SIZE(test_luts); i++) {
        printf("testing custom lut %d\n", i);
        struct pl_custom_lut *lut;
        lut = pl_lut_parse_cube(gpu->log, test_luts[i], strlen(test_luts[i]));
        REQUIRE(lut);

        bool has_3dlut = gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100;
        if (lut->size[2] && !has_3dlut) {
            pl_lut_free(&lut);
            continue;
        }

        // Test all three at the same time to reduce the number of tests
        image.lut = target.lut = params.lut = lut;

        for (enum pl_lut_type t = PL_LUT_UNKNOWN; t <= PL_LUT_CONVERSION; t++) {
            printf("testing LUT method %d\n", t);
            image.lut_type = target.lut_type = params.lut_type = t;
            REQUIRE(pl_render_image(rr, &image, &target, &params));
            REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
        }

        image.lut = target.lut = params.lut = NULL;
        pl_lut_free(&lut);
    }

#ifdef PL_HAVE_LCMS

    // It doesn't fit without use of 3D textures on GLES2
    if (gpu->glsl.version > 100) {
        // Test ICC profiles
        image.profile = TEST_PROFILE(sRGB_v2_nano_icc);
        REQUIRE(pl_render_image(rr, &image, &target, &params));
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
        image.profile = (struct pl_icc_profile) {0};

        target.profile = TEST_PROFILE(sRGB_v2_nano_icc);
        REQUIRE(pl_render_image(rr, &image, &target, &params));
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
        target.profile = (struct pl_icc_profile) {0};

        image.profile = TEST_PROFILE(sRGB_v2_nano_icc);
        target.profile = image.profile;
        REQUIRE(pl_render_image(rr, &image, &target, &params));
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
        image.profile = (struct pl_icc_profile) {0};
        target.profile = (struct pl_icc_profile) {0};
    }

#endif

    // Test overlays
    image.num_overlays = 1;
    image.overlays = &(struct pl_overlay) {
        .tex = img_plane.texture,
        .mode = PL_OVERLAY_NORMAL,
        .num_parts = 2,
        .parts = (struct pl_overlay_part[]) {{
            .src = {0, 0, 2, 2},
            .dst = {30, 100, 40, 200},
        }, {
            .src = {2, 2, 5, 5},
            .dst = {1000, -1, 3, 5},
        }},
    };
    REQUIRE(pl_render_image(rr, &image, &target, &params));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    params.disable_fbos = true;
    REQUIRE(pl_render_image(rr, &image, &target, &params));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    image.num_overlays = 0;
    params = pl_render_default_params;

    target.num_overlays = 1;
    target.overlays = &(struct pl_overlay) {
        .tex = img_plane.texture,
        .mode = PL_OVERLAY_MONOCHROME,
        .num_parts = 1,
        .parts = &(struct pl_overlay_part) {
            .src = {5, 5, 15, 15},
            .dst = {5, 5, 15, 15},
            .color = {1.0, 0.5, 0.0},
        },
    };
    REQUIRE(pl_render_image(rr, &image, &target, &params));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    REQUIRE(pl_render_image(rr, NULL, &target, &params));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    target.num_overlays = 0;

    // Test rotation
    for (pl_rotation rot = 0; rot < PL_ROTATION_360; rot += PL_ROTATION_90) {
        image.rotation = rot;
        REQUIRE(pl_render_image(rr, &image, &target, &params));
        REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
    }

    // Attempt frame mixing, using the mixer queue helper
    printf("testing frame mixing \n");
    struct pl_render_params mix_params = {
        .frame_mixer = &pl_filter_mitchell_clamp,
        .info_callback = render_info_cb,
    };

    struct pl_queue_params qparams = {
        .radius = pl_frame_mix_radius(&mix_params),
        .vsync_duration = 1.0 / 60.0,
    };

    // Test large PTS jumps in frame mix
    struct pl_frame_mix mix = (struct pl_frame_mix) {
        .num_frames = 2,
        .frames = (const struct pl_frame *[]) { &image, &image },
        .signatures = (uint64_t[]) { 0xFFF1, 0xFFF2 },
        .timestamps = (float[]) { -100, 100 },
        .vsync_duration = 1.6,
    };
    REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));

    // Test inferring frame mix
    inferred_target = target;
    pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image);
    REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));

    // Test empty frame mix
    mix = (struct pl_frame_mix) {0};
    REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));

    // Test inferring empty frame mix
    inferred_target = target;
    pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image);
    REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));

    // Test mixer queue
#define NUM_MIX_FRAMES 20
    const float frame_duration = 1.0 / 24.0;
    struct pl_source_frame srcframes[NUM_MIX_FRAMES+1];
    srcframes[NUM_MIX_FRAMES] = (struct pl_source_frame) {0};
    for (int i = 0; i < NUM_MIX_FRAMES; i++) {
        srcframes[i] = (struct pl_source_frame) {
            .pts = i * frame_duration,
            .duration = frame_duration,
            .map = frame_passthrough,
            .frame_data = &image,
        };
    }

    pl_queue queue = pl_queue_create(gpu);
    enum pl_queue_status ret;

    // Test pre-pushing all frames, with delayed EOF.
    for (int i = 0; i < NUM_MIX_FRAMES; i++) {
        const struct pl_source_frame *src = &srcframes[i];
        if (i > 10) // test pushing in reverse order
            src = &srcframes[NUM_MIX_FRAMES + 10 - i];
        if (!pl_queue_push_block(queue, 1, src)) // mini-sleep
            pl_queue_push(queue, src); // push it anyway, for testing
    }

    while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
        if (ret == PL_QUEUE_MORE) {
            REQUIRE_CMP(qparams.pts, >, 0.0f, "f");
            pl_queue_push(queue, NULL); // push delayed EOF
            continue;
        }

        REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
        REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));

        // Simulate advancing vsync
        qparams.pts += qparams.vsync_duration;
    }

    // Test dynamically pulling all frames, with oversample mixer
    const struct pl_source_frame *frame_ptr = &srcframes[0];
    mix_params.frame_mixer = &pl_oversample_frame_mixer;

    qparams = (struct pl_queue_params) {
        .radius = pl_frame_mix_radius(&mix_params),
        .vsync_duration = qparams.vsync_duration,
        .get_frame = get_frame_ptr,
        .priv = &frame_ptr,
    };

    pl_queue_reset(queue);
    while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
        REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
        REQUIRE_CMP(mix.num_frames, <=, 2, "d");
        REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
        qparams.pts += qparams.vsync_duration;
    }

    // Test large PTS jump
    pl_queue_reset(queue);
    REQUIRE(pl_queue_update(queue, &mix, &qparams) == PL_QUEUE_EOF);

    // Test deinterlacing
    pl_queue_reset(queue);
    printf("testing deinterlacing \n");
    for (int i = 0; i < NUM_MIX_FRAMES; i++) {
        struct pl_source_frame *src = &srcframes[i];
        if (i > 10)
            src = &srcframes[NUM_MIX_FRAMES + 10 - i];
        src->first_field = PL_FIELD_EVEN;
        pl_queue_push(queue, src);
    }
    pl_queue_push(queue, NULL);

    qparams.pts = 0;
    qparams.get_frame = NULL;
    while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
        REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
        REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
        qparams.pts += qparams.vsync_duration;
    }

    pl_queue_destroy(&queue);

error:
    pl_renderer_destroy(&rr);
    pl_tex_destroy(gpu, &img_tex);
    pl_tex_destroy(gpu, &fbo);
}

static struct pl_hook_res noop_hook(void *priv, const struct pl_hook_params *params)
{
    return (struct pl_hook_res) {0};
}

static void pl_ycbcr_tests(pl_gpu gpu)
{
    struct pl_plane_data data[3];
    for (int i = 0; i < 3; i++) {
        const int sub = i > 0 ? 1 : 0;
        const int width = (323 + sub) >> sub;
        const int height = (255 + sub) >> sub;

        data[i] = (struct pl_plane_data) {
            .type = PL_FMT_UNORM,
            .width = width,
            .height = height,
            .component_size = {16},
            .component_map = {i},
            .pixel_stride = sizeof(uint16_t),
            .row_stride = PL_ALIGN2(width * sizeof(uint16_t),
                                    gpu->limits.align_tex_xfer_pitch),
        };
    }

    pl_fmt fmt = pl_plane_find_fmt(gpu, NULL, &data[0]);
    enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_HOST_READABLE;
    if (!fmt || (fmt->caps & caps) != caps)
        return;

    pl_renderer rr = pl_renderer_create(gpu->log, gpu);
    if (!rr)
        return;

    pl_tex src_tex[3] = {0};
    pl_tex dst_tex[3] = {0};
    struct pl_frame img = {
        .num_planes = 3,
        .repr = pl_color_repr_hdtv,
        .color = pl_color_space_bt709,
    };

    struct pl_frame target = {
        .num_planes = 3,
        .repr = pl_color_repr_hdtv,
        .color = pl_color_space_bt709,
    };

    uint8_t *src_buffer[3] = {0};
    uint8_t *dst_buffer = NULL;
    for (int i = 0; i < 3; i++) {
        // Generate some arbitrary data for the buffer
        src_buffer[i] = malloc(data[i].height * data[i].row_stride);
        if (!src_buffer[i])
            goto error;

        data[i].pixels = src_buffer[i];
        for (int y = 0; y < data[i].height; y++) {
            for (int x = 0; x < data[i].width; x++) {
                size_t off = y * data[i].row_stride + x * data[i].pixel_stride;
                uint16_t *pixel = (uint16_t *) &src_buffer[i][off];
                int gx = 200 + 100 * i, gy = 300 + 150 * i;
                *pixel = (gx * x) ^ (gy * y); // whatever
            }
        }

        REQUIRE(pl_upload_plane(gpu, &img.planes[i], &src_tex[i], &data[i]));
    }

    // This co-sites chroma pixels with pixels in the RGB image, meaning we
    // get an exact round-trip when sampling both ways. This makes it useful
    // as a test case, even though it's not common in the real world.
    pl_frame_set_chroma_location(&img, PL_CHROMA_TOP_LEFT);

    for (int i = 0; i < 3; i++) {
        dst_tex[i] = pl_tex_create(gpu, &(struct pl_tex_params) {
            .format = fmt,
            .w = data[i].width,
            .h = data[i].height,
            .renderable = true,
            .host_readable = true,
            .storable = fmt->caps & PL_FMT_CAP_STORABLE,
            .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE,
        });

        if (!dst_tex[i])
            goto error;

        target.planes[i] = img.planes[i];
        target.planes[i].texture = dst_tex[i];
    }

    REQUIRE(pl_render_image(rr, &img, &target, &(struct pl_render_params) {
        .num_hooks = 1,
        .hooks = &(const struct pl_hook *){&(struct pl_hook) {
            // Forces chroma merging, to test the chroma merging code
            .stages = PL_HOOK_CHROMA_INPUT,
            .hook = noop_hook,
        }},
    }));
    REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);

    size_t buf_size = data[0].height * data[0].row_stride;
    dst_buffer = malloc(buf_size);
    if (!dst_buffer)
        goto error;

    for (int i = 0; i < 3; i++) {
        memset(dst_buffer, 0xAA, buf_size);
        REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) {
            .tex = dst_tex[i],
            .ptr = dst_buffer,
            .row_pitch = data[i].row_stride,
        }));

        for (int y = 0; y < data[i].height; y++) {
            for (int x = 0; x < data[i].width; x++) {
                size_t off = y * data[i].row_stride + x * data[i].pixel_stride;
                uint16_t *src_pixel = (uint16_t *) &src_buffer[i][off];
                uint16_t *dst_pixel = (uint16_t *) &dst_buffer[off];
                int diff = abs((int) *src_pixel - (int) *dst_pixel);
                REQUIRE_CMP(diff, <=, 50, "d"); // a little under 0.1%
            }
        }
    }

error:
    pl_renderer_destroy(&rr);
    free(dst_buffer);
    for (int i = 0; i < 3; i++) {
        free(src_buffer[i]);
        pl_tex_destroy(gpu, &src_tex[i]);
        pl_tex_destroy(gpu, &dst_tex[i]);
    }
}

static void pl_test_export_import(pl_gpu gpu,
                                  enum pl_handle_type handle_type)
{
    // Test texture roundtrip

    if (!(gpu->export_caps.tex & handle_type) ||
        !(gpu->import_caps.tex & handle_type))
        goto skip_tex;

    pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 0, 0, PL_FMT_CAP_BLITTABLE);
    if (!fmt)
        goto skip_tex;

    printf("testing texture import/export with fmt %s\n", fmt->name);

    pl_tex export = pl_tex_create(gpu, &(struct pl_tex_params) {
        .w = 32,
        .h = 32,
        .format = fmt,
        .export_handle = handle_type,
    });
    REQUIRE(export);
    REQUIRE_HANDLE(export->shared_mem, handle_type);

    pl_tex import = pl_tex_create(gpu, &(struct pl_tex_params) {
        .w = export->params.w,
        .h = export->params.h,
        .format = fmt,
        .import_handle = handle_type,
        .shared_mem = export->shared_mem,
    });
    REQUIRE(import);

    pl_tex_destroy(gpu, &import);
    pl_tex_destroy(gpu, &export);

skip_tex: ;

    // Test buffer roundtrip

    if (!(gpu->export_caps.buf & handle_type) ||
        !(gpu->import_caps.buf & handle_type))
        return;

    printf("testing buffer import/export\n");

    pl_buf exp_buf = pl_buf_create(gpu, &(struct pl_buf_params) {
        .size = 32,
        .export_handle = handle_type,
    });
    REQUIRE(exp_buf);
    REQUIRE_HANDLE(exp_buf->shared_mem, handle_type);

    pl_buf imp_buf = pl_buf_create(gpu, &(struct pl_buf_params) {
        .size = 32,
        .import_handle = handle_type,
        .shared_mem = exp_buf->shared_mem,
    });
    REQUIRE(imp_buf);

    pl_buf_destroy(gpu, &imp_buf);
    pl_buf_destroy(gpu, &exp_buf);
}

static void pl_test_host_ptr(pl_gpu gpu)
{
    if (!(gpu->import_caps.buf & PL_HANDLE_HOST_PTR))
        return;

#ifdef __unix__

    printf("testing host ptr\n");
    REQUIRE(gpu->limits.max_mapped_size);

    const size_t size = 2 << 20;
    const size_t offset = 2 << 10;
    const size_t slice = 2 << 16;

    uint8_t *data = aligned_alloc(0x1000, size);
    for (int i = 0; i < size; i++)
        data[i] = (uint8_t) i;

    pl_buf buf = pl_buf_create(gpu, &(struct pl_buf_params) {
        .size = slice,
        .import_handle = PL_HANDLE_HOST_PTR,
        .shared_mem = {
            .handle.ptr = data,
            .size = size,
            .offset = offset,
        },
        .host_mapped = true,
    });

    REQUIRE(buf);
    REQUIRE_MEMEQ(data + offset, buf->data, slice);

    pl_buf_destroy(gpu, &buf);
    free(data);

#endif // unix
}

static void gpu_shader_tests(pl_gpu gpu)
{
    pl_buffer_tests(gpu);
    pl_texture_tests(gpu);
    pl_planar_tests(gpu);
    pl_shader_tests(gpu);
    pl_scaler_tests(gpu);
    pl_render_tests(gpu);
    pl_ycbcr_tests(gpu);

    REQUIRE(!pl_gpu_is_failed(gpu));
}

static void gpu_interop_tests(pl_gpu gpu)
{
    pl_test_export_import(gpu, PL_HANDLE_DMA_BUF);
    pl_test_host_ptr(gpu);

    REQUIRE(!pl_gpu_is_failed(gpu));
}