#include "tests.h"

#include <libplacebo/dispatch.h>
#include <libplacebo/vulkan.h>
#include <libplacebo/shaders/colorspace.h>
#include <libplacebo/shaders/deinterlacing.h>
#include <libplacebo/shaders/sampling.h>

// Compile-time configuration of the benchmark. Declared as enum constants so
// they are integer constant expressions (they are used as array sizes below).
enum {
    // Image configuration
    NUM_TEX     = 16,       // number of render targets cycled through per benchmark
    WIDTH       = 2048,     // width of the source image and of every render target
    HEIGHT      = 2048,     // height of the source image and of every render target
    DEPTH       = 16,       // depth argument handed to pl_find_fmt() (presumably bits per component)
    COMPS       = 4,        // number of components per pixel

    // Queue configuration
    NUM_QUEUES  = NUM_TEX,  // passed as `queue_count` to pl_vulkan_create()
    ASYNC_TX    = 1,        // passed as `async_transfer` to pl_vulkan_create()
    ASYNC_COMP  = 1,        // passed as `async_compute` to pl_vulkan_create()

    // Test configuration
    TEST_MS     = 1000,     // duration of the measured phase, in milliseconds
    WARMUP_MS   = 500,      // duration of the unmeasured warm-up phase, in milliseconds
};

// Creates the sampleable WIDTH x HEIGHT source texture used by the shader
// benchmarks and fills it with a synthetic radial pattern.
//
// Each color channel is `0.5 * sin(freq * r2) + 0.5`, where `r2` is the squared
// distance of the pixel from the image center and `freq` shrinks by a constant
// factor (`invphi`) from one channel to the next. The fourth channel, if
// present, is set to 1.
//
// Fails the test (via REQUIRE) if no suitable format is found, if the host
// staging buffer cannot be allocated, or if texture creation fails. The
// returned texture is owned by the caller.
static pl_tex create_test_img(pl_gpu gpu)
{
    pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, DEPTH, 32, PL_FMT_CAP_LINEAR);
    REQUIRE(fmt);

    const float xc = (WIDTH  - 1) / 2.0f;
    const float yc = (HEIGHT - 1) / 2.0f;
    const float kf = 0.5f / sqrtf(xc * xc + yc * yc);
    const float invphi = 0.61803398874989;
    const float freqR = kf * M_PI * 0.2f;
    const float freqG = freqR * invphi;
    const float freqB = freqG * invphi;

    // Put sizeof first so the whole product is evaluated in size_t, and make
    // sure the allocation succeeded before writing tens of megabytes into it
    float *data = malloc(sizeof(*data) * WIDTH * HEIGHT * COMPS);
    REQUIRE(data);

    for (int y = 0; y < HEIGHT; y++) {
        for (int x = 0; x < WIDTH; x++) {
            float *color = &data[((size_t) y * WIDTH + x) * COMPS];
            float xx = x - xc, yy = y - yc;
            float r2 = xx * xx + yy * yy;

            // Deliberate cascade: start at the highest channel that exists
            // and fall through to fill every lower one
            switch (COMPS) {
            case 4:
                color[3] = 1.0f;
                // fall through
            case 3:
                color[2] = 0.5f * sinf(freqB * r2) + 0.5f;
                // fall through
            case 2:
                color[1] = 0.5f * sinf(freqG * r2) + 0.5f;
                // fall through
            case 1:
                color[0] = 0.5f * sinf(freqR * r2) + 0.5f;
            }
        }
    }

    pl_tex tex = pl_tex_create(gpu, pl_tex_params(
        .format         = fmt,
        .w              = WIDTH,
        .h              = HEIGHT,
        .sampleable     = true,
        .initial_data   = data,
    ));

    // The staging buffer is released before the result is checked, so it is
    // not leaked when REQUIRE fails
    free(data);
    REQUIRE(tex);
    return tex;
}

// Describes one benchmark. run_bench() requires at least one callback to be
// set and uses `run_sh` when it is non-NULL, otherwise `run_tex`.
struct bench {
    // Shader benchmark: records the operations to measure into `sh`. `state`
    // is a shader object slot that persists across calls of one benchmark,
    // `src` is the source texture to read from.
    void (*run_sh)(pl_shader sh, pl_shader_obj *state,
                   pl_tex src);

    // Texture benchmark: performs an operation directly on `tex`
    void (*run_tex)(pl_gpu gpu, pl_tex tex);
};

// Executes a single iteration of `bench`.
//
// A texture benchmark (`run_tex`, used only when `run_sh` is NULL) is simply
// invoked on `fbo`. A shader benchmark (`run_sh`) records into a fresh shader
// obtained from `dp`, which is then dispatched onto `fbo` with `timer`
// attached; `timer` may be NULL. `state` and `src` are forwarded to `run_sh`
// untouched. The result of the dispatch is not inspected.
static void run_bench(pl_gpu gpu, pl_dispatch dp,
                      pl_shader_obj *state, pl_tex src,
                      pl_tex fbo, pl_timer timer,
                      const struct bench *bench)
{
    REQUIRE(bench);
    REQUIRE(bench->run_sh || bench->run_tex);

    // No shader callback: this is a plain texture operation
    if (!bench->run_sh) {
        bench->run_tex(gpu, fbo);
        return;
    }

    pl_shader shader = pl_dispatch_begin(dp);
    bench->run_sh(shader, state, src);

    pl_dispatch_finish(dp, pl_dispatch_params(
        .timer  = timer,
        .target = fbo,
        .shader = &shader,
    ));
}

// Runs `bench` repeatedly on `gpu` and prints one result line labelled `name`.
//
// Sequence: create a dispatch object, the source image and NUM_TEX cleared
// render targets; run the benchmark once and wait for the GPU; then loop for
// WARMUP_MS of unmeasured warm-up followed by TEST_MS of measured frames,
// cycling through the render targets. The printed line reports the number of
// measured frames, wall-clock time, ms/frame and FPS, plus the mean timer
// result when at least one timer sample was obtained. All resources created
// here are destroyed before returning.
static void benchmark(pl_gpu gpu, const char *name,
                      const struct bench *bench)
{
    pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
    REQUIRE(dp);
    // Shader object slot handed to every run_sh() call of this benchmark
    pl_shader_obj state = NULL;
    pl_tex src = create_test_img(gpu);

    // Create the FBOs
    pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, DEPTH, 32,
                             PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE);
    REQUIRE(fmt);

    pl_tex fbos[NUM_TEX] = {0};
    for (int i = 0; i < NUM_TEX; i++) {
        fbos[i] = pl_tex_create(gpu, pl_tex_params(
            .format         = fmt,
            .w              = WIDTH,
            .h              = HEIGHT,
            .renderable     = true,
            .blit_dst       = true,
            .host_writable  = true,
            .host_readable  = true,
            // Only request storage usage when the chosen format supports it
            .storable       = !!(fmt->caps & PL_FMT_CAP_STORABLE),
        ));
        REQUIRE(fbos[i]);

        pl_tex_clear(gpu, fbos[i], (float[4]){ 0.0 });
    }

    // Run the benchmark and flush+block once to force shader compilation etc.
    run_bench(gpu, dp, &state, src, fbos[0], NULL, bench);
    pl_gpu_finish(gpu);

    // Perform the actual benchmark
    // `start_test` stays 0 during warm-up and doubles as the "measuring" flag
    pl_clock_t start_warmup = 0, start_test = 0;
    unsigned long frames = 0, frames_warmup = 0;

    pl_timer timer = pl_timer_create(gpu);
    uint64_t gputime_total = 0;
    unsigned long gputime_count = 0;
    uint64_t gputime;

    start_warmup = pl_clock_now();
    do {
        // Round-robin over the render targets
        const int idx = frames % NUM_TEX;
        // Keep polling until pl_tex_poll() returns false (presumably meaning
        // the texture is no longer in use by the GPU - TODO confirm)
        while (pl_tex_poll(gpu, fbos[idx], UINT64_MAX))
            ; // do nothing
        // The timer is only attached once the measured phase has started
        run_bench(gpu, dp, &state, src, fbos[idx], start_test ? timer : NULL, bench);
        pl_gpu_flush(gpu);
        frames++;

        if (start_test) {
            // Collect every timer sample that is available; 0 ends the loop
            while ((gputime = pl_timer_query(gpu, timer))) {
                gputime_total += gputime;
                gputime_count++;
            }
        }

        // The durations are in milliseconds; the `* 1e-3` converts them to
        // seconds for comparison against pl_clock_diff()
        pl_clock_t now = pl_clock_now();
        if (start_test) {
            if (pl_clock_diff(now, start_test) > TEST_MS * 1e-3)
                break;
        } else if (pl_clock_diff(now, start_warmup) > WARMUP_MS * 1e-3) {
            // Warm-up is over: start measuring from here and remember how
            // many frames must be excluded from the result
            start_test = now;
            frames_warmup = frames;
        }
    } while (true);

    // Force the GPU to finish execution and re-measure the final stop time
    pl_gpu_finish(gpu);

    pl_clock_t stop = pl_clock_now();
    // Pick up the timer samples that only became available after the finish
    while ((gputime = pl_timer_query(gpu, timer))) {
        gputime_total += gputime;
        gputime_count++;
    }

    // At least one frame runs after `start_test` is set, so `frames` is
    // non-zero after the subtraction
    frames -= frames_warmup;
    double secs = pl_clock_diff(stop, start_test);
    printf("'%s':\t%4lu frames in %1.6f seconds => %2.6f ms/frame (%5.2f FPS)",
          name, frames, secs, 1000 * secs / frames, frames / secs);
    // The 1e-6 factor converts the averaged timer value to the printed "ms"
    // (i.e. timer results are presumably nanoseconds - TODO confirm)
    if (gputime_count)
        printf(", gpu time: %2.6f ms", 1e-6 * gputime_total / gputime_count);
    printf("\n");

    pl_timer_destroy(gpu, &timer);
    pl_shader_obj_destroy(&state);
    pl_dispatch_destroy(&dp);
    pl_tex_destroy(gpu, &src);
    for (int i = 0; i < NUM_TEX; i++)
        pl_tex_destroy(gpu, &fbos[i]);
}

// List of benchmarks
// Shader benchmark: pl_shader_deband() on `src` with a NULL parameter pointer
// (presumably selecting the library defaults). `state` is unused.
static void bench_deband(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    pl_shader_deband(sh, pl_sample_src( .tex = src ), NULL);
}

// Shader benchmark: pl_shader_deband() on `src` with explicitly specified
// iteration count, threshold, radius and grain. `state` is unused.
static void bench_deband_heavy(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params(
        .iterations = 4,
        .threshold  = 4.0,
        .radius     = 4.0,
        .grain      = 16.0,
    ));
}

// Shader benchmark: samples `src` with pl_shader_sample_bilinear(). The call
// must succeed (checked with REQUIRE). `state` is unused.
static void bench_bilinear(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_bilinear(sh, pl_sample_src( .tex = src )));
}

// Shader benchmark: samples `src` with pl_shader_sample_bicubic(). The call
// must succeed (checked with REQUIRE). `state` is unused.
static void bench_bicubic(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_bicubic(sh, pl_sample_src( .tex = src )));
}

// Shader benchmark: samples `src` with pl_shader_sample_hermite(). The call
// must succeed (checked with REQUIRE). `state` is unused.
static void bench_hermite(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_hermite(sh, pl_sample_src( .tex = src )));
}

// Shader benchmark: samples `src` with pl_shader_sample_gaussian(). The call
// must succeed (checked with REQUIRE). `state` is unused.
static void bench_gaussian(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_gaussian(sh, pl_sample_src( .tex = src )));
}

// Shader benchmark: direct sampling of `src` followed by pl_shader_dither()
// with the PL_DITHER_BLUE_NOISE method. The literal 8 is the second argument
// of pl_shader_dither() (presumably the target bit depth - TODO confirm);
// `state` is passed on to the dither call.
static void bench_dither_blue(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
    pl_shader_dither(sh, 8, state, pl_dither_params(
        .method = PL_DITHER_BLUE_NOISE,
    ));
}

// Shader benchmark: direct sampling of `src` followed by pl_shader_dither()
// with the PL_DITHER_WHITE_NOISE method. The literal 8 is the second argument
// of pl_shader_dither() (presumably the target bit depth - TODO confirm);
// `state` is passed on to the dither call.
static void bench_dither_white(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
    pl_shader_dither(sh, 8, state, pl_dither_params(
        .method = PL_DITHER_WHITE_NOISE,
    ));
}

// Shader benchmark: direct sampling of `src` followed by pl_shader_dither()
// with the PL_DITHER_ORDERED_FIXED method. The literal 8 is the second
// argument of pl_shader_dither() (presumably the target bit depth - TODO
// confirm); `state` is passed on to the dither call.
static void bench_dither_ordered_fix(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
    pl_shader_dither(sh, 8, state, pl_dither_params(
        .method = PL_DITHER_ORDERED_FIXED,
    ));
}

// Shader benchmark: samples `src` with pl_shader_sample_polar() using the
// pl_filter_ewa_lanczos filter. `state` is supplied as the filter's `lut`
// slot. The call must succeed (checked with REQUIRE).
static void bench_polar(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_filter_params params = {
        .filter = pl_filter_ewa_lanczos,
        .lut = state,
    };

    REQUIRE(pl_shader_sample_polar(sh, pl_sample_src( .tex = src ), &params));
}

// Shader benchmark: same as bench_polar(), but with `no_compute` set in the
// filter parameters (presumably forcing the non-compute-shader code path -
// TODO confirm). The call must succeed (checked with REQUIRE).
static void bench_polar_nocompute(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_filter_params params = {
        .filter = pl_filter_ewa_lanczos,
        .no_compute = true,
        .lut = state,
    };

    REQUIRE(pl_shader_sample_polar(sh, pl_sample_src( .tex = src ), &params));
}

// Shader benchmark: direct sampling of `src` followed by
// pl_shader_detect_peak() for the pl_color_space_hdr10 color space with
// pl_peak_detect_default_params. Both calls must succeed (checked with
// REQUIRE); `state` is passed on to the peak detection call.
static void bench_hdr_peak(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
    REQUIRE(pl_shader_detect_peak(sh, pl_color_space_hdr10, state, &pl_peak_detect_default_params));
}

// Shader benchmark: same as bench_hdr_peak(), but using
// pl_peak_detect_high_quality_params instead of the default parameters.
static void bench_hdr_peak_hq(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
    REQUIRE(pl_shader_detect_peak(sh, pl_color_space_hdr10, state, &pl_peak_detect_high_quality_params));
}

// Shader benchmark: direct sampling of `src` followed by color mapping from
// pl_color_space_hdr10 to pl_color_space_monitor, with pl_tone_map_bt2390 as
// the tone-mapping function in PL_TONE_MAP_RGB mode. `state` is passed on to
// the color mapping call.
static void bench_hdr_lut(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));

    // PL_COLOR_MAP_DEFAULTS has to stay first: the designators that follow
    // override individual fields of it
    struct pl_color_map_params cmap = {
        PL_COLOR_MAP_DEFAULTS
        .tone_mapping_mode      = PL_TONE_MAP_RGB,
        .tone_mapping_function  = &pl_tone_map_bt2390,
    };

    pl_shader_color_map_ex(sh, &cmap, pl_color_map_args(
        .state = state,
        .dst = pl_color_space_monitor,
        .src = pl_color_space_hdr10,
    ));
}

// Shader benchmark: direct sampling of `src` followed by color mapping from
// pl_color_space_hdr10 to pl_color_space_monitor, with pl_tone_map_clip as
// the tone-mapping function in PL_TONE_MAP_RGB mode. `state` is passed on to
// the color mapping call.
static void bench_hdr_clip(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));

    // PL_COLOR_MAP_DEFAULTS has to stay first: the designators that follow
    // override individual fields of it
    struct pl_color_map_params cmap = {
        PL_COLOR_MAP_DEFAULTS
        .tone_mapping_mode      = PL_TONE_MAP_RGB,
        .tone_mapping_function  = &pl_tone_map_clip,
    };

    pl_shader_color_map_ex(sh, &cmap, pl_color_map_args(
        .state = state,
        .dst = pl_color_space_monitor,
        .src = pl_color_space_hdr10,
    ));
}

// Shader benchmark: pl_shader_deinterlace() with the PL_DEINTERLACE_WEAVE
// algorithm. Only the current field pair is supplied, built from `src`, with
// PL_FIELD_TOP as the field. `state` is unused.
static void bench_weave(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_deinterlace_source fields = {
        .field = PL_FIELD_TOP,
        .cur = pl_field_pair(src),
    };

    pl_shader_deinterlace(sh, &fields, pl_deinterlace_params(
        .algo = PL_DEINTERLACE_WEAVE,
    ));
}

// Shader benchmark: pl_shader_deinterlace() with the PL_DEINTERLACE_BOB
// algorithm. Only the current field pair is supplied, built from `src`, with
// PL_FIELD_TOP as the field. `state` is unused.
static void bench_bob(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_deinterlace_source fields = {
        .field = PL_FIELD_TOP,
        .cur = pl_field_pair(src),
    };

    pl_shader_deinterlace(sh, &fields, pl_deinterlace_params(
        .algo = PL_DEINTERLACE_BOB,
    ));
}

// Shader benchmark: pl_shader_deinterlace() with the PL_DEINTERLACE_YADIF
// algorithm. The same texture `src` stands in for the previous, current and
// next field pairs, with PL_FIELD_TOP as the field. `state` is unused.
static void bench_yadif(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_deinterlace_source fields = {
        .field = PL_FIELD_TOP,
        .next = pl_field_pair(src),
        .cur = pl_field_pair(src),
        .prev = pl_field_pair(src),
    };

    pl_shader_deinterlace(sh, &fields, pl_deinterlace_params(
        .algo = PL_DEINTERLACE_YADIF,
    ));
}

// Shader benchmark: pl_shader_film_grain() with AV1 grain parameters taken
// from `av1_grain_data` (defined outside this file section), applied to the
// first three components of `src`. A new rand() seed is drawn on every call.
// The call must succeed (checked with REQUIRE).
static void bench_av1_grain(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_film_grain_params params = {
        .data = {
            .type = PL_FILM_GRAIN_AV1,
            .params.av1 = av1_grain_data,
            .seed = rand(),
        },
        .tex = src,
        .components = 3,
        .component_mapping = {0, 1, 2},
        // Zero-initialized color representation
        .repr = &(struct pl_color_repr) {0},
    };

    REQUIRE(pl_shader_film_grain(sh, state, &params));
}

// Shader benchmark: identical setup to bench_av1_grain(), except that the
// `overlap` flag of the AV1 grain parameters is switched on before the call.
// The call must succeed (checked with REQUIRE).
static void bench_av1_grain_lap(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_film_grain_params params = {
        .data = {
            .type = PL_FILM_GRAIN_AV1,
            .params.av1 = av1_grain_data,
            .seed = rand(),
        },
        .tex = src,
        .components = 3,
        .component_mapping = {0, 1, 2},
        // Zero-initialized color representation
        .repr = &(struct pl_color_repr) {0},
    };

    // Modify the local copy only; the shared `av1_grain_data` is left alone
    params.data.params.av1.overlap = true;
    REQUIRE(pl_shader_film_grain(sh, state, &params));
}

// Shader benchmark: pl_shader_film_grain() with H.274 grain parameters taken
// from `h274_grain_data` (defined outside this file section), applied to the
// first three components of `src`. A new rand() seed is drawn on every call.
// The call must succeed (checked with REQUIRE).
static void bench_h274_grain(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_film_grain_params params = {
        .data = {
            .type = PL_FILM_GRAIN_H274,
            .params.h274 = h274_grain_data,
            .seed = rand(),
        },
        .tex = src,
        .components = 3,
        .component_mapping = {0, 1, 2},
        // Zero-initialized color representation
        .repr = &(struct pl_color_repr) {0},
    };

    REQUIRE(pl_shader_film_grain(sh, state, &params));
}

// Shader benchmark: direct sampling of `src` followed by
// pl_shader_dovi_reshape() with inline metadata that sets `method` to 0 for
// every piece (presumably selecting polynomial reshaping, going by the
// `poly_coeffs` tables - TODO confirm). The first component has 8 pivots
// (7 pieces), the other two have a single piece each. `state` is unused.
static void bench_reshape_poly(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
    pl_shader_dovi_reshape(sh, &(struct pl_dovi_metadata) { .comp = {
        {
            .num_pivots = 8,
            .pivots = {0.0, 0.00488758553, 0.0420332365, 0.177908108,
                       0.428152502, 0.678396881, 0.92864126, 1.0},
            .method = {0, 0, 0, 0, 0, 0, 0},
            // One row of coefficients per piece between adjacent pivots
            .poly_coeffs = {
                {0.00290930271, 2.30019712, 50.1446037},
                {0.00725257397, 1.88119054, -4.49443769},
                {0.0150123835, 1.61106598, -1.64833081},
                {0.0498571396, 1.2059114, -0.430627108},
                {0.0878019333, 1.01845241, -0.19669354},
                {0.120447636, 0.920134187, -0.122338772},
                {2.12430835, -3.30913281, 2.10893941},
            },
        }, {
            .num_pivots = 2,
            .pivots = {0.0, 1.0},
            .method = {0},
            .poly_coeffs = {{-0.397901177, 1.85908031, 0}},
        }, {
            .num_pivots = 2,
            .pivots = {0.0, 1.0},
            .method = {0},
            .poly_coeffs = {{-0.399355531, 1.85591626, 0}},
        },
    }});
}

// Shader benchmark: direct sampling of `src` followed by
// pl_shader_dovi_reshape() using the shared `dovi_meta` metadata (defined
// outside this file section). `state` is unused.
static void bench_reshape_mmr(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
    pl_shader_dovi_reshape(sh, &dovi_meta); // this includes MMR
}

// Host memory used as source/destination by the texture transfer benchmarks.
// The 8192 extra floats leave room for rounding the start address up to the
// next multiple of 4096 with PL_ALIGN().
static float data[WIDTH * HEIGHT * COMPS + 8192];

// Texture benchmark: downloads `tex` into the shared `data` buffer, starting
// at its first 4096-aligned address. No completion callback is given. The
// call must succeed (checked with REQUIRE).
static void bench_download(pl_gpu gpu, pl_tex tex)
{
    REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
        .tex = tex,
        .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
    )));
}

// Texture benchmark: uploads to `tex` from the shared `data` buffer, starting
// at its first 4096-aligned address. No completion callback is given. The
// call must succeed (checked with REQUIRE).
static void bench_upload(pl_gpu gpu, pl_tex tex)
{
    REQUIRE(pl_tex_upload(gpu, pl_tex_transfer_params(
        .tex = tex,
        .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
    )));
}

// No-op callback, supplied as `.callback` by the *_async transfer benchmarks
static void dummy_cb(void *arg) {}

// Texture benchmark: same transfer as bench_download(), but with a (no-op)
// completion callback supplied (presumably making the transfer asynchronous,
// going by the benchmark's name - TODO confirm).
static void bench_download_async(pl_gpu gpu, pl_tex tex)
{
    REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
        .tex = tex,
        .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
        .callback = dummy_cb,
    )));
}

// Texture benchmark: same transfer as bench_upload(), but with a (no-op)
// completion callback supplied (presumably making the transfer asynchronous,
// going by the benchmark's name - TODO confirm).
static void bench_upload_async(pl_gpu gpu, pl_tex tex)
{
    REQUIRE(pl_tex_upload(gpu, pl_tex_transfer_params(
        .tex = tex,
        .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
        .callback = dummy_cb,
    )));
}

int main()
{
    setbuf(stdout, NULL);
    setbuf(stderr, NULL);

    pl_log log = pl_log_create(PL_API_VER, pl_log_params(
        .log_cb     = isatty(fileno(stdout)) ? pl_log_color : pl_log_simple,
        .log_level  = PL_LOG_WARN,
    ));

    pl_vulkan vk = pl_vulkan_create(log, pl_vulkan_params(
        .allow_software = true,
        .async_transfer = ASYNC_TX,
        .async_compute  = ASYNC_COMP,
        .queue_count    = NUM_QUEUES,
    ));

    if (!vk)
        return SKIP;

#define BENCH_SH(fn)  &(struct bench) { .run_sh = fn }
#define BENCH_TEX(fn) &(struct bench) { .run_tex = fn }

    printf("= Running benchmarks =\n");
    benchmark(vk->gpu, "tex_download ptr", BENCH_TEX(bench_download));
    benchmark(vk->gpu, "tex_download ptr async", BENCH_TEX(bench_download_async));
    benchmark(vk->gpu, "tex_upload ptr", BENCH_TEX(bench_upload));
    benchmark(vk->gpu, "tex_upload ptr async", BENCH_TEX(bench_upload_async));
    benchmark(vk->gpu, "bilinear", BENCH_SH(bench_bilinear));
    benchmark(vk->gpu, "bicubic", BENCH_SH(bench_bicubic));
    benchmark(vk->gpu, "hermite", BENCH_SH(bench_hermite));
    benchmark(vk->gpu, "gaussian", BENCH_SH(bench_gaussian));
    benchmark(vk->gpu, "deband", BENCH_SH(bench_deband));
    benchmark(vk->gpu, "deband_heavy", BENCH_SH(bench_deband_heavy));

    // Deinterlacing
    benchmark(vk->gpu, "weave", BENCH_SH(bench_weave));
    benchmark(vk->gpu, "bob", BENCH_SH(bench_bob));
    benchmark(vk->gpu, "yadif", BENCH_SH(bench_yadif));

    // Polar sampling
    benchmark(vk->gpu, "polar", BENCH_SH(bench_polar));
    if (vk->gpu->glsl.compute)
        benchmark(vk->gpu, "polar_nocompute", BENCH_SH(bench_polar_nocompute));

    // Dithering algorithms
    benchmark(vk->gpu, "dither_blue", BENCH_SH(bench_dither_blue));
    benchmark(vk->gpu, "dither_white", BENCH_SH(bench_dither_white));
    benchmark(vk->gpu, "dither_ordered_fixed", BENCH_SH(bench_dither_ordered_fix));

    // HDR peak detection
    if (vk->gpu->glsl.compute) {
        benchmark(vk->gpu, "hdr_peakdetect",    BENCH_SH(bench_hdr_peak));
        benchmark(vk->gpu, "hdr_peakdetect_hq", BENCH_SH(bench_hdr_peak_hq));
    }

    // Tone mapping
    benchmark(vk->gpu, "hdr_lut", BENCH_SH(bench_hdr_lut));
    benchmark(vk->gpu, "hdr_clip", BENCH_SH(bench_hdr_clip));

    // Misc stuff
    benchmark(vk->gpu, "av1_grain", BENCH_SH(bench_av1_grain));
    benchmark(vk->gpu, "av1_grain_lap", BENCH_SH(bench_av1_grain_lap));
    benchmark(vk->gpu, "h274_grain", BENCH_SH(bench_h274_grain));
    benchmark(vk->gpu, "reshape_poly", BENCH_SH(bench_reshape_poly));
    benchmark(vk->gpu, "reshape_mmr", BENCH_SH(bench_reshape_mmr));

    pl_vulkan_destroy(&vk);
    pl_log_destroy(&log);
    return 0;
}