/* GPU->GPU transfer benchmarks. Requires some manual setup.
 *
 * License: CC0 / Public Domain
 */

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <libplacebo/gpu.h>
#include <libplacebo/vulkan.h>

#include "pl_clock.h"

#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))

enum {
    // Image configuration
    NUM_TEX = 16,
    WIDTH   = 1920,
    HEIGHT  = 1080,
    DEPTH   = 16,
    COMPS   = 1,

    // Queue configuration
    NUM_QUEUES = NUM_TEX,
    ASYNC_TX   = 1,
    ASYNC_COMP = 1,

    // Buffer configuration
    PTR_ALIGN    = 4096,
    PIXEL_PITCH  = DEPTH / 8,
    ROW_PITCH    = ALIGN2(WIDTH * PIXEL_PITCH, 256),
    IMAGE_SIZE   = ROW_PITCH * HEIGHT,
    BUFFER_SIZE  = IMAGE_SIZE + PTR_ALIGN - 1,

    // Test configuration
    TEST_MS    = 1500,
    WARMUP_MS  = 500,
    POLL_FREQ  = 10,
};

static uint8_t* page_align(uint8_t *data)
{
    return (uint8_t *) ALIGN2((uintptr_t) data, PTR_ALIGN);
}

enum mem_owner {
    CPU,
    SRC,
    DST,
    NUM_MEM_OWNERS,
};

enum mem_type {
    RAM,
    GPU,
    NUM_MEM_TYPES,
};

// This is attached to every `pl_tex.params.user_data`
struct buffers {
    pl_gpu gpu;
    pl_buf buf[NUM_MEM_TYPES];
    pl_buf exported[NUM_MEM_TYPES];
    pl_buf imported[NUM_MEM_TYPES];
    struct pl_tex_transfer_params async;
};

static struct buffers *alloc_buffers(pl_gpu gpu)
{
    struct buffers *buffers = malloc(sizeof(*buffers));
    *buffers = (struct buffers) { .gpu = gpu };

    for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) {
        buffers->buf[type] = pl_buf_create(gpu, pl_buf_params(
            .size          = BUFFER_SIZE,
            .memory_type   = type == RAM ? PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE,
            .host_mapped   = true,
        ));
        if (!buffers->buf[type])
            exit(2);

        if (gpu->export_caps.buf & PL_HANDLE_DMA_BUF) {
            buffers->exported[type] = pl_buf_create(gpu, pl_buf_params(
                .size          = BUFFER_SIZE,
                .memory_type   = type == RAM ? PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE,
                .export_handle = PL_HANDLE_DMA_BUF,
            ));
        }
    }

    return buffers;
}

static void free_buffers(struct buffers *buffers)
{
    for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) {
        pl_buf_destroy(buffers->gpu, &buffers->buf[type]);
        pl_buf_destroy(buffers->gpu, &buffers->exported[type]);
        pl_buf_destroy(buffers->gpu, &buffers->imported[type]);
    }
    free(buffers);
}

static void link_buffers(pl_gpu gpu, struct buffers *buffers,
                         const struct buffers *import)
{
    if (!(gpu->import_caps.buf & PL_HANDLE_DMA_BUF))
        return;

    for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) {
        if (!import->exported[type])
            continue;
        buffers->imported[type] = pl_buf_create(gpu, pl_buf_params(
            .size          = BUFFER_SIZE,
            .memory_type   = type == RAM ? PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE,
            .import_handle = PL_HANDLE_DMA_BUF,
            .shared_mem    = import->exported[type]->shared_mem,
        ));
    }
}

struct ctx {
    pl_gpu srcgpu, dstgpu;
    pl_tex src, dst;

    // for copy-based methods
    enum mem_owner  owner;
    enum mem_type   type;
    bool noimport;
    bool async;
};

static void await_buf(pl_gpu gpu, pl_buf buf)
{
    while (pl_buf_poll(gpu, buf, UINT64_MAX))
        ; // do nothing
}

static void async_upload(void *priv)
{
    struct buffers *buffers = priv;
    pl_tex_upload(buffers->gpu, &buffers->async);
}

static inline void copy_ptr(struct ctx ctx)
{
    const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu;
    const pl_tex src = ctx.src, dst = ctx.dst;
    struct buffers *srcbuffers = src->params.user_data;
    struct buffers *dstbuffers = dst->params.user_data;
    pl_buf buf = NULL;
    uint8_t *data = NULL;

    if (ctx.owner == CPU) {
        static uint8_t static_buffer[BUFFER_SIZE];
        data = page_align(static_buffer);
    } else {
        struct buffers *b = ctx.owner == SRC ? srcbuffers : dstbuffers;
        buf = b->buf[ctx.type];
        data = page_align(buf->data);
        await_buf(b->gpu, buf);
    }

    struct pl_tex_transfer_params src_params = {
        .tex       = src,
        .row_pitch = ROW_PITCH,
        .no_import = ctx.noimport,
    };

    if (ctx.owner == SRC) {
        src_params.buf = buf;
        src_params.buf_offset = data - buf->data;
    } else {
        src_params.ptr = data;
    }

    struct pl_tex_transfer_params dst_params = {
        .tex       = dst,
        .row_pitch = ROW_PITCH,
        .no_import = ctx.noimport,
    };

    if (ctx.owner == DST) {
        dst_params.buf = buf;
        dst_params.buf_offset = data - buf->data;
    } else {
        dst_params.ptr = data;
    }

    if (ctx.async) {
        src_params.callback = async_upload;
        src_params.priv = dstbuffers;
        dstbuffers->async = dst_params;
        pl_tex_download(srcgpu, &src_params);
    } else {
        pl_tex_download(srcgpu, &src_params);
        pl_tex_upload(dstgpu, &dst_params);
    }
}

static inline void copy_interop(struct ctx ctx)
{
    const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu;
    const pl_tex src = ctx.src, dst = ctx.dst;
    struct buffers *srcbuffers = src->params.user_data;
    struct buffers *dstbuffers = dst->params.user_data;

    struct pl_tex_transfer_params src_params = {
        .tex       = src,
        .row_pitch = ROW_PITCH,
    };

    struct pl_tex_transfer_params dst_params = {
        .tex       = dst,
        .row_pitch = ROW_PITCH,
    };

    if (ctx.owner == SRC) {
        src_params.buf = srcbuffers->exported[ctx.type];
        dst_params.buf = dstbuffers->imported[ctx.type];
    } else {
        src_params.buf = srcbuffers->imported[ctx.type];
        dst_params.buf = dstbuffers->exported[ctx.type];
    }

    await_buf(srcgpu, src_params.buf);
    if (ctx.async) {
        src_params.callback = async_upload;
        src_params.priv = dstbuffers;
        dstbuffers->async = dst_params;
        pl_tex_download(srcgpu, &src_params);
    } else {
        pl_tex_download(srcgpu, &src_params);
        await_buf(srcgpu, src_params.buf); // manual cross-GPU synchronization
        pl_tex_upload(dstgpu, &dst_params);
    }
}

typedef void method(struct ctx ctx);

static double bench(struct ctx ctx, pl_tex srcs[], pl_tex dsts[], method fun)
{
    const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu;
    pl_clock_t start_warmup = 0, start_test = 0;
    uint64_t frames = 0, frames_warmup = 0;

    start_warmup = pl_clock_now();
    do {
        const int idx = frames % NUM_TEX;
        ctx.src = srcs[idx];
        ctx.dst = dsts[idx];

        // Generate some quasi-unique data in the source
        float x = M_E * (frames / 100.0);
        pl_tex_clear(srcgpu, ctx.src, (float[4]) {
            sinf(x + 0.0) / 2.0 + 0.5,
            sinf(x + 2.0) / 2.0 + 0.5,
            sinf(x + 4.0) / 2.0 + 0.5,
            1.0,
        });

        if (fun)
            fun(ctx);

        pl_gpu_flush(srcgpu); // to rotate queues
        pl_gpu_flush(dstgpu);
        frames++;

        if (frames % POLL_FREQ == 0) {
            pl_clock_t now = pl_clock_now();
            if (start_test) {
                if (pl_clock_diff(now, start_test) > TEST_MS * 1e-3)
                    break;
            } else if (pl_clock_diff(now, start_warmup) > WARMUP_MS * 1e-3) {
                start_test = now;
                frames_warmup = frames;
            }
        }
    } while (true);

    pl_gpu_finish(srcgpu);
    pl_gpu_finish(dstgpu);

    return pl_clock_diff(pl_clock_now(), start_test) / (frames - frames_warmup);
}

static void run_tests(pl_gpu srcgpu, pl_gpu dstgpu)
{
    const enum pl_fmt_caps caps = PL_FMT_CAP_HOST_READABLE;
    pl_fmt srcfmt = pl_find_fmt(srcgpu, PL_FMT_UNORM, COMPS, DEPTH, DEPTH, caps);
    pl_fmt dstfmt = pl_find_fmt(dstgpu, PL_FMT_UNORM, COMPS, DEPTH, DEPTH, caps);
    if (!srcfmt || !dstfmt)
        exit(2);

    pl_tex src[NUM_TEX], dst[NUM_TEX];
    for (int i = 0; i < NUM_TEX; i++) {
        struct buffers *srcbuffers = alloc_buffers(srcgpu);
        struct buffers *dstbuffers = alloc_buffers(dstgpu);
        if (!memcmp(srcgpu->uuid, dstgpu->uuid, sizeof(srcgpu->uuid))) {
            link_buffers(srcgpu, srcbuffers, dstbuffers);
            link_buffers(dstgpu, dstbuffers, srcbuffers);
        }

        src[i] = pl_tex_create(srcgpu, pl_tex_params(
            .w             = WIDTH,
            .h             = HEIGHT,
            .format        = srcfmt,
            .host_readable = true,
            .blit_dst      = true,
            .user_data     = srcbuffers,
        ));

        dst[i] = pl_tex_create(dstgpu, pl_tex_params(
            .w             = WIDTH,
            .h             = HEIGHT,
            .format        = dstfmt,
            .host_writable = true,
            .blit_dst      = true,
            .user_data     = dstbuffers,
        ));

        if (!src[i] || !dst[i])
            exit(2);
    }

    struct ctx ctx = {
        .srcgpu = srcgpu,
        .dstgpu = dstgpu,
    };

    static const char *owners[] = {
        [CPU] = "cpu",
        [SRC] = "src",
        [DST] = "dst",
    };

    static const char *types[] = {
        [RAM] = "ram",
        [GPU] = "gpu",
    };

    double baseline = bench(ctx, src, dst, NULL);

    // Test all possible generic copy methods
    for (enum mem_owner owner = 0; owner < NUM_MEM_OWNERS; owner++) {
        for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) {
            for (int async = 0; async <= 1; async++) {
                for (int noimport = 0; noimport <= 1; noimport++) {
                    // Blacklist undesirable configurations:
                    if (owner == CPU && type != RAM)
                        continue; // impossible
                    if (owner == CPU && async)
                        continue; // no synchronization on static buffer
                    if (owner == SRC && type == GPU)
                        continue; // GPU readback is orders of magnitude too slow
                    if (owner == DST && !noimport)
                        continue; // exhausts source address space

                    struct ctx cfg = ctx;
                    cfg.noimport = noimport;
                    cfg.owner    = owner;
                    cfg.type     = type;
                    cfg.async    = async;

                    printf("  %s %s %s %s : ",
                           owners[owner], types[type],
                           noimport ? "memcpy" : "      ",
                           async    ? "async" : "     ");

                    double dur = bench(cfg, src, dst, copy_ptr) - baseline;
                    printf("avg %.0f μs\t%.3f fps\n",
                           1e6 * dur, 1.0 / dur);
                }
            }
        }
    }

    // Test DMABUF interop when supported
    for (enum mem_owner owner = 0; owner < NUM_MEM_OWNERS; owner++) {
        for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) {
            for (int async = 0; async <= 1; async++) {
                struct buffers *buffers;
                switch (owner) {
                case SRC:
                    buffers = dst[0]->params.user_data;
                    if (!buffers->imported[type])
                        continue;
                    break;
                case DST:
                    buffers = src[0]->params.user_data;
                    if (!buffers->imported[type])
                        continue;
                    break;
                default: continue;
                }

                struct ctx cfg = ctx;
                cfg.owner = owner;
                cfg.type = type;

                printf("  %s %s %s %s : ",
                       owners[owner], types[type], "dmabuf",
                       async ? "async" : "     ");

                double dur = bench(cfg, src, dst, copy_interop) - baseline;
                        printf("avg %.0f μs\t%.3f fps\n",
                               1e6 * dur, 1.0 / dur);
            }
        }
    }

    for (int i = 0; i < NUM_TEX; i++) {
        free_buffers(src[i]->params.user_data);
        free_buffers(dst[i]->params.user_data);
        pl_tex_destroy(srcgpu, &src[i]);
        pl_tex_destroy(dstgpu, &dst[i]);
    }
}

int main(int argc, const char *argv[])
{
    if (argc < 3) {
        fprintf(stderr, "Usage: %s 'Device 1' 'Device 2'\n\n", argv[0]);
        fprintf(stderr, "(Use `vulkaninfo` for a list of devices)\n");
        exit(1);
    }

    pl_log log = pl_log_create(PL_API_VER, pl_log_params(
        .log_cb    = pl_log_color,
        .log_level = PL_LOG_WARN,
    ));

    pl_vk_inst inst = pl_vk_inst_create(log, pl_vk_inst_params(
        .debug = false,
    ));

    pl_vulkan dev1 = pl_vulkan_create(log, pl_vulkan_params(
        .device_name    = argv[1],
        .queue_count    = NUM_QUEUES,
        .async_transfer = ASYNC_TX,
        .async_compute  = ASYNC_COMP,
    ));

    pl_vulkan dev2 = pl_vulkan_create(log, pl_vulkan_params(
        .device_name    = argv[2],
        .queue_count    = NUM_QUEUES,
        .async_transfer = ASYNC_TX,
        .async_compute  = ASYNC_COMP,
    ));

    if (!dev1 || !dev2) {
        fprintf(stderr, "Failed creating Vulkan device!\n");
        exit(1);
    }

    if (ROW_PITCH % dev1->gpu->limits.align_tex_xfer_pitch) {
        fprintf(stderr, "Warning: Row pitch %d is not a multiple of optimal "
                "transfer pitch (%zu) for GPU '%s'\n", ROW_PITCH,
                dev1->gpu->limits.align_tex_xfer_pitch, argv[1]);
    }

    if (ROW_PITCH % dev2->gpu->limits.align_tex_xfer_pitch) {
        fprintf(stderr, "Warning: Row pitch %d is not a multiple of optimal "
                "transfer pitch (%zu) for GPU '%s'\n", ROW_PITCH,
                dev2->gpu->limits.align_tex_xfer_pitch, argv[2]);
    }

    printf("%s -> %s:\n", argv[1], argv[2]);
    run_tests(dev1->gpu, dev2->gpu);
    if (strcmp(argv[1], argv[2])) {
        printf("%s -> %s:\n", argv[2], argv[1]);
        run_tests(dev2->gpu, dev1->gpu);
    }

    pl_vulkan_destroy(&dev1);
    pl_vulkan_destroy(&dev2);
    pl_vk_inst_destroy(&inst);
    pl_log_destroy(&log);
}