From ff6e3c025658a5fa1affd094f220b623e7e1b24b Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Mon, 15 Apr 2024 22:38:23 +0200
Subject: Adding upstream version 6.338.2.

Signed-off-by: Daniel Baumann
---
 demos/multigpu-bench.c | 484 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 484 insertions(+)
 create mode 100644 demos/multigpu-bench.c

diff --git a/demos/multigpu-bench.c b/demos/multigpu-bench.c
new file mode 100644
index 0000000..75f1135
--- /dev/null
+++ b/demos/multigpu-bench.c
@@ -0,0 +1,484 @@
+/* GPU->GPU transfer benchmarks. Requires some manual setup.
+ *
+ * License: CC0 / Public Domain
+ */
+
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <libplacebo/log.h>
+#include <libplacebo/vulkan.h>
+
+#include "pl_clock.h"
+
+#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
+
+enum {
+    // Image configuration
+    NUM_TEX = 16,
+    WIDTH   = 1920,
+    HEIGHT  = 1080,
+    DEPTH   = 16,
+    COMPS   = 1,
+
+    // Queue configuration
+    NUM_QUEUES = NUM_TEX,
+    ASYNC_TX   = 1,
+    ASYNC_COMP = 1,
+
+    // Buffer configuration
+    PTR_ALIGN   = 4096,
+    PIXEL_PITCH = DEPTH / 8,
+    ROW_PITCH   = ALIGN2(WIDTH * PIXEL_PITCH, 256),
+    IMAGE_SIZE  = ROW_PITCH * HEIGHT,
+    BUFFER_SIZE = IMAGE_SIZE + PTR_ALIGN - 1,
+
+    // Test configuration
+    TEST_MS   = 1500,
+    WARMUP_MS = 500,
+    POLL_FREQ = 10,
+};
+
+static uint8_t* page_align(uint8_t *data)
+{
+    return (uint8_t *) ALIGN2((uintptr_t) data, PTR_ALIGN);
+}
+
+enum mem_owner {
+    CPU,
+    SRC,
+    DST,
+    NUM_MEM_OWNERS,
+};
+
+enum mem_type {
+    RAM,
+    GPU,
+    NUM_MEM_TYPES,
+};
+
+// This is attached to every `pl_tex.params.user_data`
+struct buffers {
+    pl_gpu gpu;
+    pl_buf buf[NUM_MEM_TYPES];
+    pl_buf exported[NUM_MEM_TYPES];
+    pl_buf imported[NUM_MEM_TYPES];
+    struct pl_tex_transfer_params async;
+};
+
+static struct buffers *alloc_buffers(pl_gpu gpu)
+{
+    struct buffers *buffers = malloc(sizeof(*buffers));
+    *buffers = (struct buffers) { .gpu = gpu };
+
+    for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) {
+        buffers->buf[type] = pl_buf_create(gpu, pl_buf_params(
+            .size = BUFFER_SIZE,
+            .memory_type = type == RAM ? PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE,
+            .host_mapped = true,
+        ));
+        if (!buffers->buf[type])
+            exit(2);
+
+        if (gpu->export_caps.buf & PL_HANDLE_DMA_BUF) {
+            buffers->exported[type] = pl_buf_create(gpu, pl_buf_params(
+                .size = BUFFER_SIZE,
+                .memory_type = type == RAM ? PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE,
+                .export_handle = PL_HANDLE_DMA_BUF,
+            ));
+        }
+    }
+
+    return buffers;
+}
+
+static void free_buffers(struct buffers *buffers)
+{
+    for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) {
+        pl_buf_destroy(buffers->gpu, &buffers->buf[type]);
+        pl_buf_destroy(buffers->gpu, &buffers->exported[type]);
+        pl_buf_destroy(buffers->gpu, &buffers->imported[type]);
+    }
+    free(buffers);
+}
+
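+/* Mirror the DMA-BUFs exported by `import` as imported buffers on `gpu`, so
+ * that both devices alias the same underlying memory. Import is only
+ * attempted when the importing GPU advertises PL_HANDLE_DMA_BUF support;
+ * the exporting buffer's `shared_mem` handle is what links the two. */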
+static void link_buffers(pl_gpu gpu, struct buffers *buffers,
+                         const struct buffers *import)
+{
+    if (!(gpu->import_caps.buf & PL_HANDLE_DMA_BUF))
+        return;
+
+    for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) {
+        if (!import->exported[type])
+            continue;
+        buffers->imported[type] = pl_buf_create(gpu, pl_buf_params(
+            .size = BUFFER_SIZE,
+            .memory_type = type == RAM ? PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE,
+            .import_handle = PL_HANDLE_DMA_BUF,
+            .shared_mem = import->exported[type]->shared_mem,
+        ));
+    }
+}
+
+struct ctx {
+    pl_gpu srcgpu, dstgpu;
+    pl_tex src, dst;
+
+    // for copy-based methods
+    enum mem_owner owner;
+    enum mem_type type;
+    bool noimport;
+    bool async;
+};
+
+static void await_buf(pl_gpu gpu, pl_buf buf)
+{
+    while (pl_buf_poll(gpu, buf, UINT64_MAX))
+        ; // do nothing
+}
+
+// Completion callback: once the source download has finished, kick off the
+// pending upload on the destination GPU
+static void async_upload(void *priv)
+{
+    struct buffers *buffers = priv;
+    pl_tex_upload(buffers->gpu, &buffers->async);
+}
+
+static inline void copy_ptr(struct ctx ctx)
+{
+    const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu;
+    const pl_tex src = ctx.src, dst = ctx.dst;
+    struct buffers *srcbuffers = src->params.user_data;
+    struct buffers *dstbuffers = dst->params.user_data;
+    pl_buf buf = NULL;
+    uint8_t *data = NULL;
+
+    if (ctx.owner == CPU) {
+        static uint8_t static_buffer[BUFFER_SIZE];
+        data = page_align(static_buffer);
+    } else {
+        struct buffers *b = ctx.owner == SRC ? srcbuffers : dstbuffers;
+        buf = b->buf[ctx.type];
+        data = page_align(buf->data);
+        await_buf(b->gpu, buf);
+    }
+
+    struct pl_tex_transfer_params src_params = {
+        .tex = src,
+        .row_pitch = ROW_PITCH,
+        .no_import = ctx.noimport,
+    };
+
+    if (ctx.owner == SRC) {
+        src_params.buf = buf;
+        src_params.buf_offset = data - buf->data;
+    } else {
+        src_params.ptr = data;
+    }
+
+    struct pl_tex_transfer_params dst_params = {
+        .tex = dst,
+        .row_pitch = ROW_PITCH,
+        .no_import = ctx.noimport,
+    };
+
+    if (ctx.owner == DST) {
+        dst_params.buf = buf;
+        dst_params.buf_offset = data - buf->data;
+    } else {
+        dst_params.ptr = data;
+    }
+
+    if (ctx.async) {
+        src_params.callback = async_upload;
+        src_params.priv = dstbuffers;
+        dstbuffers->async = dst_params;
+        pl_tex_download(srcgpu, &src_params);
+    } else {
+        pl_tex_download(srcgpu, &src_params);
+        pl_tex_upload(dstgpu, &dst_params);
+    }
+}
+
+static inline void copy_interop(struct ctx ctx)
+{
+    const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu;
+    const pl_tex src = ctx.src, dst = ctx.dst;
+    struct buffers *srcbuffers = src->params.user_data;
+    struct buffers *dstbuffers = dst->params.user_data;
+
+    struct pl_tex_transfer_params src_params = {
+        .tex = src,
+        .row_pitch = ROW_PITCH,
+    };
+
+    struct pl_tex_transfer_params dst_params = {
+        .tex = dst,
+        .row_pitch = ROW_PITCH,
+    };
+
+    if (ctx.owner == SRC) {
+        src_params.buf = srcbuffers->exported[ctx.type];
+        dst_params.buf = dstbuffers->imported[ctx.type];
+    } else {
+        src_params.buf = srcbuffers->imported[ctx.type];
+        dst_params.buf = dstbuffers->exported[ctx.type];
+    }
+
+    await_buf(srcgpu, src_params.buf);
+    if (ctx.async) {
+        src_params.callback = async_upload;
+        src_params.priv = dstbuffers;
+        dstbuffers->async = dst_params;
+        pl_tex_download(srcgpu, &src_params);
+    } else {
+        pl_tex_download(srcgpu, &src_params);
+        await_buf(srcgpu, src_params.buf); // manual cross-GPU synchronization
+        pl_tex_upload(dstgpu, &dst_params);
+    }
+}
+
+typedef void method(struct ctx ctx);
+
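+/* Benchmark loop: cycles through NUM_TEX (src, dst) texture pairs, clears
+ * each source with a changing color so no transfer can be elided, and runs
+ * `fun` on every frame. Frames during the first WARMUP_MS are discarded;
+ * after that, frames are timed for TEST_MS and the average per-frame
+ * duration is returned. The clock is only sampled every POLL_FREQ frames to
+ * keep polling overhead out of the hot loop. */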
+static double bench(struct ctx ctx, pl_tex srcs[], pl_tex dsts[], method fun)
+{
+    const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu;
+    pl_clock_t start_warmup = 0, start_test = 0;
+    uint64_t frames = 0, frames_warmup = 0;
+
+    start_warmup = pl_clock_now();
+    do {
+        const int idx = frames % NUM_TEX;
+        ctx.src = srcs[idx];
+        ctx.dst = dsts[idx];
+
+        // Generate some quasi-unique data in the source
+        float x = M_E * (frames / 100.0);
+        pl_tex_clear(srcgpu, ctx.src, (float[4]) {
+            sinf(x + 0.0) / 2.0 + 0.5,
+            sinf(x + 2.0) / 2.0 + 0.5,
+            sinf(x + 4.0) / 2.0 + 0.5,
+            1.0,
+        });
+
+        if (fun)
+            fun(ctx);
+
+        pl_gpu_flush(srcgpu); // to rotate queues
+        pl_gpu_flush(dstgpu);
+        frames++;
+
+        if (frames % POLL_FREQ == 0) {
+            pl_clock_t now = pl_clock_now();
+            if (start_test) {
+                if (pl_clock_diff(now, start_test) > TEST_MS * 1e-3)
+                    break;
+            } else if (pl_clock_diff(now, start_warmup) > WARMUP_MS * 1e-3) {
+                start_test = now;
+                frames_warmup = frames;
+            }
+        }
+    } while (true);
+
+    pl_gpu_finish(srcgpu);
+    pl_gpu_finish(dstgpu);
+
+    return pl_clock_diff(pl_clock_now(), start_test) / (frames - frames_warmup);
+}
+
+static void run_tests(pl_gpu srcgpu, pl_gpu dstgpu)
+{
+    const enum pl_fmt_caps caps = PL_FMT_CAP_HOST_READABLE;
+    pl_fmt srcfmt = pl_find_fmt(srcgpu, PL_FMT_UNORM, COMPS, DEPTH, DEPTH, caps);
+    pl_fmt dstfmt = pl_find_fmt(dstgpu, PL_FMT_UNORM, COMPS, DEPTH, DEPTH, caps);
+    if (!srcfmt || !dstfmt)
+        exit(2);
+
+    pl_tex src[NUM_TEX], dst[NUM_TEX];
+    for (int i = 0; i < NUM_TEX; i++) {
+        struct buffers *srcbuffers = alloc_buffers(srcgpu);
+        struct buffers *dstbuffers = alloc_buffers(dstgpu);
+        if (!memcmp(srcgpu->uuid, dstgpu->uuid, sizeof(srcgpu->uuid))) {
+            link_buffers(srcgpu, srcbuffers, dstbuffers);
+            link_buffers(dstgpu, dstbuffers, srcbuffers);
+        }
+
+        src[i] = pl_tex_create(srcgpu, pl_tex_params(
+            .w = WIDTH,
+            .h = HEIGHT,
+            .format = srcfmt,
+            .host_readable = true,
+            .blit_dst = true,
+            .user_data = srcbuffers,
+        ));
+
+        dst[i] = pl_tex_create(dstgpu, pl_tex_params(
+            .w = WIDTH,
+            .h = HEIGHT,
+            .format = dstfmt,
+            .host_writable = true,
+            .blit_dst = true,
+            .user_data = dstbuffers,
+        ));
+
+        if (!src[i] || !dst[i])
+            exit(2);
+    }
+
+    struct ctx ctx = {
+        .srcgpu = srcgpu,
+        .dstgpu = dstgpu,
+    };
+
+    static const char *owners[] = {
+        [CPU] = "cpu",
+        [SRC] = "src",
+        [DST] = "dst",
+    };
+
+    static const char *types[] = {
+        [RAM] = "ram",
+        [GPU] = "gpu",
+    };
+
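+    // Baseline pass: clear + flush only, with no copy method. This fixed
+    // per-frame overhead is subtracted from every result below, so the
+    // reported figures isolate the cost of the transfer itself.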
"async" : " "); + + double dur = bench(cfg, src, dst, copy_ptr) - baseline; + printf("avg %.0f μs\t%.3f fps\n", + 1e6 * dur, 1.0 / dur); + } + } + } + } + + // Test DMABUF interop when supported + for (enum mem_owner owner = 0; owner < NUM_MEM_OWNERS; owner++) { + for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) { + for (int async = 0; async <= 1; async++) { + struct buffers *buffers; + switch (owner) { + case SRC: + buffers = dst[0]->params.user_data; + if (!buffers->imported[type]) + continue; + break; + case DST: + buffers = src[0]->params.user_data; + if (!buffers->imported[type]) + continue; + break; + default: continue; + } + + struct ctx cfg = ctx; + cfg.owner = owner; + cfg.type = type; + + printf(" %s %s %s %s : ", + owners[owner], types[type], "dmabuf", + async ? "async" : " "); + + double dur = bench(cfg, src, dst, copy_interop) - baseline; + printf("avg %.0f μs\t%.3f fps\n", + 1e6 * dur, 1.0 / dur); + } + } + } + + for (int i = 0; i < NUM_TEX; i++) { + free_buffers(src[i]->params.user_data); + free_buffers(dst[i]->params.user_data); + pl_tex_destroy(srcgpu, &src[i]); + pl_tex_destroy(dstgpu, &dst[i]); + } +} + +int main(int argc, const char *argv[]) +{ + if (argc < 3) { + fprintf(stderr, "Usage: %s 'Device 1' 'Device 2'\n\n", argv[0]); + fprintf(stderr, "(Use `vulkaninfo` for a list of devices)\n"); + exit(1); + } + + pl_log log = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_color, + .log_level = PL_LOG_WARN, + )); + + pl_vk_inst inst = pl_vk_inst_create(log, pl_vk_inst_params( + .debug = false, + )); + + pl_vulkan dev1 = pl_vulkan_create(log, pl_vulkan_params( + .device_name = argv[1], + .queue_count = NUM_QUEUES, + .async_transfer = ASYNC_TX, + .async_compute = ASYNC_COMP, + )); + + pl_vulkan dev2 = pl_vulkan_create(log, pl_vulkan_params( + .device_name = argv[2], + .queue_count = NUM_QUEUES, + .async_transfer = ASYNC_TX, + .async_compute = ASYNC_COMP, + )); + + if (!dev1 || !dev2) { + fprintf(stderr, "Failed creating Vulkan device!\n"); + exit(1); + } + + if (ROW_PITCH % dev1->gpu->limits.align_tex_xfer_pitch) { + fprintf(stderr, "Warning: Row pitch %d is not a multiple of optimal " + "transfer pitch (%zu) for GPU '%s'\n", ROW_PITCH, + dev1->gpu->limits.align_tex_xfer_pitch, argv[1]); + } + + if (ROW_PITCH % dev2->gpu->limits.align_tex_xfer_pitch) { + fprintf(stderr, "Warning: Row pitch %d is not a multiple of optimal " + "transfer pitch (%zu) for GPU '%s'\n", ROW_PITCH, + dev2->gpu->limits.align_tex_xfer_pitch, argv[2]); + } + + printf("%s -> %s:\n", argv[1], argv[2]); + run_tests(dev1->gpu, dev2->gpu); + if (strcmp(argv[1], argv[2])) { + printf("%s -> %s:\n", argv[2], argv[1]); + run_tests(dev2->gpu, dev1->gpu); + } + + pl_vulkan_destroy(&dev1); + pl_vulkan_destroy(&dev2); + pl_vk_inst_destroy(&inst); + pl_log_destroy(&log); +} -- cgit v1.2.3