diff options
Diffstat (limited to 'demos/video-filtering.c')
-rw-r--r-- | demos/video-filtering.c | 871 |
1 file changed, 871 insertions, 0 deletions
/* Presented are two hypothetical scenarios of how one might use libplacebo
 * as something like an FFmpeg or mpv video filter. We examine two example
 * APIs (loosely modeled after real video filtering APIs) and how each style
 * would like to use libplacebo.
 *
 * For sake of a simple example, let's assume this is a debanding filter.
 * For those of you too lazy to compile/run this file but still want to see
 * results, these are from my machine (RX 5700 XT + 1950X, as of 2020-05-25):
 *
 * RADV+ACO:
 *   api1: 10000 frames in 16.328440 s => 1.632844 ms/frame (612.43 fps)
 *     render: 0.113524 ms, upload: 0.127551 ms, download: 0.146097 ms
 *   api2: 10000 frames in 5.335634 s => 0.533563 ms/frame (1874.19 fps)
 *     render: 0.064378 ms, upload: 0.000000 ms, download: 0.189719 ms
 *
 * AMDVLK:
 *   api1: 10000 frames in 14.921859 s => 1.492186 ms/frame (670.16 fps)
 *     render: 0.110603 ms, upload: 0.114412 ms, download: 0.115375 ms
 *   api2: 10000 frames in 4.667386 s => 0.466739 ms/frame (2142.53 fps)
 *     render: 0.030781 ms, upload: 0.000000 ms, download: 0.075237 ms
 *
 * You can see that AMDVLK is still better at doing texture streaming than
 * RADV - this is because as of writing RADV still does not support
 * asynchronous texture queues / DMA engine transfers. If we disable the
 * `async_transfer` option with AMDVLK we get this:
 *
 *   api1: 10000 frames in 16.087723 s => 1.608772 ms/frame (621.59 fps)
 *     render: 0.111154 ms, upload: 0.122476 ms, download: 0.133162 ms
 *   api2: 10000 frames in 6.344959 s => 0.634496 ms/frame (1576.05 fps)
 *     render: 0.031307 ms, upload: 0.000000 ms, download: 0.083520 ms
 *
 * License: CC0 / Public Domain
 */

#include <assert.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#include "common.h"
#include "pl_clock.h"
#include "pl_thread.h"

#ifdef _WIN32
#include <windows.h>
#endif

#include <libplacebo/dispatch.h>
#include <libplacebo/shaders/sampling.h>
#include <libplacebo/utils/upload.h>
#include <libplacebo/vulkan.h>

///////////////////////
/// API definitions ///
///////////////////////

// Stuff that would be common to each API

// Create / destroy the filter's private state (returns NULL on failure)
void *init(void);
void uninit(void *priv);

struct format {
    // For simplicity let's make a few assumptions here, since configuring the
    // texture format is not the point of this example. (In practice you can
    // go nuts with the `utils/upload.h` helpers)
    //
    // - All formats contain unsigned integers only
    // - All components have the same size in bits
    // - All components are in the "canonical" order
    // - All formats have power of two sizes only (2 or 4 components, not 3)
    // - All plane strides are a multiple of the pixel size
    int num_comps;
    int bitdepth;
};

struct plane {
    int subx, suby; // subsampling shift
    struct format fmt;
    size_t stride;
    void *data;
};

#define MAX_PLANES 4

struct image {
    int width, height;
    int num_planes;
    struct plane planes[MAX_PLANES];

    // For API #2, the associated mapped buffer (if any)
    struct api2_buf *associated_buf;
};


// Example API design #1: synchronous, blocking, double-copy (bad!)
//
// In this API, `api1_filter` must immediately return with the new data.
// This prevents parallelism on the GPU and should be avoided if possible,
// but sometimes that's what you have to work with. So this is what it
// would look like.
//
// Also, let's assume this API design reconfigures the filter chain (using
// a blank `proxy` image) every time the image format or dimensions change,
// and doesn't expect us to fail due to format mismatches or resource
// exhaustion afterwards.

bool api1_reconfig(void *priv, const struct image *proxy);
bool api1_filter(void *priv, struct image *dst, struct image *src);


// Example API design #2: asynchronous, streaming, queued, zero-copy (good!)
//
// In this API, `api2_process` will be run by the calling code every so often
// (e.g. when new data is available or expected). This function has access
// to non-blocking functions `get_image` and `put_image` that interface
// with the video filtering engine's internal queueing system.
//
// This API is also designed to feed multiple frames ahead of time, i.e.
// it will feed us as many frames as it can while we're still returning
// `API2_WANT_MORE`. To drain the filter chain, it would continue running
// the process function until `API2_HAVE_MORE` is no longer present
// in the output.
//
// This API is also designed to do zero-copy where possible. When it wants
// to create a data buffer of a given size, it will call our function
// `api2_alloc` which will return a buffer that we can process directly.
// We can use this to do zero-copy uploading to the GPU, by creating
// host-visible persistently mapped buffers. In order to prevent the video
// filtering system from re-using our buffers while copies are happening, we
// use special functions `image_lock` and `image_unlock` to increase a
// refcount on the image's backing storage. (As is typical of such APIs)
//
// Finally, this API is designed to be fully dynamic: The image parameters
// could change at any time, and we must be equipped to handle that.
+ +enum api2_status { + // Negative values are used to signal error conditions + API2_ERR_FMT = -2, // incompatible / unsupported format + API2_ERR_UNKNOWN = -1, // some other error happened + API2_OK = 0, // no error, no status - everything's good + + // Positive values represent a mask of status conditions + API2_WANT_MORE = (1 << 0), // we want more frames, please feed some more! + API2_HAVE_MORE = (1 << 1), // we have more frames but they're not ready +}; + +enum api2_status api2_process(void *priv); + +// Functions for creating persistently mapped buffers +struct api2_buf { + void *data; + size_t size; + void *priv; +}; + +bool api2_alloc(void *priv, size_t size, struct api2_buf *out); +void api2_free(void *priv, const struct api2_buf *buf); + +// These functions are provided by the API. The exact details of how images +// are enqueued, dequeued and locked are not really important here, so just +// do something unrealistic but simple to demonstrate with. +struct image *get_image(void); +void put_image(struct image *img); +void image_lock(struct image *img); +void image_unlock(struct image *img); + + +///////////////////////////////// +/// libplacebo implementation /// +///////////////////////////////// + + +// For API #2: +#define PARALLELISM 8 + +struct entry { + pl_buf buf; // to stream the download + pl_tex tex_in[MAX_PLANES]; + pl_tex tex_out[MAX_PLANES]; + struct image image; + + // For entries that are associated with a held image, so we can unlock them + // as soon as possible + struct image *held_image; + pl_buf held_buf; +}; + +// For both APIs: +struct priv { + pl_log log; + pl_vulkan vk; + pl_gpu gpu; + pl_dispatch dp; + pl_shader_obj dither_state; + + // Timer objects + pl_timer render_timer; + pl_timer upload_timer; + pl_timer download_timer; + uint64_t render_sum; + uint64_t upload_sum; + uint64_t download_sum; + int render_count; + int upload_count; + int download_count; + + // API #1: A simple pair of input and output textures + pl_tex 
tex_in[MAX_PLANES]; + pl_tex tex_out[MAX_PLANES]; + + // API #2: A ring buffer of textures/buffers for streaming + int idx_in; // points the next free entry + int idx_out; // points to the first entry still in progress + struct entry entries[PARALLELISM]; +}; + +void *init(void) { + struct priv *p = calloc(1, sizeof(struct priv)); + if (!p) + return NULL; + + p->log = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_simple, + .log_level = PL_LOG_WARN, + )); + + p->vk = pl_vulkan_create(p->log, pl_vulkan_params( + // Note: This is for API #2. In API #1 you could just pass params=NULL + // and it wouldn't really matter much. + .async_transfer = true, + .async_compute = true, + .queue_count = PARALLELISM, + )); + + if (!p->vk) { + fprintf(stderr, "Failed creating vulkan context\n"); + goto error; + } + + // Give this a shorter name for convenience + p->gpu = p->vk->gpu; + + p->dp = pl_dispatch_create(p->log, p->gpu); + if (!p->dp) { + fprintf(stderr, "Failed creating shader dispatch object\n"); + goto error; + } + + p->render_timer = pl_timer_create(p->gpu); + p->upload_timer = pl_timer_create(p->gpu); + p->download_timer = pl_timer_create(p->gpu); + + return p; + +error: + uninit(p); + return NULL; +} + +void uninit(void *priv) +{ + struct priv *p = priv; + + // API #1 + for (int i = 0; i < MAX_PLANES; i++) { + pl_tex_destroy(p->gpu, &p->tex_in[i]); + pl_tex_destroy(p->gpu, &p->tex_out[i]); + } + + // API #2 + for (int i = 0; i < PARALLELISM; i++) { + pl_buf_destroy(p->gpu, &p->entries[i].buf); + for (int j = 0; j < MAX_PLANES; j++) { + pl_tex_destroy(p->gpu, &p->entries[i].tex_in[j]); + pl_tex_destroy(p->gpu, &p->entries[i].tex_out[j]); + } + if (p->entries[i].held_image) + image_unlock(p->entries[i].held_image); + } + + pl_timer_destroy(p->gpu, &p->render_timer); + pl_timer_destroy(p->gpu, &p->upload_timer); + pl_timer_destroy(p->gpu, &p->download_timer); + + pl_shader_obj_destroy(&p->dither_state); + pl_dispatch_destroy(&p->dp); + 
pl_vulkan_destroy(&p->vk); + pl_log_destroy(&p->log); + + free(p); +} + +// Helper function to set up the `pl_plane_data` struct from the image params +static void setup_plane_data(const struct image *img, + struct pl_plane_data out[MAX_PLANES]) +{ + for (int i = 0; i < img->num_planes; i++) { + const struct plane *plane = &img->planes[i]; + + out[i] = (struct pl_plane_data) { + .type = PL_FMT_UNORM, + .width = img->width >> plane->subx, + .height = img->height >> plane->suby, + .pixel_stride = plane->fmt.num_comps * plane->fmt.bitdepth / 8, + .row_stride = plane->stride, + .pixels = plane->data, + }; + + // For API 2 (direct rendering) + if (img->associated_buf) { + pl_buf buf = img->associated_buf->priv; + out[i].pixels = NULL; + out[i].buf = buf; + out[i].buf_offset = (uintptr_t) plane->data - (uintptr_t) buf->data; + } + + for (int c = 0; c < plane->fmt.num_comps; c++) { + out[i].component_size[c] = plane->fmt.bitdepth; + out[i].component_pad[c] = 0; + out[i].component_map[c] = c; + } + } +} + +static bool do_plane(struct priv *p, pl_tex dst, pl_tex src) +{ + int new_depth = dst->params.format->component_depth[0]; + + // Do some debanding, and then also make sure to dither to the new depth + // so that our debanded gradients are actually preserved well + pl_shader sh = pl_dispatch_begin(p->dp); + pl_shader_deband(sh, pl_sample_src( .tex = src ), NULL); + pl_shader_dither(sh, new_depth, &p->dither_state, NULL); + return pl_dispatch_finish(p->dp, pl_dispatch_params( + .shader = &sh, + .target = dst, + .timer = p->render_timer, + )); +} + +static void check_timers(struct priv *p) +{ + uint64_t ret; + + while ((ret = pl_timer_query(p->gpu, p->render_timer))) { + p->render_sum += ret; + p->render_count++; + } + + while ((ret = pl_timer_query(p->gpu, p->upload_timer))) { + p->upload_sum += ret; + p->upload_count++; + } + + while ((ret = pl_timer_query(p->gpu, p->download_timer))) { + p->download_sum += ret; + p->download_count++; + } +} + +// API #1 implementation: 
//
// In this design, we will create all GPU resources inside `reconfig`, based on
// the texture format configured from the proxy image. This will avoid failing
// later on due to e.g. resource exhaustion or texture format mismatch, and
// thereby falls within the intended semantics of this style of API.

bool api1_reconfig(void *priv, const struct image *proxy)
{
    struct priv *p = priv;
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(proxy, data);

    for (int i = 0; i < proxy->num_planes; i++) {
        pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
        if (!fmt) {
            fprintf(stderr, "Failed configuring filter: no good texture format!\n");
            return false;
        }

        bool ok = true;
        // Input texture: we upload into it, the shader samples from it
        ok &= pl_tex_recreate(p->gpu, &p->tex_in[i], pl_tex_params(
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .sampleable = true,
            .host_writable = true,
        ));

        // Output texture: render target that we read back to the host
        ok &= pl_tex_recreate(p->gpu, &p->tex_out[i], pl_tex_params(
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .renderable = true,
            .host_readable = true,
        ));

        if (!ok) {
            fprintf(stderr, "Failed creating GPU textures!\n");
            return false;
        }
    }

    return true;
}

// Synchronous filter: upload -> process -> download, blocking the caller
// for the whole round trip (this is what makes API #1 slow)
bool api1_filter(void *priv, struct image *dst, struct image *src)
{
    struct priv *p = priv;
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(src, data);

    // Upload planes
    for (int i = 0; i < src->num_planes; i++) {
        bool ok = pl_tex_upload(p->gpu, pl_tex_transfer_params(
            .tex = p->tex_in[i],
            .row_pitch = data[i].row_stride,
            .ptr = src->planes[i].data,
            .timer = p->upload_timer,
        ));

        if (!ok) {
            fprintf(stderr, "Failed uploading data to the GPU!\n");
            return false;
        }
    }

    // Process planes
    for (int i = 0; i < src->num_planes; i++) {
        if (!do_plane(p, p->tex_out[i], p->tex_in[i])) {
            fprintf(stderr, "Failed processing planes!\n");
            return false;
        }
    }

    // Download planes
    for (int i = 0; i < src->num_planes; i++) {
        bool ok = pl_tex_download(p->gpu, pl_tex_transfer_params(
            .tex = p->tex_out[i],
            .row_pitch = dst->planes[i].stride,
            .ptr = dst->planes[i].data,
            .timer = p->download_timer,
        ));

        if (!ok) {
            fprintf(stderr, "Failed downloading data from the GPU!\n");
            return false;
        }
    }

    check_timers(p);
    return true;
}


// API #2 implementation:
//
// In this implementation we maintain a queue (implemented as ring buffer)
// of "work entries", which are isolated structs that hold independent GPU
// resources - so that the GPU has no cross-entry dependencies on any of the
// textures or other resources. (Side note: It still has a dependency on the
// dither state, but this is just a shared LUT anyway)

// Align up to the nearest multiple of a power of two
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))

// Submit one image's worth of work (upload, process, start async download)
// into the given ring-buffer entry. Does not block on GPU completion.
static enum api2_status submit_work(struct priv *p, struct entry *e,
                                    struct image *img)
{
    // If the image comes from a mapped buffer, we have to take a lock
    // while our upload is in progress
    if (img->associated_buf) {
        assert(!e->held_image);
        image_lock(img);
        e->held_image = img;
        e->held_buf = img->associated_buf->priv;
    }

    // Upload this image's data
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(img, data);

    for (int i = 0; i < img->num_planes; i++) {
        pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
        if (!fmt)
            return API2_ERR_FMT;

        // FIXME: can we plumb a `pl_timer` in here somehow?
        if (!pl_upload_plane(p->gpu, NULL, &e->tex_in[i], &data[i]))
            return API2_ERR_UNKNOWN;

        // Re-create the target FBO as well with this format if necessary
        bool ok = pl_tex_recreate(p->gpu, &e->tex_out[i], pl_tex_params(
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .renderable = true,
            .host_readable = true,
        ));
        if (!ok)
            return API2_ERR_UNKNOWN;
    }

    // Dispatch the work for this image
    for (int i = 0; i < img->num_planes; i++) {
        if (!do_plane(p, e->tex_out[i], e->tex_in[i]))
            return API2_ERR_UNKNOWN;
    }

    // Set up the resulting `struct image` that will hold our target
    // data. We just copy the format etc. from the source image
    memcpy(&e->image, img, sizeof(struct image));

    size_t offset[MAX_PLANES], stride[MAX_PLANES], total_size = 0;
    for (int i = 0; i < img->num_planes; i++) {
        // For performance, we want to make sure we align the stride
        // to a multiple of the GPU's preferred texture transfer stride
        // (This is entirely optional)
        stride[i] = ALIGN2(img->planes[i].stride,
                           p->gpu->limits.align_tex_xfer_pitch);
        int height = img->height >> img->planes[i].suby;

        // Round up the offset to the nearest multiple of the optimal
        // transfer alignment. (This is also entirely optional)
        offset[i] = ALIGN2(total_size, p->gpu->limits.align_tex_xfer_offset);
        total_size = offset[i] + stride[i] * height;
    }

    // Dispatch the asynchronous download into a mapped buffer
    bool ok = pl_buf_recreate(p->gpu, &e->buf, pl_buf_params(
        .size = total_size,
        .host_mapped = true,
    ));
    if (!ok)
        return API2_ERR_UNKNOWN;

    for (int i = 0; i < img->num_planes; i++) {
        ok = pl_tex_download(p->gpu, pl_tex_transfer_params(
            .tex = e->tex_out[i],
            .row_pitch = stride[i],
            .buf = e->buf,
            .buf_offset = offset[i],
            .timer = p->download_timer,
        ));
        if (!ok)
            return API2_ERR_UNKNOWN;

        // Update the output fields
        e->image.planes[i].data = e->buf->data + offset[i];
        e->image.planes[i].stride = stride[i];
    }

    // Make sure this work starts processing in the background, and especially
    // so we can move on to the next queue on the GPU
    pl_gpu_flush(p->gpu);
    return API2_OK;
}

// Non-blocking event-loop body: release held input buffers, dequeue all
// finished downloads (in FIFO order), then refill the ring with new work
enum api2_status api2_process(void *priv)
{
    struct priv *p = priv;
    enum api2_status ret = 0;

    // Opportunistically release any held images. We do this across the ring
    // buffer, rather than doing this as part of the following loop, because
    // we want to release images ahead-of-time (no FIFO constraints)
    for (int i = 0; i < PARALLELISM; i++) {
        struct entry *e = &p->entries[i];
        if (e->held_image && !pl_buf_poll(p->gpu, e->held_buf, 0)) {
            // upload buffer is no longer in use, release it
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }
    }

    // Poll the status of existing entries and dequeue the ones that are done
    while (p->idx_out != p->idx_in) {
        struct entry *e = &p->entries[p->idx_out];
        // pl_buf_poll returning true means the buffer is still in use;
        // entries complete in FIFO order, so stop at the first busy one
        if (pl_buf_poll(p->gpu, e->buf, 0))
            break;

        if (e->held_image) {
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }

        // download buffer is no longer busy, dequeue the frame
        put_image(&e->image);
        p->idx_out = (p->idx_out + 1) % PARALLELISM;
    }

    // Fill up the queue with more work (leaving one slot free so that
    // idx_in == idx_out unambiguously means "empty")
    int last_free_idx = (p->idx_out ? p->idx_out : PARALLELISM) - 1;
    while (p->idx_in != last_free_idx) {
        struct image *img = get_image();
        if (!img) {
            ret |= API2_WANT_MORE;
            break;
        }

        enum api2_status err = submit_work(p, &p->entries[p->idx_in], img);
        if (err < 0)
            return err;

        p->idx_in = (p->idx_in + 1) % PARALLELISM;
    }

    if (p->idx_out != p->idx_in)
        ret |= API2_HAVE_MORE;

    return ret;
}

// Allocate a host-mapped GPU buffer so the caller can decode straight into
// GPU-visible memory (the zero-copy path). Fails if unsupported/too large.
bool api2_alloc(void *priv, size_t size, struct api2_buf *out)
{
    struct priv *p = priv;
    if (!p->gpu->limits.buf_transfer || size > p->gpu->limits.max_mapped_size)
        return false;

    pl_buf buf = pl_buf_create(p->gpu, pl_buf_params(
        .size = size,
        .host_mapped = true,
    ));

    if (!buf)
        return false;

    *out = (struct api2_buf) {
        .data = buf->data,
        .size = size,
        .priv = (void *) buf,
    };
    return true;
}

void api2_free(void *priv, const struct api2_buf *buf)
{
    struct priv *p = priv;
    pl_buf plbuf = buf->priv;
    pl_buf_destroy(p->gpu, &plbuf);
}


////////////////////////////////////
Concept / Benchmark /// +//////////////////////////////////// + +#define FRAMES 10000 + +// Let's say we're processing a 1920x1080 4:2:0 8-bit NV12 video, arbitrarily +// with a stride aligned to 256 bytes. (For no particular reason) +#define TEXELSZ sizeof(uint8_t) +#define WIDTH 1920 +#define HEIGHT 1080 +#define STRIDE (ALIGN2(WIDTH, 256) * TEXELSZ) +// Subsampled planes +#define SWIDTH (WIDTH >> 1) +#define SHEIGHT (HEIGHT >> 1) +#define SSTRIDE (ALIGN2(SWIDTH, 256) * TEXELSZ) +// Plane offsets / sizes +#define SIZE0 (HEIGHT * STRIDE) +#define SIZE1 (2 * SHEIGHT * SSTRIDE) +#define OFFSET0 0 +#define OFFSET1 SIZE0 +#define BUFSIZE (OFFSET1 + SIZE1) + +// Skeleton of an example image +static const struct image example_image = { + .width = WIDTH, + .height = HEIGHT, + .num_planes = 2, + .planes = { + { + .subx = 0, + .suby = 0, + .stride = STRIDE, + .fmt = { + .num_comps = 1, + .bitdepth = 8 * TEXELSZ, + }, + }, { + .subx = 1, + .suby = 1, + .stride = SSTRIDE * 2, + .fmt = { + .num_comps = 2, + .bitdepth = 8 * TEXELSZ, + }, + }, + }, +}; + +// API #1: Nice and simple (but slow) +static void api1_example(void) +{ + struct priv *vf = init(); + if (!vf) + return; + + if (!api1_reconfig(vf, &example_image)) { + fprintf(stderr, "api1: Failed configuring video filter!\n"); + return; + } + + // Allocate two buffers to hold the example data, and fill the source + // buffer arbitrarily with a "simple" pattern. 
(Decoding the data into + // the buffer is not meant to be part of this benchmark) + uint8_t *srcbuf = malloc(BUFSIZE), + *dstbuf = malloc(BUFSIZE); + if (!srcbuf || !dstbuf) + goto done; + + for (size_t i = 0; i < BUFSIZE; i++) + srcbuf[i] = i; + + struct image src = example_image, dst = example_image; + src.planes[0].data = srcbuf + OFFSET0; + src.planes[1].data = srcbuf + OFFSET1; + dst.planes[0].data = dstbuf + OFFSET0; + dst.planes[1].data = dstbuf + OFFSET1; + + const pl_clock_t start = pl_clock_now(); + + // Process this dummy frame a bunch of times + unsigned frames = 0; + for (frames = 0; frames < FRAMES; frames++) { + if (!api1_filter(vf, &dst, &src)) { + fprintf(stderr, "api1: Failed filtering frame... aborting\n"); + break; + } + } + + const pl_clock_t stop = pl_clock_now(); + const float secs = pl_clock_diff(stop, start); + + printf("api1: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n", + frames, secs, 1000 * secs / frames, frames / secs); + + if (vf->render_count) { + printf(" render: %f ms, upload: %f ms, download: %f ms\n", + 1e-6 * vf->render_sum / vf->render_count, + vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0, + vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0); + } + +done: + free(srcbuf); + free(dstbuf); + uninit(vf); +} + + +// API #2: Pretend we have some fancy pool of images. 
+#define POOLSIZE (PARALLELISM + 1) + +static struct api2_buf buffers[POOLSIZE] = {0}; +static struct image images[POOLSIZE] = {0}; +static int refcount[POOLSIZE] = {0}; +static unsigned api2_frames_in = 0; +static unsigned api2_frames_out = 0; + +static void api2_example(void) +{ + struct priv *vf = init(); + if (!vf) + return; + + // Set up a bunch of dummy images + for (int i = 0; i < POOLSIZE; i++) { + uint8_t *data; + images[i] = example_image; + if (api2_alloc(vf, BUFSIZE, &buffers[i])) { + data = buffers[i].data; + images[i].associated_buf = &buffers[i]; + } else { + // Fall back in case mapped buffers are unsupported + fprintf(stderr, "warning: falling back to malloc, may be slow\n"); + data = malloc(BUFSIZE); + } + // Fill with some "data" (like in API #1) + for (size_t n = 0; n < BUFSIZE; n++) + data[i] = n; + images[i].planes[0].data = data + OFFSET0; + images[i].planes[1].data = data + OFFSET1; + } + + const pl_clock_t start = pl_clock_now(); + + // Just keep driving the event loop regardless of the return status + // until we reach the critical number of frames. (Good enough for this PoC) + while (api2_frames_out < FRAMES) { + enum api2_status ret = api2_process(vf); + if (ret < 0) { + fprintf(stderr, "api2: Failed processing... aborting\n"); + break; + } + + // Sleep a short time (100us) to prevent busy waiting the CPU + pl_thread_sleep(1e-4); + check_timers(vf); + } + + const pl_clock_t stop = pl_clock_now(); + const float secs = pl_clock_diff(stop, start); + printf("api2: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n", + api2_frames_out, secs, 1000 * secs / api2_frames_out, + api2_frames_out / secs); + + if (vf->render_count) { + printf(" render: %f ms, upload: %f ms, download: %f ms\n", + 1e-6 * vf->render_sum / vf->render_count, + vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0, + vf->download_count ? 
(1e-6 * vf->download_sum / vf->download_count) : 0.0); + } + + for (int i = 0; i < POOLSIZE; i++) { + if (images[i].associated_buf) { + api2_free(vf, images[i].associated_buf); + } else { + // This is what we originally malloc'd + free(images[i].planes[0].data); + } + } + + uninit(vf); +} + +struct image *get_image(void) +{ + if (api2_frames_in == FRAMES) + return NULL; // simulate EOF, to avoid queueing up "extra" work + + // if we can find a free (unlocked) image, give it that + for (int i = 0; i < POOLSIZE; i++) { + if (refcount[i] == 0) { + api2_frames_in++; + return &images[i]; + } + } + + return NULL; // no free image available +} + +void put_image(struct image *img) +{ + (void)img; + api2_frames_out++; +} + +void image_lock(struct image *img) +{ + int index = img - images; // cheat, for lack of having actual image management + refcount[index]++; +} + +void image_unlock(struct image *img) +{ + int index = img - images; + refcount[index]--; +} + +int main(void) +{ + printf("Running benchmarks...\n"); + api1_example(); + api2_example(); + return 0; +} |