summaryrefslogtreecommitdiffstats
path: root/demos/video-filtering.c
blob: 5881c2850e6ee7ee07c0ed9841dbd5b6755ad73a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
/* Presented are two hypothetical scenarios of how one might use libplacebo
 * as something like an FFmpeg or mpv video filter. We examine two example
 * APIs (loosely modeled after real video filtering APIs) and how each style
 * would like to use libplacebo.
 *
 * For sake of a simple example, let's assume this is a debanding filter.
 * For those of you too lazy to compile/run this file but still want to see
 * results, these are from my machine (RX 5700 XT + 1950X, as of 2020-05-25):
 *
 * RADV+ACO:
 *   api1: 10000 frames in 16.328440 s => 1.632844 ms/frame (612.43 fps)
 *         render: 0.113524 ms, upload: 0.127551 ms, download: 0.146097 ms
 *   api2: 10000 frames in 5.335634 s => 0.533563 ms/frame (1874.19 fps)
 *         render: 0.064378 ms, upload: 0.000000 ms, download: 0.189719 ms
 *
 * AMDVLK:
 *   api1: 10000 frames in 14.921859 s => 1.492186 ms/frame (670.16 fps)
 *         render: 0.110603 ms, upload: 0.114412 ms, download: 0.115375 ms
 *   api2: 10000 frames in 4.667386 s => 0.466739 ms/frame (2142.53 fps)
 *         render: 0.030781 ms, upload: 0.000000 ms, download: 0.075237 ms
 *
 * You can see that AMDVLK is still better at doing texture streaming than
 * RADV - this is because as of writing RADV still does not support
 * asynchronous texture queues / DMA engine transfers. If we disable the
 * `async_transfer` option with AMDVLK we get this:
 *
 *   api1: 10000 frames in 16.087723 s => 1.608772 ms/frame (621.59 fps)
 *         render: 0.111154 ms, upload: 0.122476 ms, download: 0.133162 ms
 *   api2: 10000 frames in 6.344959 s => 0.634496 ms/frame (1576.05 fps)
 *         render: 0.031307 ms, upload: 0.000000 ms, download: 0.083520 ms
 *
 * License: CC0 / Public Domain
 */

#include <assert.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#include "common.h"
#include "pl_clock.h"
#include "pl_thread.h"

#ifdef _WIN32
#include <windows.h>
#endif

#include <libplacebo/dispatch.h>
#include <libplacebo/shaders/sampling.h>
#include <libplacebo/utils/upload.h>
#include <libplacebo/vulkan.h>

///////////////////////
/// API definitions ///
///////////////////////

// Stuff that would be common to each API

void *init(void);        // allocate + set up all GPU state; returns NULL on failure
void uninit(void *priv); // tear everything down again (also used on partial init)

struct format {
    // For simplicity let's make a few assumptions here, since configuring the
    // texture format is not the point of this example. (In practice you can
    // go nuts with the `utils/upload.h` helpers)
    //
    // - All formats contain unsigned integers only
    // - All components have the same size in bits
    // - All components are in the "canonical" order
    // - All formats have power of two sizes only (2 or 4 components, not 3)
    // - All plane strides are a multiple of the pixel size
    int num_comps; // number of components per pixel (1, 2 or 4, see above)
    int bitdepth;  // bits per component
};

struct plane {
    int subx, suby; // subsampling shift
    struct format fmt;
    size_t stride;  // bytes per row (a multiple of the pixel size, see above)
    void *data;     // pointer to the first pixel of this plane
};

#define MAX_PLANES 4

struct image {
    int width, height; // dimensions of the full-resolution plane
    int num_planes;
    struct plane planes[MAX_PLANES];

    // For API #2, the associated mapped buffer (if any)
    struct api2_buf *associated_buf;
};


// Example API design #1: synchronous, blocking, double-copy (bad!)
//
// In this API, `api1_filter` must immediately return with the new data.
// This prevents parallelism on the GPU and should be avoided if possible,
// but sometimes that's what you have to work with. So this is what it
// would look like.
//
// Also, let's assume this API design reconfigures the filter chain (using
// a blank `proxy` image) every time the image format or dimensions change,
// and doesn't expect us to fail due to format mismatches or resource
// exhaustion afterwards.

bool api1_reconfig(void *priv, const struct image *proxy);
bool api1_filter(void *priv, struct image *dst, struct image *src);


// Example API design #2: asynchronous, streaming, queued, zero-copy (good!)
//
// In this API, `api2_process` will run by the calling code every so often
// (e.g. when new data is available or expected). This function has access
// to non-blocking functions `get_image` and `put_image` that interface
// with the video filtering engine's internal queueing system.
//
// This API is also designed to feed multiple frames ahead of time, i.e.
// it will feed us as many frames as it can while we're still returning
// `API2_WANT_MORE`. To drain the filter chain, it would continue running
// the process function until `API2_HAVE_MORE` is no longer present
// in the output.
//
// This API is also designed to do zero-copy where possible. When it wants
// to create a data buffer of a given size, it will call our function
// `api2_alloc` which will return a buffer that we can process directly.
// We can use this to do zero-copy uploading to the GPU, by creating
// host-visible persistently mapped buffers. In order to prevent the video
// filtering system from re-using our buffers while copies are happening, we
// use special functions `image_lock` and `image_unlock` to increase a
// refcount on the image's backing storage. (As is typical of such APIs)
//
// Finally, this API is designed to be fully dynamic: The image parameters
// could change at any time, and we must be equipped to handle that.

enum api2_status {
    // Negative values are used to signal error conditions
    API2_ERR_FMT = -2,          // incompatible / unsupported format
    API2_ERR_UNKNOWN = -1,      // some other error happened
    API2_OK = 0,                // no error, no status - everything's good

    // Positive values represent a mask of status conditions
    API2_WANT_MORE = (1 << 0),  // we want more frames, please feed some more!
    API2_HAVE_MORE = (1 << 1),  // we have more frames but they're not ready
};

enum api2_status api2_process(void *priv);

// Functions for creating persistently mapped buffers
struct api2_buf {
    void *data;  // host-visible mapping of the buffer contents
    size_t size; // size of `data` in bytes
    void *priv;  // implementation detail (here: the backing pl_buf handle)
};

bool api2_alloc(void *priv, size_t size, struct api2_buf *out);
void api2_free(void *priv, const struct api2_buf *buf);

// These functions are provided by the API. The exact details of how images
// are enqueued, dequeued and locked are not really important here, so just
// do something unrealistic but simple to demonstrate with.
struct image *get_image(void);        // dequeue the next input image (or NULL)
void put_image(struct image *img);    // hand a finished output image back
void image_lock(struct image *img);   // bump the backing storage's refcount
void image_unlock(struct image *img); // drop it again


/////////////////////////////////
/// libplacebo implementation ///
/////////////////////////////////


// For API #2:
#define PARALLELISM 8 // ring buffer size, i.e. max. number of frames in flight

// One independent unit of in-flight work (API #2). Each entry owns its own
// textures and download buffer, so the GPU has no cross-entry dependencies.
struct entry {
    pl_buf buf; // to stream the download
    pl_tex tex_in[MAX_PLANES];  // per-plane upload targets
    pl_tex tex_out[MAX_PLANES]; // per-plane render targets (FBOs)
    struct image image; // the resulting output image; its planes point into `buf`

    // For entries that are associated with a held image, so we can unlock them
    // as soon as possible
    struct image *held_image;
    pl_buf held_buf; // the upload buffer backing `held_image`
};

// For both APIs:
struct priv {
    pl_log log;
    pl_vulkan vk;
    pl_gpu gpu; // convenience alias for vk->gpu
    pl_dispatch dp;
    pl_shader_obj dither_state; // shared dither LUT, reused across dispatches

    // Timer objects
    pl_timer render_timer;
    pl_timer upload_timer;
    pl_timer download_timer;
    uint64_t render_sum;   // accumulated timer results (ns)
    uint64_t upload_sum;
    uint64_t download_sum;
    int render_count;      // number of results folded into each sum
    int upload_count;
    int download_count;

    // API #1: A simple pair of input and output textures
    pl_tex tex_in[MAX_PLANES];
    pl_tex tex_out[MAX_PLANES];

    // API #2: A ring buffer of textures/buffers for streaming
    int idx_in;  // points to the next free entry
    int idx_out; // points to the first entry still in progress
    struct entry entries[PARALLELISM];
};

// Create and initialize all shared state: log, vulkan context, GPU handle,
// shader dispatch object and timers. Returns NULL on failure.
void *init(void) {
    struct priv *p = calloc(1, sizeof(struct priv));
    if (!p)
        return NULL;

    // NOTE(review): the result of pl_log_create is not checked here;
    // presumably the pl_*_create calls below tolerate a NULL log - confirm
    p->log = pl_log_create(PL_API_VER, pl_log_params(
        .log_cb = pl_log_simple,
        .log_level = PL_LOG_WARN,
    ));

    p->vk = pl_vulkan_create(p->log, pl_vulkan_params(
        // Note: This is for API #2. In API #1 you could just pass params=NULL
        // and it wouldn't really matter much.
        .async_transfer = true,
        .async_compute = true,
        .queue_count = PARALLELISM,
    ));

    if (!p->vk) {
        fprintf(stderr, "Failed creating vulkan context\n");
        goto error;
    }

    // Give this a shorter name for convenience
    p->gpu = p->vk->gpu;

    p->dp = pl_dispatch_create(p->log, p->gpu);
    if (!p->dp) {
        fprintf(stderr, "Failed creating shader dispatch object\n");
        goto error;
    }

    // Timer creation results are not checked; presumably NULL timers are
    // tolerated by the query/dispatch paths - confirm against libplacebo docs
    p->render_timer = pl_timer_create(p->gpu);
    p->upload_timer = pl_timer_create(p->gpu);
    p->download_timer = pl_timer_create(p->gpu);

    return p;

error:
    // `p` was zero-initialized, so uninit() only destroys what was created
    uninit(p);
    return NULL;
}

void uninit(void *priv)
{
    struct priv *p = priv;

    // API #1
    for (int i = 0; i < MAX_PLANES; i++) {
        pl_tex_destroy(p->gpu, &p->tex_in[i]);
        pl_tex_destroy(p->gpu, &p->tex_out[i]);
    }

    // API #2
    for (int i = 0; i < PARALLELISM; i++) {
        pl_buf_destroy(p->gpu, &p->entries[i].buf);
        for (int j = 0; j < MAX_PLANES; j++) {
            pl_tex_destroy(p->gpu, &p->entries[i].tex_in[j]);
            pl_tex_destroy(p->gpu, &p->entries[i].tex_out[j]);
        }
        if (p->entries[i].held_image)
            image_unlock(p->entries[i].held_image);
    }

    pl_timer_destroy(p->gpu, &p->render_timer);
    pl_timer_destroy(p->gpu, &p->upload_timer);
    pl_timer_destroy(p->gpu, &p->download_timer);

    pl_shader_obj_destroy(&p->dither_state);
    pl_dispatch_destroy(&p->dp);
    pl_vulkan_destroy(&p->vk);
    pl_log_destroy(&p->log);

    free(p);
}

// Translate our API-agnostic `struct image` description into the
// `pl_plane_data` array that libplacebo's upload helpers understand.
static void setup_plane_data(const struct image *img,
                             struct pl_plane_data out[MAX_PLANES])
{
    for (int n = 0; n < img->num_planes; n++) {
        const struct plane *pl = &img->planes[n];
        struct pl_plane_data *dst = &out[n];

        *dst = (struct pl_plane_data) {
            .type = PL_FMT_UNORM,
            .width = img->width >> pl->subx,
            .height = img->height >> pl->suby,
            .pixel_stride = pl->fmt.num_comps * pl->fmt.bitdepth / 8,
            .row_stride = pl->stride,
            .pixels = pl->data,
        };

        // Zero-copy path (API #2): source directly out of the mapped buffer
        // instead of from host memory
        if (img->associated_buf) {
            pl_buf buf = img->associated_buf->priv;
            dst->pixels = NULL;
            dst->buf = buf;
            dst->buf_offset = (uintptr_t) pl->data - (uintptr_t) buf->data;
        }

        // Identity component mapping, all components equally sized
        for (int c = 0; c < pl->fmt.num_comps; c++) {
            dst->component_size[c] = pl->fmt.bitdepth;
            dst->component_pad[c] = 0;
            dst->component_map[c] = c;
        }
    }
}

// Run the actual filter pass for one plane: deband `src` into `dst`, then
// dither down to the target's bit depth so the debanded gradients are
// actually preserved well.
static bool do_plane(struct priv *p, pl_tex dst, pl_tex src)
{
    const int dst_depth = dst->params.format->component_depth[0];

    pl_shader sh = pl_dispatch_begin(p->dp);
    pl_shader_deband(sh, pl_sample_src( .tex = src ), NULL);
    pl_shader_dither(sh, dst_depth, &p->dither_state, NULL);

    return pl_dispatch_finish(p->dp, pl_dispatch_params(
        .shader = &sh,
        .target = dst,
        .timer  = p->render_timer,
    ));
}

static void check_timers(struct priv *p)
{
    uint64_t ret;

    while ((ret = pl_timer_query(p->gpu, p->render_timer))) {
        p->render_sum += ret;
        p->render_count++;
    }

    while ((ret = pl_timer_query(p->gpu, p->upload_timer))) {
        p->upload_sum += ret;
        p->upload_count++;
    }

    while ((ret = pl_timer_query(p->gpu, p->download_timer))) {
        p->download_sum += ret;
        p->download_count++;
    }
}

// API #1 implementation:
//
// In this design, we will create all GPU resources inside `reconfig`, based on
// the texture format configured from the proxy image. This will avoid failing
// later on due to e.g. resource exhaustion or texture format mismatch, and
// thereby falls within the intended semantics of this style of API.

// (Re)create one input/output texture pair per plane, matching the proxy
// image's format and dimensions, so `api1_filter` can never fail due to
// texture creation afterwards.
bool api1_reconfig(void *priv, const struct image *proxy)
{
    struct priv *p = priv;
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(proxy, data);

    for (int n = 0; n < proxy->num_planes; n++) {
        pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[n]);
        if (!fmt) {
            fprintf(stderr, "Failed configuring filter: no good texture format!\n");
            return false;
        }

        // Staging texture the source plane gets uploaded into
        bool ok = pl_tex_recreate(p->gpu, &p->tex_in[n], pl_tex_params(
            .w = data[n].width,
            .h = data[n].height,
            .format = fmt,
            .sampleable = true,
            .host_writable = true,
        ));

        // Render target we process into and read back from
        ok &= pl_tex_recreate(p->gpu, &p->tex_out[n], pl_tex_params(
            .w = data[n].width,
            .h = data[n].height,
            .format = fmt,
            .renderable = true,
            .host_readable = true,
        ));

        if (!ok) {
            fprintf(stderr, "Failed creating GPU textures!\n");
            return false;
        }
    }

    return true;
}

// Synchronous, blocking filter: upload all of `src`, process each plane,
// then download the results into `dst`. The caller is blocked for the full
// round-trip - this is the "bad" API style being benchmarked.
// Assumes `api1_reconfig` was called for this format (textures exist).
bool api1_filter(void *priv, struct image *dst, struct image *src)
{
    struct priv *p = priv;
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(src, data);

    // Upload planes
    for (int i = 0; i < src->num_planes; i++) {
        bool ok = pl_tex_upload(p->gpu, pl_tex_transfer_params(
            .tex = p->tex_in[i],
            .row_pitch = data[i].row_stride,
            .ptr = src->planes[i].data,
            .timer = p->upload_timer,
        ));

        if (!ok) {
            fprintf(stderr, "Failed uploading data to the GPU!\n");
            return false;
        }
    }

    // Process planes
    for (int i = 0; i < src->num_planes; i++) {
        if (!do_plane(p, p->tex_out[i], p->tex_in[i])) {
            fprintf(stderr, "Failed processing planes!\n");
            return false;
        }
    }

    // Download planes (note: `dst` is assumed to have the same plane count
    // and compatible strides as `src`, per the reconfig semantics)
    for (int i = 0; i < src->num_planes; i++) {
        bool ok = pl_tex_download(p->gpu, pl_tex_transfer_params(
            .tex = p->tex_out[i],
            .row_pitch = dst->planes[i].stride,
            .ptr = dst->planes[i].data,
            .timer = p->download_timer,
        ));

        if (!ok) {
            fprintf(stderr, "Failed downloading data from the GPU!\n");
            return false;
        }
    }

    check_timers(p);
    return true;
}


// API #2 implementation:
//
// In this implementation we maintain a queue (implemented as ring buffer)
// of "work entries", which are isolated structs that hold independent GPU
// resources - so that the GPU has no cross-entry dependencies on any of the
// textures or other resources. (Side note: It still has a dependency on the
// dither state, but this is just a shared LUT anyway)

// Align up to the nearest multiple of a power of two
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))

// Submit one frame's worth of work into ring buffer entry `e`: upload `img`,
// dispatch the filter pass per plane, and queue an asynchronous download of
// the result into a freshly (re)created host-mapped buffer. Fills `e->image`
// with plane pointers into that buffer. Returns API2_OK or a negative error.
static enum api2_status submit_work(struct priv *p, struct entry *e,
                                    struct image *img)
{
    // If the image comes from a mapped buffer, we have to take a lock
    // while our upload is in progress
    if (img->associated_buf) {
        assert(!e->held_image);
        image_lock(img);
        e->held_image = img;
        e->held_buf = img->associated_buf->priv;
    }

    // Upload this image's data
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(img, data);

    for (int i = 0; i < img->num_planes; i++) {
        pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
        if (!fmt)
            return API2_ERR_FMT;

        // FIXME: can we plumb a `pl_timer` in here somehow?
        if (!pl_upload_plane(p->gpu, NULL, &e->tex_in[i], &data[i]))
            return API2_ERR_UNKNOWN;

        // Re-create the target FBO as well with this format if necessary
        bool ok = pl_tex_recreate(p->gpu, &e->tex_out[i], pl_tex_params(
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .renderable = true,
            .host_readable = true,
        ));
        if (!ok)
            return API2_ERR_UNKNOWN;
    }

    // Dispatch the work for this image
    for (int i = 0; i < img->num_planes; i++) {
        if (!do_plane(p, e->tex_out[i], e->tex_in[i]))
            return API2_ERR_UNKNOWN;
    }

    // Set up the resulting `struct image` that will hold our target
    // data. We just copy the format etc. from the source image
    memcpy(&e->image, img, sizeof(struct image));

    // Compute a packed layout (per-plane offset/stride) for the download
    // buffer, honoring the GPU's preferred transfer alignments
    size_t offset[MAX_PLANES], stride[MAX_PLANES], total_size = 0;
    for (int i = 0; i < img->num_planes; i++) {
        // For performance, we want to make sure we align the stride
        // to a multiple of the GPU's preferred texture transfer stride
        // (This is entirely optional)
        stride[i] = ALIGN2(img->planes[i].stride,
                           p->gpu->limits.align_tex_xfer_pitch);
        int height = img->height >> img->planes[i].suby;

        // Round up the offset to the nearest multiple of the optimal
        // transfer alignment. (This is also entirely optional)
        offset[i] = ALIGN2(total_size, p->gpu->limits.align_tex_xfer_offset);
        total_size = offset[i] + stride[i] * height;
    }

    // Dispatch the asynchronous download into a mapped buffer
    bool ok = pl_buf_recreate(p->gpu, &e->buf, pl_buf_params(
        .size = total_size,
        .host_mapped = true,
    ));
    if (!ok)
        return API2_ERR_UNKNOWN;

    for (int i = 0; i < img->num_planes; i++) {
        ok = pl_tex_download(p->gpu, pl_tex_transfer_params(
            .tex = e->tex_out[i],
            .row_pitch = stride[i],
            .buf = e->buf,
            .buf_offset = offset[i],
            .timer = p->download_timer,
        ));
        if (!ok)
            return API2_ERR_UNKNOWN;

        // Update the output fields
        e->image.planes[i].data = e->buf->data + offset[i];
        e->image.planes[i].stride = stride[i];
    }

    // Make sure this work starts processing in the background, and especially
    // so we can move on to the next queue on the GPU
    pl_gpu_flush(p->gpu);
    return API2_OK;
}

// Drive the ring buffer: release held input buffers, dequeue finished
// frames (in FIFO order), then submit new work until the queue is full.
// One slot is always kept free, so `idx_in == idx_out` unambiguously
// means "queue empty".
enum api2_status api2_process(void *priv)
{
    struct priv *p = priv;
    enum api2_status ret = 0;

    // Opportunistically release any held images. We do this across the ring
    // buffer, rather than doing this as part of the following loop, because
    // we want to release images ahead-of-time (no FIFO constraints)
    for (int i = 0; i < PARALLELISM; i++) {
        struct entry *e = &p->entries[i];
        if (e->held_image && !pl_buf_poll(p->gpu, e->held_buf, 0)) {
            // upload buffer is no longer in use, release it
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }
    }

    // Poll the status of existing entries and dequeue the ones that are done
    while (p->idx_out != p->idx_in) {
        struct entry *e = &p->entries[p->idx_out];
        if (pl_buf_poll(p->gpu, e->buf, 0))
            break; // still busy; entries complete in order, so stop here

        if (e->held_image) {
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }

        // download buffer is no longer busy, dequeue the frame
        put_image(&e->image);
        p->idx_out = (p->idx_out + 1) % PARALLELISM;
    }

    // Fill up the queue with more work, leaving one slot free (see above)
    int last_free_idx = (p->idx_out ? p->idx_out : PARALLELISM) - 1;
    while (p->idx_in != last_free_idx) {
        struct image *img = get_image();
        if (!img) {
            ret |= API2_WANT_MORE;
            break;
        }

        enum api2_status err = submit_work(p, &p->entries[p->idx_in], img);
        if (err < 0)
            return err;

        p->idx_in = (p->idx_in + 1) % PARALLELISM;
    }

    // Entries still in flight => more output is pending
    if (p->idx_out != p->idx_in)
        ret |= API2_HAVE_MORE;

    return ret;
}

// Allocate a persistently mapped, host-visible GPU buffer for zero-copy
// uploads. Fails (returns false) if the GPU can't do buffer transfers or
// can't map a buffer of this size, letting the caller fall back to malloc.
bool api2_alloc(void *priv, size_t size, struct api2_buf *out)
{
    struct priv *p = priv;

    if (!p->gpu->limits.buf_transfer || size > p->gpu->limits.max_mapped_size)
        return false;

    pl_buf buf = pl_buf_create(p->gpu, pl_buf_params(
        .size = size,
        .host_mapped = true,
    ));
    if (!buf)
        return false;

    // Stash the pl_buf handle in `priv` so api2_free can destroy it later
    out->data = buf->data;
    out->size = size;
    out->priv = (void *) buf;
    return true;
}

// Release a buffer previously handed out by api2_alloc
void api2_free(void *priv, const struct api2_buf *buf)
{
    struct priv *p = priv;
    pl_buf handle = buf->priv; // the pl_buf stashed by api2_alloc
    pl_buf_destroy(p->gpu, &handle);
}


////////////////////////////////////
/// Proof of Concept / Benchmark ///
////////////////////////////////////

#define FRAMES 10000 // number of frames to benchmark per API

// Let's say we're processing a 1920x1080 4:2:0 8-bit NV12 video, arbitrarily
// with a stride aligned to 256 bytes. (For no particular reason)
#define TEXELSZ sizeof(uint8_t)
#define WIDTH   1920
#define HEIGHT  1080
#define STRIDE  (ALIGN2(WIDTH, 256) * TEXELSZ)
// Subsampled planes
#define SWIDTH  (WIDTH >> 1)
#define SHEIGHT (HEIGHT >> 1)
#define SSTRIDE (ALIGN2(SWIDTH, 256) * TEXELSZ)
// Plane offsets / sizes
#define SIZE0   (HEIGHT * STRIDE)
#define SIZE1   (2 * SHEIGHT * SSTRIDE)
#define OFFSET0 0
#define OFFSET1 SIZE0
#define BUFSIZE (OFFSET1 + SIZE1)

// Skeleton of an example image (plane data pointers get filled in later)
static const struct image example_image = {
    .width = WIDTH,
    .height = HEIGHT,
    .num_planes = 2,
    .planes = {
        {
            // Plane 0: full-resolution luma (Y), one component
            .subx = 0,
            .suby = 0,
            .stride = STRIDE,
            .fmt = {
                .num_comps = 1,
                .bitdepth = 8 * TEXELSZ,
            },
        }, {
            // Plane 1: half-resolution interleaved chroma (CbCr), hence
            // two components and twice the subsampled stride
            .subx = 1,
            .suby = 1,
            .stride = SSTRIDE * 2,
            .fmt = {
                .num_comps = 2,
                .bitdepth = 8 * TEXELSZ,
            },
        },
    },
};

// API #1: Nice and simple (but slow). Benchmarks the synchronous filter
// path: upload, render, download, FRAMES times in a row.
static void api1_example(void)
{
    struct priv *vf = init();
    if (!vf)
        return;

    // Declared up front (NULL-initialized) so all failure paths below can
    // share the same cleanup code; free(NULL) is a no-op
    uint8_t *srcbuf = NULL, *dstbuf = NULL;

    if (!api1_reconfig(vf, &example_image)) {
        fprintf(stderr, "api1: Failed configuring video filter!\n");
        goto done; // previously returned directly here, leaking `vf`
    }

    // Allocate two buffers to hold the example data, and fill the source
    // buffer arbitrarily with a "simple" pattern. (Decoding the data into
    // the buffer is not meant to be part of this benchmark)
    srcbuf = malloc(BUFSIZE);
    dstbuf = malloc(BUFSIZE);
    if (!srcbuf || !dstbuf)
        goto done;

    for (size_t i = 0; i < BUFSIZE; i++)
        srcbuf[i] = i;

    struct image src = example_image, dst = example_image;
    src.planes[0].data = srcbuf + OFFSET0;
    src.planes[1].data = srcbuf + OFFSET1;
    dst.planes[0].data = dstbuf + OFFSET0;
    dst.planes[1].data = dstbuf + OFFSET1;

    const pl_clock_t start = pl_clock_now();

    // Process this dummy frame a bunch of times
    unsigned frames = 0;
    for (frames = 0; frames < FRAMES; frames++) {
        if (!api1_filter(vf, &dst, &src)) {
            fprintf(stderr, "api1: Failed filtering frame... aborting\n");
            break;
        }
    }

    const pl_clock_t stop = pl_clock_now();
    const float secs = pl_clock_diff(stop, start);

    printf("api1: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
           frames, secs, 1000 * secs / frames, frames / secs);

    if (vf->render_count) {
        printf("      render: %f ms, upload: %f ms, download: %f ms\n",
               1e-6 * vf->render_sum / vf->render_count,
               vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
               vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
    }

done:
    free(srcbuf);
    free(dstbuf);
    uninit(vf);
}


// API #2: Pretend we have some fancy pool of images.
#define POOLSIZE (PARALLELISM + 1)

static struct api2_buf buffers[POOLSIZE] = {0}; // mapped GPU buffers (if supported)
static struct image images[POOLSIZE] = {0};     // the pool handed out by get_image
static int refcount[POOLSIZE] = {0};            // lock counts, parallel to `images`
static unsigned api2_frames_in = 0;             // frames handed out via get_image
static unsigned api2_frames_out = 0;            // frames delivered via put_image

// API #2: Benchmarks the asynchronous, zero-copy streaming path, driving
// `api2_process` in a simple event loop until FRAMES frames are delivered.
static void api2_example(void)
{
    struct priv *vf = init();
    if (!vf)
        return;

    // Set up a bunch of dummy images
    for (int i = 0; i < POOLSIZE; i++) {
        uint8_t *data;
        images[i] = example_image;
        if (api2_alloc(vf, BUFSIZE, &buffers[i])) {
            data = buffers[i].data;
            images[i].associated_buf = &buffers[i];
        } else {
            // Fall back in case mapped buffers are unsupported
            fprintf(stderr, "warning: falling back to malloc, may be slow\n");
            data = malloc(BUFSIZE);
            if (!data)
                goto cleanup; // untouched pool slots are zero-initialized
        }
        // Fill the whole buffer with some "data" (like in API #1).
        // Bug fix: this used to write `data[i] = n`, which only ever poked
        // one byte per image and left the rest of the buffer uninitialized
        for (size_t n = 0; n < BUFSIZE; n++)
            data[n] = n;
        images[i].planes[0].data = data + OFFSET0;
        images[i].planes[1].data = data + OFFSET1;
    }

    const pl_clock_t start = pl_clock_now();

    // Just keep driving the event loop regardless of the return status
    // until we reach the critical number of frames. (Good enough for this PoC)
    while (api2_frames_out < FRAMES) {
        enum api2_status ret = api2_process(vf);
        if (ret < 0) {
            fprintf(stderr, "api2: Failed processing... aborting\n");
            break;
        }

        // Sleep a short time (100us) to prevent busy waiting the CPU
        pl_thread_sleep(1e-4);
        check_timers(vf);
    }

    const pl_clock_t stop = pl_clock_now();
    const float secs = pl_clock_diff(stop, start);
    printf("api2: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
           api2_frames_out, secs, 1000 * secs / api2_frames_out,
           api2_frames_out / secs);

    if (vf->render_count) {
        printf("      render: %f ms, upload: %f ms, download: %f ms\n",
               1e-6 * vf->render_sum / vf->render_count,
               vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
               vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
    }

cleanup:
    for (int i = 0; i < POOLSIZE; i++) {
        if (images[i].associated_buf) {
            api2_free(vf, images[i].associated_buf);
        } else {
            // This is what we originally malloc'd (free(NULL) is a no-op)
            free(images[i].planes[0].data);
        }
    }

    uninit(vf);
}

struct image *get_image(void)
{
    // Simulate EOF once the target frame count has been handed out, so we
    // don't queue up "extra" work past the end of the benchmark
    if (api2_frames_in == FRAMES)
        return NULL;

    // Hand out the first image whose refcount says it's free
    for (int i = 0; i < POOLSIZE; i++) {
        if (refcount[i] != 0)
            continue;
        api2_frames_in++;
        return &images[i];
    }

    return NULL; // the whole pool is currently locked
}

void put_image(struct image *img)
{
    // Output frames aren't inspected here; just count the delivery
    (void) img;
    api2_frames_out += 1;
}

void image_lock(struct image *img)
{
    // Derive the pool slot from the pointer itself (cheat, for lack of
    // having actual image management)
    ptrdiff_t idx = img - images;
    refcount[idx] += 1;
}

void image_unlock(struct image *img)
{
    // Same pointer-arithmetic cheat as image_lock
    ptrdiff_t idx = img - images;
    refcount[idx] -= 1;
}

int main(void)
{
    printf("Running benchmarks...\n");
    // Run both API styles back to back so their numbers can be compared
    api1_example();
    api2_example();
    return 0;
}