/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef LIBPLACEBO_GPU_H_
#define LIBPLACEBO_GPU_H_

#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>

#include <libplacebo/common.h>
#include <libplacebo/cache.h>
#include <libplacebo/log.h>

PL_API_BEGIN

// These are not memory managed, and should represent compile-time constants
typedef const char *pl_debug_tag;
#define PL_DEBUG_TAG (__FILE__ ":" PL_TOSTRING(__LINE__))

// Type of a shader input descriptor.
enum pl_desc_type {
    PL_DESC_INVALID = 0,
    PL_DESC_SAMPLED_TEX,    // C: pl_tex*    GLSL: combined texture sampler
                            // (`pl_tex->params.sampleable` must be set)
    PL_DESC_STORAGE_IMG,    // C: pl_tex*    GLSL: storage image
                            // (`pl_tex->params.storable` must be set)
    PL_DESC_BUF_UNIFORM,    // C: pl_buf*    GLSL: uniform buffer
                            // (`pl_buf->params.uniform` must be set)
    PL_DESC_BUF_STORAGE,    // C: pl_buf*    GLSL: storage buffer
                            // (`pl_buf->params.storable` must be set)
    PL_DESC_BUF_TEXEL_UNIFORM,// C: pl_buf*  GLSL: uniform samplerBuffer
                              // (`pl_buf->params.uniform` and `format` must be set)
    PL_DESC_BUF_TEXEL_STORAGE,// C: pl_buf*  GLSL: uniform imageBuffer
                              // (`pl_buf->params.storable` and `format` must be set)
    PL_DESC_TYPE_COUNT
};

// This file contains the definition of an API which is designed to abstract
// away from platform-specific APIs like the various OpenGL variants, Direct3D
// and Vulkan in a common way. It is a much more limited API than those APIs,
// since it targets only the small common subset of features needed to
// implement libplacebo's rendering.
//
// NOTE: Most, but not all, parameter conditions (phrases such as "must" or
// "valid usage") are explicitly tested and result in error messages followed
// by graceful failure. Exceptions are noted where they exist.

// Structure which wraps metadata describing GLSL capabilities.
struct pl_glsl_version {
    int version;        // GLSL version (e.g. 450), for #version
    bool gles;          // GLSL ES semantics (ESSL)
    bool vulkan;        // GL_KHR_vulkan_glsl semantics

    // Compute shader support and limits. If `compute` is false, then all
    // of the remaining fields in this section are {0}.
    bool compute;
    size_t max_shmem_size;      // maximum compute shader shared memory size
    uint32_t max_group_threads; // maximum number of local threads per work group
    uint32_t max_group_size[3]; // maximum work group size per dimension

    // If nonzero, signals availability of shader subgroups. This guarantees
    // availability of all of the following extensions:
    // - GL_KHR_shader_subgroup_basic
    // - GL_KHR_shader_subgroup_vote
    // - GL_KHR_shader_subgroup_arithmetic
    // - GL_KHR_shader_subgroup_ballot
    // - GL_KHR_shader_subgroup_shuffle
    uint32_t subgroup_size;

    // Miscellaneous shader limits
    int16_t min_gather_offset;  // minimum `textureGatherOffset` offset
    int16_t max_gather_offset;  // maximum `textureGatherOffset` offset
};

// Backwards compatibility alias
#define pl_glsl_desc pl_glsl_version

// Structure defining the physical limits and capabilities of this GPU
// instance. If a limit is given as 0, that means that feature is unsupported.
struct pl_gpu_limits {
    // --- pl_gpu
    bool thread_safe;           // `pl_gpu` calls are thread-safe
    bool callbacks;             // supports asynchronous GPU callbacks

    // --- pl_buf
    size_t max_buf_size;        // maximum size of any buffer
    size_t max_ubo_size;        // maximum size of a `uniform` buffer
    size_t max_ssbo_size;       // maximum size of a `storable` buffer
    size_t max_vbo_size;        // maximum size of a `drawable` buffer
    size_t max_mapped_size;     // maximum size of a `host_mapped` buffer
    uint64_t max_buffer_texels; // maximum number of texels in a texel buffer
    bool host_cached;           // if true, PL_BUF_MEM_HOST buffers are cached

    // Required alignment for PL_HANDLE_HOST_PTR imports. This is provided
    // merely as a hint to the user. If the host pointer being imported is
    // misaligned, libplacebo will internally round (over-map) the region.
    size_t align_host_ptr;

    // --- pl_tex
    uint32_t max_tex_1d_dim;    // maximum width for a 1D texture
    uint32_t max_tex_2d_dim;    // maximum width/height for a 2D texture (required)
    uint32_t max_tex_3d_dim;    // maximum width/height/depth for a 3D texture
    bool blittable_1d_3d;       // supports blittable 1D/3D textures
    bool buf_transfer;          // supports `pl_tex_transfer_params.buf`

    // These don't represent hard limits but indicate performance hints for
    // optimal alignment. For best performance, the corresponding field
    // should be aligned to a multiple of these. They will always be a power
    // of two.
    size_t align_tex_xfer_pitch;    // optimal `pl_tex_transfer_params.row_pitch`
    size_t align_tex_xfer_offset;   // optimal `pl_tex_transfer_params.buf_offset`

    // --- pl_pass
    size_t max_variable_comps;  // maximum components passed in variables
    size_t max_constants;       // maximum `pl_pass_params.num_constants`
    bool array_size_constants;  // specialization constants can be used to size arrays
    size_t max_pushc_size;      // maximum `push_constants_size`
    size_t align_vertex_stride; // alignment of `pl_pass_params.vertex_stride`
    uint32_t max_dispatch[3];   // maximum dispatch size per dimension

    // Note: At least one of `max_variable_comps` or `max_ubo_size` is
    // guaranteed to be nonzero.

    // As a performance hint, the GPU may signal the number of command queues
    // it has for fragment and compute shaders, respectively. Users may use
    // this information to decide the appropriate type of shader to dispatch.
    uint32_t fragment_queues;
    uint32_t compute_queues;
};

// Backwards compatibility aliases
#define max_xfer_size max_buf_size
#define align_tex_xfer_stride align_tex_xfer_pitch

// Some `pl_gpu` operations allow sharing GPU resources with external APIs -
// examples include interop with other graphics APIs such as CUDA, and also
// various hardware decoding APIs. This defines the mechanism underpinning the
// communication of such an interoperation.
typedef uint64_t pl_handle_caps;
enum pl_handle_type {
    PL_HANDLE_FD        = (1 << 0), // `int fd` for POSIX-style APIs
    PL_HANDLE_WIN32     = (1 << 1), // `HANDLE` for win32 API
    PL_HANDLE_WIN32_KMT = (1 << 2), // `HANDLE` for pre-Windows-8 win32 API
    PL_HANDLE_DMA_BUF   = (1 << 3), // `int fd` for a dma_buf fd
    PL_HANDLE_HOST_PTR  = (1 << 4), // `void *` for a host-allocated pointer
    PL_HANDLE_MTL_TEX   = (1 << 5), // `MTLTexture*` for Apple platforms
    PL_HANDLE_IOSURFACE = (1 << 6), // `IOSurfaceRef` for Apple platforms
};

struct pl_gpu_handle_caps {
    pl_handle_caps tex;  // supported handles for `pl_tex` + `pl_shared_mem`
    pl_handle_caps buf;  // supported handles for `pl_buf` + `pl_shared_mem`
    pl_handle_caps sync; // supported handles for `pl_sync` / semaphores
};

// Wrapper for the handle used to communicate a shared resource externally.
// This handle is owned by the `pl_gpu` - if a user wishes to use it in a way
// that takes over ownership (e.g. importing into some APIs), they must clone
// the handle before doing so (e.g. using `dup` for fds). It is important to
// read the external API documentation _very_ carefully as different handle
// types may be managed in different ways. (e.g. CUDA takes ownership of an fd,
// but does not take ownership of a win32 handle).
union pl_handle {
    int fd;         // PL_HANDLE_FD / PL_HANDLE_DMA_BUF
    void *handle;   // PL_HANDLE_WIN32 / PL_HANDLE_WIN32_KMT / PL_HANDLE_MTL_TEX / PL_HANDLE_IOSURFACE
    void *ptr;      // PL_HANDLE_HOST_PTR
};
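
// For example (illustrative sketch only), a buffer exported with PL_HANDLE_FD
// could be handed to an API that takes ownership of the fd by cloning the
// handle first (`external_api_import_fd` is a hypothetical consumer):
//
// int fd = dup(buf->shared_mem.handle.fd); // clone; pl_gpu keeps the original
// if (fd >= 0)
//     external_api_import_fd(fd);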

// Structure encapsulating memory that is shared between libplacebo and the
// user. This memory can be imported into external APIs using the handle.
//
// If the object a `pl_shared_mem` belongs to is destroyed (e.g. via
// `pl_buf_destroy`), the handle becomes undefined, as do the contents of the
// memory it points to, as well as any external API objects imported from it.
struct pl_shared_mem {
    union pl_handle handle;
    size_t size;   // the total size of the memory referenced by this handle
    size_t offset; // the offset of the object within the referenced memory

    // Note: `size` is optional for some APIs and handle types, in particular
    // when importing DMABUFs or D3D11 textures.

    // For PL_HANDLE_DMA_BUF, this specifies the DRM format modifier that
    // describes this resource. Note that when importing `pl_buf`, this must
    // be DRM_FORMAT_MOD_LINEAR. For importing `pl_tex`, it can be any
    // format modifier supported by the implementation.
    uint64_t drm_format_mod;

    // When importing a `pl_tex` of type PL_HANDLE_DMA_BUF, this can be used to
    // set the image stride (AKA pitch) in memory. If left as 0, defaults to
    // the image width/height.
    size_t stride_w;
    size_t stride_h;

    // When importing a `pl_tex` of type PL_HANDLE_MTL_TEX, this determines
    // which plane is imported (0 - 2).
    unsigned plane;
};

// Structure grouping PCI bus address fields for GPU devices
struct pl_gpu_pci_address {
    uint32_t domain;
    uint32_t bus;
    uint32_t device;
    uint32_t function;
};

typedef const struct pl_fmt_t *pl_fmt;

// Abstract device context which wraps an underlying graphics context and can
// be used to dispatch rendering commands.
//
// Thread-safety: Depends on `pl_gpu_limits.thread_safe`
typedef const struct pl_gpu_t {
    pl_log log;

    struct pl_glsl_version glsl; // GLSL features supported by this GPU
    struct pl_gpu_limits limits; // physical device limits and capabilities

    // Fields relevant to external API interop. If the underlying device does
    // not support interop with other APIs, these will all be {0}.
    struct pl_gpu_handle_caps export_caps; // supported handles for exporting
    struct pl_gpu_handle_caps import_caps; // supported handles for importing
    uint8_t uuid[16];                      // underlying device UUID

    // Supported texture formats, in preference order. (If there are multiple
    // similar formats, the "better" ones come first)
    pl_fmt *formats;
    int num_formats;

    // PCI Bus address of the underlying device, to help with interop.
    // This will only be filled in if interop is supported.
    struct pl_gpu_pci_address pci;
} *pl_gpu;

// Attach a pl_cache object to this GPU instance. This cache will be
// used to cache all compiled shaders, as well as several other shader objects
// (e.g. cached 3DLUTs). Calling this with `cache = NULL` disables the cache.
//
// Note: Calling this after shaders have already been compiled will not
// retroactively add those shaders to the cache, so it's recommended to set
// this early, before creating any passes.
PL_API void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache);
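
// A minimal usage sketch (illustrative only, assuming the `pl_cache_create` /
// `pl_cache_destroy` API from <libplacebo/cache.h>):
//
// pl_cache cache = pl_cache_create(pl_cache_params( .log = gpu->log ));
// pl_gpu_set_cache(gpu, cache);
// // ... create passes, render ...
// pl_gpu_set_cache(gpu, NULL);
// pl_cache_destroy(&cache);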

enum pl_fmt_type {
    PL_FMT_UNKNOWN = 0, // also used for inconsistent multi-component formats
    PL_FMT_UNORM,       // unsigned, normalized integer format (sampled as float)
    PL_FMT_SNORM,       // signed, normalized integer format (sampled as float)
    PL_FMT_UINT,        // unsigned integer format (sampled as integer)
    PL_FMT_SINT,        // signed integer format (sampled as integer)
    PL_FMT_FLOAT,       // (signed) float formats, any bit size
    PL_FMT_TYPE_COUNT,
};

enum pl_fmt_caps {
    PL_FMT_CAP_SAMPLEABLE    = 1 << 0,  // may be sampled from (PL_DESC_SAMPLED_TEX)
    PL_FMT_CAP_STORABLE      = 1 << 1,  // may be used as storage image (PL_DESC_STORAGE_IMG)
    PL_FMT_CAP_LINEAR        = 1 << 2,  // may be linearly sampled from (PL_TEX_SAMPLE_LINEAR)
    PL_FMT_CAP_RENDERABLE    = 1 << 3,  // may be rendered to (pl_pass_params.target_fmt)
    PL_FMT_CAP_BLENDABLE     = 1 << 4,  // may be blended to (pl_pass_params.enable_blend)
    PL_FMT_CAP_BLITTABLE     = 1 << 5,  // may be blitted from/to (pl_tex_blit)
    PL_FMT_CAP_VERTEX        = 1 << 6,  // may be used as a vertex attribute
    PL_FMT_CAP_TEXEL_UNIFORM = 1 << 7,  // may be used as a texel uniform buffer
    PL_FMT_CAP_TEXEL_STORAGE = 1 << 8,  // may be used as a texel storage buffer
    PL_FMT_CAP_HOST_READABLE = 1 << 9,  // may be used with `host_readable` textures
    PL_FMT_CAP_READWRITE     = 1 << 10, // may be used with PL_DESC_ACCESS_READWRITE

    // Notes:
    // - PL_FMT_CAP_LINEAR also implies PL_FMT_CAP_SAMPLEABLE
    // - PL_FMT_CAP_STORABLE also implies `pl_gpu.glsl.compute`
    // - PL_FMT_CAP_BLENDABLE implies PL_FMT_CAP_RENDERABLE
    // - PL_FMT_CAP_VERTEX implies that the format is non-opaque
    // - PL_FMT_CAP_HOST_READABLE implies that the format is non-opaque
};

struct pl_fmt_plane {
    // Underlying format of this particular sub-plane. This describes the
    // components, texel size and host representation for the purpose of
    // e.g. transfers, blits, and sampling.
    pl_fmt format;

    // X/Y subsampling shift factor for this plane.
    uint8_t shift_x, shift_y;
};

// Structure describing a texel/vertex format.
struct pl_fmt_t {
    const char *name;       // symbolic name for this format (e.g. rgba32f)
    uint64_t signature;     // unique but stable signature (for pass reusability)

    enum pl_fmt_type type;  // the format's data type and interpretation
    enum pl_fmt_caps caps;  // the features supported by this format
    int num_components;     // number of components for this format
    int component_depth[4]; // meaningful bits per component, texture precision
    size_t internal_size;   // internal texel size (for blit compatibility)

    // For planar formats, this provides a description of each sub-plane.
    //
    // Note on planar formats: Planar formats are always opaque and typically
    // support only a limited subset of capabilities (or none at all). Access
    // should be done via sub-planes. (See `pl_tex.planes`)
    struct pl_fmt_plane planes[4];
    int num_planes;         // or 0 for non-planar textures

    // This controls the relationship between the data as seen by the host and
    // the way it's interpreted by the texture. The host representation is
    // always tightly packed (no padding bits in between each component).
    //
    // This representation assumes little endian ordering, i.e. components
    // being ordered from LSB to MSB in memory. Note that for oddly packed
    // formats like rgb10a2 or rgb565, this is inconsistent with the naming.
    // (That is to say, rgb565 has sample order {2, 1, 0} under this convention
    // - because rgb565 treats the R channel as the *most* significant bits)
    //
    // If `opaque` is true, then there's no meaningful correspondence between
    // the two, and all of the remaining fields in this section are unset.
    //
    // If `emulated` is true, then this format doesn't actually exist on the
    // GPU as an uploadable texture format - and any apparent support is being
    // emulated (typically using compute shaders in the upload path).
    bool opaque;
    bool emulated;
    size_t texel_size;      // total size in bytes per texel
    size_t texel_align;     // texel alignment requirements (bytes)
    int host_bits[4];       // number of meaningful bits in host memory
    int sample_order[4];    // sampled index for each component, e.g.
                            // {2, 1, 0, 3} for BGRA textures

    // For sampleable formats, this bool indicates whether or not the format
    // is compatible with `textureGather()`
    bool gatherable;

    // If usable as a vertex or texel buffer format, this gives the GLSL type
    // corresponding to the data. (e.g. vec4)
    const char *glsl_type;

    // If usable as a storage image or texel storage buffer
    // (PL_FMT_CAP_STORABLE / PL_FMT_CAP_TEXEL_STORAGE), this gives the GLSL
    // texel format corresponding to the format (e.g. rgba16ui), if any. This
    // field may be NULL, in which case the format modifier may be left
    // unspecified.
    const char *glsl_format;

    // If available, this gives the fourcc associated with the host
    // representation. In particular, this is intended for use with
    // PL_HANDLE_DMA_BUF, where this field will match the DRM format from
    // <drm_fourcc.h>. May be 0, for formats without matching DRM fourcc.
    uint32_t fourcc;

    // If `fourcc` is set, this contains the list of supported drm format
    // modifiers for this format.
    const uint64_t *modifiers;
    int num_modifiers;
};

// Returns whether or not a pl_fmt's components are ordered sequentially
// in memory in the order RGBA.
PL_API bool pl_fmt_is_ordered(pl_fmt fmt);

// Returns whether or not a pl_fmt is sampled as a float (e.g. UNORM)
PL_API bool pl_fmt_is_float(pl_fmt fmt);

// Returns whether or not a pl_fmt supports a given DRM modifier.
PL_API bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier);

// Helper function to find a format with a given number of components and
// minimum effective precision per component. If `host_bits` is set, then the
// format will always be non-opaque, unpadded, ordered and have exactly this
// bit depth for each component. Finally, all `caps` must be supported.
PL_API pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components,
                          int min_depth, int host_bits, enum pl_fmt_caps caps);
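
// For example, a caller that wants a sampleable, linearly filterable 8-bit
// RGBA format could use something like this (illustrative sketch only):
//
// pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 8, 8,
//                          PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR);
// if (!fmt)
//     /* no suitable format available on this GPU */;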

// Finds a vertex format for a given configuration. The resulting vertex will
// have a component depth equal to the sizeof() of the corresponding host type.
// (e.g. PL_FMT_FLOAT will always have sizeof(float))
PL_API pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components);

// Find a format based on its name.
PL_API pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name);

// Find a format based on its fourcc.
PL_API pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc);

// A generic 'timer query' object. These can be used to measure an
// approximation of the GPU execution time of a given operation. Due to the
// highly asynchronous nature of GPUs, the actual results of any individual
// timer query may be delayed by quite a bit. As such, users should avoid
// trying to pair any particular GPU command with any particular timer query
// result, and only reuse `pl_timer` objects with identical operations. The
// results of timer queries are guaranteed to be in-order, but individual
// queries may be dropped, and some operations might not record timer results
// at all. (For example, if the underlying hardware does not support timer
// queries for a given operation type)
//
// Thread-safety: Unsafe
typedef struct pl_timer_t *pl_timer;

// Creates a new timer object. This may return NULL, for example if the
// implementation does not support timers, but since passing NULL to
// `pl_timer_destroy` and `pl_timer_query` is safe, users generally need not
// concern themselves with handling this.
PL_API pl_timer pl_timer_create(pl_gpu gpu);
PL_API void pl_timer_destroy(pl_gpu gpu, pl_timer *);

// Queries any results that have been measured since the last execution of
// `pl_timer_query`. There may be more than one result, in which case the user
// should simply call the function again to get the subsequent values. This
// function returns a value of 0 in the event that there are no more
// unprocessed results.
//
// The results are reported in nanoseconds, but the actual precision of the
// timestamp queries may be significantly lower.
//
// Note: Results do not queue up indefinitely. Generally, the implementation
// will only keep track of a small, fixed number of results internally. Make
// sure to include this function as part of your main rendering loop to process
// all of its results, or older results will be overwritten by newer ones.
PL_API uint64_t pl_timer_query(pl_gpu gpu, pl_timer);
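
// A minimal usage sketch (illustrative only; the work being timed is left
// abstract, e.g. a `pl_tex_upload` with its `timer` field set):
//
// pl_timer timer = pl_timer_create(gpu);
// while (rendering) {
//     // ... dispatch GPU work associated with `timer` ...
//     uint64_t ns;
//     while ((ns = pl_timer_query(gpu, timer)))
//         printf("operation took %.3f ms\n", ns * 1e-6);
// }
// pl_timer_destroy(gpu, &timer);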

enum pl_buf_mem_type {
    PL_BUF_MEM_AUTO = 0, // use whatever seems most appropriate
    PL_BUF_MEM_HOST,     // try allocating from host memory (RAM)
    PL_BUF_MEM_DEVICE,   // try allocating from device memory (VRAM)
    PL_BUF_MEM_TYPE_COUNT,

    // Note: This distinction only matters for discrete GPUs
};

// Structure describing a buffer.
struct pl_buf_params {
    size_t size;        // size in bytes (must be <= `pl_gpu_limits.max_buf_size`)
    bool host_writable; // contents may be updated via pl_buf_write()
    bool host_readable; // contents may be read back via pl_buf_read()
    bool host_mapped;   // create a persistent, RW mapping (pl_buf.data)

    // May be used as PL_DESC_BUF_UNIFORM or PL_DESC_BUF_TEXEL_UNIFORM.
    // Requires `size <= pl_gpu_limits.max_ubo_size`
    bool uniform;

    // May be used as PL_DESC_BUF_STORAGE or PL_DESC_BUF_TEXEL_STORAGE.
    // Requires `size <= pl_gpu_limits.max_ssbo_size`
    bool storable;

    // May be used as the source of vertex data for `pl_pass_run`.
    bool drawable;

    // Provide a hint for the memory type you want to use when allocating
    // this buffer's memory.
    //
    // Note: Restrictions may apply depending on the usage flags. In
    // particular, allocating buffers with `uniform` or `storable` enabled from
    // non-device memory will almost surely fail.
    enum pl_buf_mem_type memory_type;

    // Setting this to a format with the `PL_FMT_CAP_TEXEL_*` capability allows
    // this buffer to be used as a `PL_DESC_BUF_TEXEL_*`, when `uniform` and
    // `storable` are respectively also enabled.
    pl_fmt format;

    // At most one of `export_handle` and `import_handle` can be set for a
    // buffer.

    // Setting this indicates that the memory backing this buffer should be
    // shared with external APIs. If so, this must be exactly *one* of
    // `pl_gpu.export_caps.buf`.
    enum pl_handle_type export_handle;

    // Setting this indicates that the memory backing this buffer will be
    // imported from an external API. If so, this must be exactly *one* of
    // `pl_gpu.import_caps.buf`.
    enum pl_handle_type import_handle;

    // If the shared memory is being imported, the import handle must be
    // specified here. Otherwise, this is ignored.
    struct pl_shared_mem shared_mem;

    // If non-NULL, the buffer will be created with these contents. Otherwise,
    // the initial data is undefined. Using this does *not* require setting
    // host_writable.
    const void *initial_data;

    // Arbitrary user data. libplacebo does not use this at all.
    void *user_data;

    // Arbitrary identifying tag. Used only for debugging purposes.
    pl_debug_tag debug_tag;
};

#define pl_buf_params(...) (&(struct pl_buf_params) {   \
        .debug_tag = PL_DEBUG_TAG,                      \
        __VA_ARGS__                                     \
    })

// A generic buffer, which can be used for multiple purposes (texture transfer,
// storage buffer, uniform buffer, etc.)
//
// Note on efficiency: A pl_buf does not necessarily represent a true "buffer"
// object on the underlying graphics API. It may also refer to a sub-slice of
// a larger buffer, depending on the implementation details of the GPU. The
// bottom line is that users do not need to worry about the efficiency of using
// many small pl_buf objects. Having many small pl_bufs, even lots of few-byte
// vertex buffers, is designed to be completely fine.
//
// Thread-safety: Unsafe
typedef const struct pl_buf_t {
    struct pl_buf_params params;
    uint8_t *data; // for persistently mapped buffers, points to the first byte

    // If `params.export_handle` or `params.import_handle` is set, this
    // structure references the shared memory backing this buffer, via the
    // requested handle type.
    //
    // While this buffer is not in an "exported" state, the contents of the
    // memory are undefined. (See: `pl_buf_export`)
    struct pl_shared_mem shared_mem;
} *pl_buf;

// Create a buffer. The type of buffer depends on the parameters. The buffer
// parameters must adhere to the restrictions imposed by the pl_gpu_limits.
// Returns NULL on failure.
//
// For buffers with shared memory, the buffer is considered to be in an
// "exported" state by default, and may be used directly by the external API
// after being created (until the first libplacebo operation on the buffer).
PL_API pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params);
PL_API void pl_buf_destroy(pl_gpu gpu, pl_buf *buf);
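
// A minimal creation sketch (illustrative only), using the `pl_buf_params`
// convenience macro defined above:
//
// float data[4] = {0.0f, 0.25f, 0.5f, 1.0f};
// pl_buf buf = pl_buf_create(gpu, pl_buf_params(
//     .size          = sizeof(data),
//     .uniform       = true,
//     .host_writable = true,
//     .initial_data  = data,
// ));
// if (!buf)
//     /* handle failure */;
// // ... use the buffer ...
// pl_buf_destroy(gpu, &buf);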

// This behaves like `pl_buf_create`, but if the buffer already exists and has
// incompatible parameters, it will get destroyed first. A buffer is considered
// "compatible" if it has the same buffer type and texel format, a size greater
// than or equal to the requested size, and it has a superset of the features
// the user requested. After this operation, the contents of the buffer are
// undefined.
//
// Note: Due to its unpredictability, it's not allowed to use this with
// `params->initial_data` being set. Similarly, it's not allowed on a buffer
// with `params->export_handle`, since this may invalidate the corresponding
// external API's handle. Conversely, it *is* allowed on a buffer with
// `params->host_mapped`, and the corresponding `buf->data` pointer *may*
// change as a result of doing so.
//
// Note: If the `user_data` alone changes, this does not trigger a buffer
// recreation. In theory, this can be used to detect when the buffer ended
// up being recreated.
PL_API bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params);

// Update the contents of a buffer, starting at a given offset (must be a
// multiple of 4) and up to a given size, with the contents of *data.
//
// This function will block until the buffer is no longer in use. Use
// `pl_buf_poll` to perform non-blocking queries of buffer availability.
//
// Note: This function can incur synchronization overhead, so it shouldn't be
// used in tight loops. If you do need to loop (e.g. to perform a strided
// write), consider using host-mapped buffers, or fixing the memory in RAM,
// before calling this function.
PL_API void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset,
                         const void *data, size_t size);

// Read back the contents of a buffer, starting at a given offset, storing the
// data into *dest. Returns whether successful.
//
// This function will block until the buffer is no longer in use. Use
// `pl_buf_poll` to perform non-blocking queries of buffer availability.
PL_API bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset,
                        void *dest, size_t size);
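
// Example (illustrative sketch): update a buffer and read the contents back,
// assuming `buf` was created with both `host_writable` and `host_readable`:
//
// uint32_t in = 123, out = 0;
// pl_buf_write(gpu, buf, 0, &in, sizeof(in)); // blocks while `buf` is in use
// if (pl_buf_read(gpu, buf, 0, &out, sizeof(out)))
//     assert(out == 123);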

// Copy `size` bytes from one buffer to another, reading from and writing to
// the respective offsets.
PL_API void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
                        pl_buf src, size_t src_offset, size_t size);

// Initiates a buffer export operation, allowing a buffer to be accessed by an
// external API. This is only valid for buffers with `params.export_handle` or
// `params.import_handle` set. Calling this twice in a row is a harmless no-op.
// Returns whether successful.
//
// There is no corresponding "buffer import" operation, the next libplacebo
// operation that touches the buffer (e.g. pl_tex_upload, but also pl_buf_write
// and pl_buf_read) will implicitly import the buffer back to libplacebo. Users
// must ensure that all pending operations made by the external API are fully
// completed before using it in libplacebo again. (Otherwise, the behaviour
// is undefined)
//
// Please note that this function returning does not mean the memory is
// immediately available as such. In general, it will mark a buffer as "in use"
// in the same way any other buffer operation would, and it is the user's
// responsibility to wait until `pl_buf_poll` returns false before accessing
// the memory from the external API.
//
// In terms of the access performed by this operation, it is not considered a
// "read" or "write" and therefore does not technically conflict with reads or
// writes to the buffer performed by the host (via mapped memory - any use of
// `pl_buf_read` or `pl_buf_write` would defeat the purpose of the export).
// However, restrictions made by the external API may apply that prevent this.
//
// The recommended use pattern is something like this:
//
// while (loop) {
//    pl_buf buf = get_free_buffer(); // or block on pl_buf_poll
//    // write to the buffer using the external API
//    pl_tex_upload(gpu, /* ... buf ... */); // implicitly imports
//    pl_buf_export(gpu, buf);
// }
//
// i.e. perform an external API operation, then use and immediately export the
// buffer in libplacebo, and finally wait until `pl_buf_poll` is false before
// re-using it in the external API. (Or get a new buffer in the meantime)
PL_API bool pl_buf_export(pl_gpu gpu, pl_buf buf);

// Returns whether or not a buffer is currently "in use". This can either be
// because of a pending read operation, a pending write operation or a pending
// buffer export operation. Any access to the buffer by external APIs or via
// the host pointer (for host-mapped buffers) is forbidden while a buffer is
// "in use". The only exception to this rule is multiple reads, for example
// reading from a buffer with `pl_tex_upload` while simultaneously reading from
// it using mapped memory.
//
// The `timeout`, specified in nanoseconds, indicates how long to block for
// before returning. If set to 0, this function will never block, and only
// returns the current status of the buffer. The actual precision of the
// timeout may be significantly longer than one nanosecond, and has no upper
// bound. This function does not provide hard latency guarantees. This function
// may also return at any time, even if the buffer is still in use. If the user
// wishes to block until the buffer is definitely no longer in use, the
// recommended usage is:
//
// while (pl_buf_poll(gpu, buf, UINT64_MAX))
//      ; // do nothing
//
// Note: libplacebo operations on buffers are always internally synchronized,
// so this is only needed for host-mapped or externally exported buffers.
// However, it may be used to do non-blocking queries before calling blocking
// functions such as `pl_buf_read`.
//
// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly
// synchronized, meaning it can safely be called on a `pl_buf` that is in use
// by another thread.
PL_API bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout);

enum pl_tex_sample_mode {
    PL_TEX_SAMPLE_NEAREST,  // nearest neighbour sampling
    PL_TEX_SAMPLE_LINEAR,   // linear filtering, requires PL_FMT_CAP_LINEAR
    PL_TEX_SAMPLE_MODE_COUNT,
};

enum pl_tex_address_mode {
    PL_TEX_ADDRESS_CLAMP,  // clamp the nearest edge texel
    PL_TEX_ADDRESS_REPEAT, // repeat (tile) the texture
    PL_TEX_ADDRESS_MIRROR, // repeat (mirror) the texture
    PL_TEX_ADDRESS_MODE_COUNT,
};

// Structure describing a texture.
struct pl_tex_params {
    int w, h, d;            // physical dimension; unused dimensions must be 0
    pl_fmt format;

    // The following bools describe what operations can be performed. The
    // corresponding pl_fmt capability must be set for every enabled
    // operation type.
    //
    // Note: For planar formats, it is also possible to set capabilities only
    // supported by sub-planes. In this case, the corresponding functionality
    // will be available for the sub-plane, but not the planar texture itself.
    bool sampleable;    // usable as a PL_DESC_SAMPLED_TEX
    bool renderable;    // usable as a render target (pl_pass_run)
                        // (must only be used with 2D textures)
    bool storable;      // usable as a storage image (PL_DESC_STORAGE_IMG)
    bool blit_src;      // usable as a blit source
    bool blit_dst;      // usable as a blit destination
    bool host_writable; // may be updated with pl_tex_upload()
    bool host_readable; // may be fetched with pl_tex_download()

    // Note: For `blit_src`, `blit_dst`, the texture must either be
    // 2-dimensional or `pl_gpu_limits.blittable_1d_3d` must be set.

    // At most one of `export_handle` and `import_handle` can be set for a
    // texture.

    // Setting this indicates that the memory backing this texture should be
    // shared with external APIs. If so, this must be exactly *one* of
    // `pl_gpu.export_caps.tex`.
    enum pl_handle_type export_handle;

    // Setting this indicates that the memory backing this texture will be
    // imported from an external API. If so, this must be exactly *one* of
    // `pl_gpu.import_caps.tex`. Mutually exclusive with `initial_data`.
    enum pl_handle_type import_handle;

    // If the shared memory is being imported, the import handle must be
    // specified here. Otherwise, this is ignored.
    struct pl_shared_mem shared_mem;

    // If non-NULL, the texture will be created with these contents (tightly
    // packed). Using this does *not* require setting host_writable. Otherwise,
    // the initial data is undefined. Mutually exclusive with `import_handle`.
    const void *initial_data;

    // Arbitrary user data. libplacebo does not use this at all.
    void *user_data;

    // Arbitrary identifying tag. Used only for debugging purposes.
    pl_debug_tag debug_tag;
};

#define pl_tex_params(...) (&(struct pl_tex_params) {   \
        .debug_tag = PL_DEBUG_TAG,                      \
        __VA_ARGS__                                     \
    })

static inline int pl_tex_params_dimension(const struct pl_tex_params params)
{
    return params.d ? 3 : params.h ? 2 : 1;
}

enum pl_sampler_type {
    PL_SAMPLER_NORMAL,      // gsampler2D, gsampler3D etc.
    PL_SAMPLER_RECT,        // gsampler2DRect
    PL_SAMPLER_EXTERNAL,    // gsamplerExternalOES
    PL_SAMPLER_TYPE_COUNT,
};

// Conflates the following typical GPU API concepts:
// - texture itself
// - sampler state
// - staging buffers for texture upload
// - framebuffer objects
// - wrappers for swapchain framebuffers
// - synchronization needed for upload/rendering/etc.
//
// Essentially, a pl_tex can be anything from a normal texture, a wrapped
// external/real framebuffer, a framebuffer object + texture pair, or a mapped
// texture (via pl_hwdec), to other sorts of things that can be sampled from
// and/or rendered to.
//
// Thread-safety: Unsafe
typedef const struct pl_tex_t *pl_tex;
struct pl_tex_t {
    struct pl_tex_params params;

    // If `params.format` is a planar format, this contains `pl_tex` handles
    // encapsulating individual texture planes. Conversely, if this is a
    // sub-plane of a planar texture, `parent` points to the planar texture.
    //
    // Note: Calling `pl_tex_destroy` on sub-planes is undefined behavior.
    pl_tex planes[4];
    pl_tex parent;

    // If `params.export_handle` is set, this structure references the shared
    // memory backing this texture, via the requested handle type.
    //
    // While this texture is not in an "exported" state, the contents of the
    // memory are undefined. (See: `pl_tex_export`)
    //
    // Note: Due to vulkan driver limitations, `shared_mem.drm_format_mod` will
    // currently always be set to DRM_FORMAT_MOD_INVALID. No guarantee can be
    // made about the cross-driver compatibility of textures exported this way.
    struct pl_shared_mem shared_mem;

    // If `params.sampleable` is true, this indicates the correct sampler type
    // to use when sampling from this texture.
    enum pl_sampler_type sampler_type;
};

// Create a texture (with undefined contents). Returns NULL on failure. This is
// assumed to be an expensive/rare operation, and may need to perform memory
// allocation or framebuffer creation.
PL_API pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params);
PL_API void pl_tex_destroy(pl_gpu gpu, pl_tex *tex);
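
// A minimal creation sketch (illustrative only): a 2D sampleable texture that
// can later be filled from host memory via `pl_tex_upload`:
//
// pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 8, 8, PL_FMT_CAP_SAMPLEABLE);
// pl_tex tex = fmt ? pl_tex_create(gpu, pl_tex_params(
//     .w             = 256,
//     .h             = 256,
//     .format        = fmt,
//     .sampleable    = true,
//     .host_writable = true,
// )) : NULL;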

// This works like `pl_tex_create`, but if the texture already exists and has
// incompatible texture parameters, it will get destroyed first. A texture is
// considered "compatible" if it has the same texture format and sample/address
// mode and it supports a superset of the features the user requested.
//
// Even if the texture is not recreated, calling this function will still
// invalidate the contents of the texture. (Note: Because of this,
// `initial_data` may not be used with `pl_tex_recreate`. Doing so is an error)
//
// Note: If the `user_data` alone changes, this does not trigger a texture
// recreation. In theory, this can be used to detect when the texture ended
// up being recreated.
PL_API bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params);

// Invalidates the contents of a texture. After this, the contents are fully
// undefined.
PL_API void pl_tex_invalidate(pl_gpu gpu, pl_tex tex);

union pl_clear_color {
    float f[4];
    int32_t i[4];
    uint32_t u[4];
};

// Clear the dst texture with the given color (rgba). This is functionally
// identical to a blit operation, which means `dst->params.blit_dst` must be
// set.
PL_API void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color);

// Wrapper for `pl_tex_clear_ex` which only works for floating point textures.
PL_API void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]);

struct pl_tex_blit_params {
    // The texture to blit from. Must have `params.blit_src` enabled.
    pl_tex src;

    // The texture to blit to. Must have `params.blit_dst` enabled, and a
    // format that is loosely compatible with `src`. This essentially means
    // that they must have the same `internal_size`. Additionally, UINT
    // textures can only be blitted to other UINT textures, and SINT textures
    // can only be blitted to other SINT textures.
    pl_tex dst;

    // The region of the source texture to blit. Must be within the texture
    // bounds of `src`. May be flipped. (Optional)
    pl_rect3d src_rc;

    // The region of the destination texture to blit into. Must be within the
    // texture bounds of `dst`. May be flipped. Areas outside of `dst_rc` in
    // `dst` are preserved. (Optional)
    pl_rect3d dst_rc;

    // If `src_rc` and `dst_rc` have different sizes, the texture will be
    // scaled using the given texture sampling mode.
    enum pl_tex_sample_mode sample_mode;
};

#define pl_tex_blit_params(...) (&(struct pl_tex_blit_params) { __VA_ARGS__ })

// Copy a sub-rectangle from one texture to another.
PL_API void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params);
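
// Example (illustrative sketch): scale the top-left quadrant of `src` onto
// all of `dst`, assuming both textures have the appropriate `blit_src` /
// `blit_dst` flags and a linearly sampleable format:
//
// pl_tex_blit(gpu, pl_tex_blit_params(
//     .src         = src,
//     .dst         = dst,
//     .src_rc      = { .x1 = src->params.w / 2, .y1 = src->params.h / 2 },
//     .sample_mode = PL_TEX_SAMPLE_LINEAR,
// ));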

// Structure describing a texture transfer operation.
struct pl_tex_transfer_params {
    // Texture to transfer to/from. Depending on the type of the operation,
    // this must have params.host_writable (uploads) or params.host_readable
    // (downloads) set, respectively.
    pl_tex tex;

    // Note: Superfluous parameters are ignored, i.e. for a 1D texture, the y
    // and z fields of `rc`, as well as the corresponding pitches, are ignored.
    // In all other cases, the pitch must be large enough to contain the
    // corresponding dimension of `rc`, and the `rc` must be normalized and
    // fully contained within the image dimensions. Missing fields in the `rc`
    // are inferred from the image size. If unset, the pitch is inferred
    // from `rc` (that is, it's assumed that the data is tightly packed in the
    // buffer). Otherwise, `row_pitch` *must* be a multiple of
    // `tex->params.format->texel_align`, and `depth_pitch` must be a multiple
    // of `row_pitch`.
    pl_rect3d rc;       // region of the texture to transfer
    size_t row_pitch;   // the number of bytes separating image rows
    size_t depth_pitch; // the number of bytes separating image planes

    // An optional timer to report the approximate duration of the texture
    // transfer to. Note that this is only an approximation, since the actual
    // texture transfer may happen entirely in the background (in particular,
    // for implementations with asynchronous transfer capabilities). It's also
    // not guaranteed that all GPUs support this.
    pl_timer timer;

    // An optional callback to fire after the operation completes. If this is
    // specified, then the operation is performed asynchronously. Note that
    // transfers to/from buffers are always asynchronous, even without this
    // field, so it's more useful for `ptr` transfers. (Though it can still be
    // helpful to avoid having to manually poll buffers all the time)
    //
    // When this is *not* specified, uploads from `ptr` are still asynchronous
    // but require a host memcpy, while downloads from `ptr` are blocking. As
    // such, it's recommended to always try using asynchronous texture
    // transfers wherever possible.
    //
    // Note: Requires `pl_gpu_limits.callbacks`
    //
    // Note: Callbacks are implicitly synchronized, meaning that callbacks are
    // guaranteed to never execute concurrently with other callbacks. However,
    // they may execute from any thread that the `pl_gpu` is used on.
    void (*callback)(void *priv);
    void *priv; // arbitrary user data

    // For the data source/target of a transfer operation, there are two valid
    // options:
    //
    // 1. Transferring to/from a buffer: (requires `pl_gpu_limits.buf_transfer`)
    pl_buf buf;         // buffer to use
    size_t buf_offset;  // offset of data within buffer, should be a
                        // multiple of `tex->params.format->texel_size`
    // 2. Transferring to/from host memory directly:
    void *ptr;          // address of data
    bool no_import;     // always use memcpy, bypassing host ptr import

    // Note: The contents of the memory region / buffer must exactly match the
    // texture format; i.e. there is no explicit conversion between formats.
};

#define pl_tex_transfer_params(...) (&(struct pl_tex_transfer_params) { __VA_ARGS__ })

// Upload data to a texture. Returns whether successful.
PL_API bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params);

// Download data from a texture. Returns whether successful.
PL_API bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params);
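
// Example (illustrative sketch): upload tightly packed RGBA8 pixels from host
// memory into a 256x256 texture created with `host_writable` set. Since `rc`
// and the pitches are left unset, they are inferred from the texture size:
//
// uint8_t pixels[256 * 256 * 4] = { /* ... */ };
// bool ok = pl_tex_upload(gpu, pl_tex_transfer_params(
//     .tex = tex,
//     .ptr = pixels,
// ));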

// Returns whether or not a texture is currently "in use". This can either be
// because of a pending read operation, a pending write operation or a pending
// texture export operation. Note that this function's usefulness is extremely
// limited under ordinary circumstances. In practically all cases, textures do
// not need to be directly synchronized by the user, except when interfacing
// with external libraries. This function should NOT, however, be used as a
// crutch to avoid having to implement semaphore-based synchronization. Use
// the API-specific functions such as `pl_vulkan_hold/release` for that.
//
// A good example of a use case in which this function is required is when
// interoperating with external memory management that needs to know when an
// imported texture is safe to free / reclaim internally, in which case
// semaphores are insufficient because memory management is a host operation.
//
// The `timeout`, specified in nanoseconds, indicates how long to block for
// before returning. If set to 0, this function will never block, and only
// returns the current status of the texture. The actual precision of the
// timeout may be significantly longer than one nanosecond, and has no upper
// bound. This function does not provide hard latency guarantees. This function
// may also return at any time, even if the texture is still in use. If the
// user wishes to block until the texture is definitely no longer in use, the
// recommended usage is:
//
// while (pl_tex_poll(gpu, tex, UINT64_MAX))
//      ; // do nothing
//
// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly
// synchronized, meaning it can safely be called on a `pl_tex` that is in use
// by another thread.
PL_API bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout);

// Data type of a shader input variable (e.g. uniform, or UBO member)
enum pl_var_type {
    PL_VAR_INVALID = 0,
    PL_VAR_SINT,        // C: int           GLSL: int/ivec
    PL_VAR_UINT,        // C: unsigned int  GLSL: uint/uvec
    PL_VAR_FLOAT,       // C: float         GLSL: float/vec/mat
    PL_VAR_TYPE_COUNT
};

// Returns the host size (in bytes) of a pl_var_type.
PL_API size_t pl_var_type_size(enum pl_var_type type);

// Represents a shader input variable (concrete data, e.g. vector, matrix)
struct pl_var {
    const char *name;       // name as used in the shader
    enum pl_var_type type;
    // The total number of values is given by dim_v * dim_m. For example, a
    // vec2 would have dim_v = 2 and dim_m = 1. A mat3x4 would have dim_v = 4
    // and dim_m = 3.
    int dim_v;              // vector dimension
    int dim_m;              // matrix dimension (number of columns, see below)
    int dim_a;              // array dimension
};

// Helper functions for constructing the most common pl_vars, with names
// corresponding to their respective GLSL built-in types.
PL_API struct pl_var pl_var_float(const char *name);
PL_API struct pl_var pl_var_vec2(const char *name);
PL_API struct pl_var pl_var_vec3(const char *name);
PL_API struct pl_var pl_var_vec4(const char *name);
PL_API struct pl_var pl_var_mat2(const char *name);
PL_API struct pl_var pl_var_mat2x3(const char *name);
PL_API struct pl_var pl_var_mat2x4(const char *name);
PL_API struct pl_var pl_var_mat3(const char *name);
PL_API struct pl_var pl_var_mat3x4(const char *name);
PL_API struct pl_var pl_var_mat4x2(const char *name);
PL_API struct pl_var pl_var_mat4x3(const char *name);
PL_API struct pl_var pl_var_mat4(const char *name);
PL_API struct pl_var pl_var_int(const char *name);
PL_API struct pl_var pl_var_ivec2(const char *name);
PL_API struct pl_var pl_var_ivec3(const char *name);
PL_API struct pl_var pl_var_ivec4(const char *name);
PL_API struct pl_var pl_var_uint(const char *name);
PL_API struct pl_var pl_var_uvec2(const char *name);
PL_API struct pl_var pl_var_uvec3(const char *name);
PL_API struct pl_var pl_var_uvec4(const char *name);

struct pl_named_var {
    const char *glsl_name;
    struct pl_var var;
};

// The same list as above, tagged by name and terminated with a {0} entry.
PL_API extern const struct pl_named_var pl_var_glsl_types[];

// Efficient helper function for performing a lookup in the above array.
// Returns NULL if the variable is not legal. Note that the array dimension is
// ignored, since it's usually part of the variable name and not the type name.
PL_API const char *pl_var_glsl_type_name(struct pl_var var);

// Converts a pl_fmt to an "equivalent" pl_var. Equivalent in this sense means
// that the pl_var's type will be the same as the vertex's sampled type (e.g.
// PL_FMT_UNORM gets turned into PL_VAR_FLOAT).
PL_API struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name);

// Describes the memory layout of a variable, relative to some starting location
// (typically the offset within a uniform/storage/pushconstant buffer)
//
// Note on matrices: All GPUs expect column major matrices, for both buffers and
// input variables. Care needs to be taken to avoid trying to use e.g. a
// pl_matrix3x3 (which is row major) directly as a pl_var_update.data!
//
// In terms of the host layout, a column-major matrix (e.g. matCxR) with C
// columns and R rows is treated like an array vecR[C]. The `stride` here refers
// to the separation between these array elements, i.e. the separation between
// the individual columns.
//
// Visualization of a mat4x3:
//
//       0   1   2   3  <- columns
// 0  [ (A) (D) (G) (J) ]
// 1  [ (B) (E) (H) (K) ]
// 2  [ (C) (F) (I) (L) ]
// ^ rows
//
// Layout in GPU memory: (stride=16, size=60)
//
// [ A B C ] X <- column 0, offset +0
// [ D E F ] X <- column 1, offset +16
// [ G H I ] X <- column 2, offset +32
// [ J K L ]   <- column 3, offset +48
//
// Note the lack of padding on the last column in this example.
// In general: size <= stride * dim_m
//
// C representation: (stride=12, size=48)
//
// { { A, B, C },
//   { D, E, F },
//   { G, H, I },
//   { J, K, L } }
//
// Note on arrays: `stride` represents both the stride between elements of a
// matrix, and the stride between elements of an array. That is, there is no
// distinction between the columns of a matrix and the rows of an array. For
// example, a mat2[10] and a vec2[20] share the same pl_var_layout - the stride
// would be sizeof(vec2) and the size would be sizeof(vec2) * 2 * 10.
//
// For non-array/matrix types, `stride` is equal to `size`.

struct pl_var_layout {
    size_t offset; // the starting offset of the first byte
    size_t stride; // the delta between two elements of an array/matrix
    size_t size;   // the total size of the input
};

// Returns the host layout of an input variable as required for a
// tightly-packed, byte-aligned C data type, given a starting offset.
PL_API struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var);

// Returns the GLSL std140 layout of an input variable given a current buffer
// offset, as required for a buffer descriptor of type PL_DESC_BUF_UNIFORM
//
// The normal way to use this function is when calculating the size and offset
// requirements of a uniform buffer in an incremental fashion, to calculate the
// new offset of the next variable in this buffer.
PL_API struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var);
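
// Example (illustrative sketch): compute the std140 offsets of two variables
// in declaration order, and the resulting uniform buffer size:
//
// struct pl_var color = pl_var_vec4("color"), mat = pl_var_mat3("mat");
// struct pl_var_layout l0 = pl_std140_layout(0, &color);
// struct pl_var_layout l1 = pl_std140_layout(l0.offset + l0.size, &mat);
// size_t ubo_size = l1.offset + l1.size;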

// Returns the GLSL std430 layout of an input variable given a current buffer
// offset, as required for a buffer descriptor of type PL_DESC_BUF_STORAGE, and
// for push constants.
PL_API struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var);

// Convenience definitions / friendly names for these
#define pl_buf_uniform_layout pl_std140_layout
#define pl_buf_storage_layout pl_std430_layout
#define pl_push_constant_layout pl_std430_layout

// Like memcpy, but copies bytes from `src` to `dst` in a manner governed by
// the stride and size of `dst_layout` as well as `src_layout`. Also takes
// into account the respective `offset`.
PL_API void memcpy_layout(void *dst, struct pl_var_layout dst_layout,
                          const void *src, struct pl_var_layout src_layout);
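
// For example, to upload a (row major) pl_matrix3x3 into the std140 layout of
// a mat3 uniform, the data can first be transposed into a column-major
// temporary and then copied with the correct strides. A rough sketch, assuming
// `mat` is the pl_matrix3x3 and `buf_data` points to the start of the buffer:
//
//   float colmajor[3][3];
//   for (int r = 0; r < 3; r++)
//       for (int c = 0; c < 3; c++)
//           colmajor[c][r] = mat.m[r][c];
//
//   struct pl_var var = pl_var_mat3("transform");
//   memcpy_layout(buf_data, pl_std140_layout(0, &var),
//                 colmajor, pl_var_host_layout(0, &var));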

// Represents a compile-time constant.
struct pl_constant {
    enum pl_var_type type;  // constant data type
    uint32_t id;            // GLSL `constant_id`
    size_t offset;          // byte offset in `constant_data`
};
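
// For example, a single `layout(constant_id = 0) const uint` constant with an
// initial value of 16 could be described as follows (sketch; the array and
// the data pointer are later referenced from pl_pass_params):
//
//   uint32_t constant_data = 16;
//   struct pl_constant constants[] = {
//       { .type = PL_VAR_UINT, .id = 0, .offset = 0 },
//   };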

// Represents a vertex attribute.
struct pl_vertex_attrib {
    const char *name;   // name as used in the shader
    pl_fmt fmt;         // data format (must have PL_FMT_CAP_VERTEX)
    size_t offset;      // byte offset into the vertex struct
    int location;       // vertex location (as used in the shader)
};
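
// For example, attributes for a vertex struct containing a 2D position and an
// RGBA color might be set up as follows (sketch; in real code the format
// lookups can fail and should be checked):
//
//   struct vertex { float pos[2]; float color[4]; };
//
//   struct pl_vertex_attrib attribs[] = {
//       {
//           .name     = "pos",
//           .fmt      = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
//           .offset   = offsetof(struct vertex, pos),
//           .location = 0,
//       }, {
//           .name     = "color",
//           .fmt      = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 4),
//           .offset   = offsetof(struct vertex, color),
//           .location = 1,
//       },
//   };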

// Returns an abstract namespace index for a given descriptor type. This will
// always be a value >= 0 and < PL_DESC_TYPE_COUNT. Implementations can use
// this to figure out which descriptors may share the same value of `binding`.
// Bindings must only be unique for all descriptors within the same namespace.
PL_API int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type);
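
// For example, a (hypothetical) helper that assigns the next free binding to
// each new descriptor `desc` only needs one counter per namespace:
//
//   int next_binding[PL_DESC_TYPE_COUNT] = {0};
//   desc.binding = next_binding[pl_desc_namespace(gpu, desc.type)]++;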

// Access mode of a shader input descriptor.
enum pl_desc_access {
    PL_DESC_ACCESS_READWRITE,
    PL_DESC_ACCESS_READONLY,
    PL_DESC_ACCESS_WRITEONLY,
    PL_DESC_ACCESS_COUNT,
};

// Returns the GLSL syntax for a given access mode (e.g. "readonly").
PL_API const char *pl_desc_access_glsl_name(enum pl_desc_access mode);

// Represents a shader descriptor (e.g. texture or buffer binding)
struct pl_desc {
    const char *name;       // name as used in the shader
    enum pl_desc_type type;

    // The binding of this descriptor, as used in the shader. All bindings
    // within a namespace must be unique. (see: pl_desc_namespace)
    int binding;

    // For storage images and storage buffers, this can be used to restrict
    // the type of access that may be performed on the descriptor. Ignored for
    // the other descriptor types (uniform buffers and sampled textures are
    // always read-only).
    enum pl_desc_access access;
};

// Framebuffer blending mode (for raster passes)
enum pl_blend_mode {
    PL_BLEND_ZERO,
    PL_BLEND_ONE,
    PL_BLEND_SRC_ALPHA,
    PL_BLEND_ONE_MINUS_SRC_ALPHA,
    PL_BLEND_MODE_COUNT,
};

struct pl_blend_params {
    enum pl_blend_mode src_rgb;
    enum pl_blend_mode dst_rgb;
    enum pl_blend_mode src_alpha;
    enum pl_blend_mode dst_alpha;
};

#define pl_blend_params(...) (&(struct pl_blend_params) { __VA_ARGS__ })

// Typical alpha compositing
PL_API extern const struct pl_blend_params pl_alpha_overlay;
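
// For typical alpha compositing, `&pl_alpha_overlay` can be used directly. A
// custom setup can be built with the convenience macro instead, e.g. (sketch):
//
//   const struct pl_blend_params *blend = pl_blend_params(
//       .src_rgb   = PL_BLEND_SRC_ALPHA,
//       .dst_rgb   = PL_BLEND_ONE_MINUS_SRC_ALPHA,
//       .src_alpha = PL_BLEND_ONE,
//       .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA,
//   );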

enum pl_prim_type {
    PL_PRIM_TRIANGLE_LIST,
    PL_PRIM_TRIANGLE_STRIP,
    PL_PRIM_TYPE_COUNT,
};

enum pl_index_format {
    PL_INDEX_UINT16 = 0,
    PL_INDEX_UINT32,
    PL_INDEX_FORMAT_COUNT,
};

enum pl_pass_type {
    PL_PASS_INVALID = 0,
    PL_PASS_RASTER,  // vertex+fragment shader
    PL_PASS_COMPUTE, // compute shader (requires `pl_gpu.glsl.compute`)
    PL_PASS_TYPE_COUNT,
};

// Description of a rendering pass. It conflates the following:
//  - GLSL shader(s) and its list of inputs
//  - target parameters (for raster passes)
struct pl_pass_params {
    enum pl_pass_type type;

    // Input variables.
    struct pl_var *variables;
    int num_variables;

    // Input descriptors.
    struct pl_desc *descriptors;
    int num_descriptors;

    // Compile-time specialization constants.
    struct pl_constant *constants;
    int num_constants;

    // Initial data for the specialization constants. Optional. If NULL,
    // specialization constants receive the values from the shader text.
    void *constant_data;

    // Push constant region. Must be a multiple of 4 and <= limits.max_pushc_size
    size_t push_constants_size;

    // The shader text in GLSL. For PL_PASS_RASTER, this is interpreted
    // as a fragment shader. For PL_PASS_COMPUTE, this is interpreted as
    // a compute shader.
    const char *glsl_shader;

    // --- type==PL_PASS_RASTER only

    // Describes the interpretation and layout of the vertex data.
    enum pl_prim_type vertex_type;
    struct pl_vertex_attrib *vertex_attribs;
    int num_vertex_attribs;
    size_t vertex_stride; // must be a multiple of limits.align_vertex_stride

    // The vertex shader itself.
    const char *vertex_shader;

    // Target format. The format must support PL_FMT_CAP_RENDERABLE. The
    // resulting pass may only be used on textures that have a format with a
    // `pl_fmt.signature` compatible to this format.
    pl_fmt target_format;

    // Target blending mode. If this is NULL, blending is disabled. Otherwise,
    // the `target_format` must also support PL_FMT_CAP_BLENDABLE.
    const struct pl_blend_params *blend_params;

    // If false, the target's existing contents will be discarded before the
    // pass is run. (Semantically equivalent to calling pl_tex_invalidate
    // before every pl_pass_run, but slightly more efficient)
    //
    // Specifying `blend_params` requires `load_target` to be true.
    bool load_target;

    // --- Deprecated / removed fields.
    PL_DEPRECATED const uint8_t *cached_program; // Non-functional
    PL_DEPRECATED size_t cached_program_len;
};

#define pl_pass_params(...) (&(struct pl_pass_params) { __VA_ARGS__ })

// Conflates the following typical GPU API concepts:
// - various kinds of shaders
// - rendering pipelines
// - descriptor sets, uniforms, other bindings
// - all synchronization necessary
// - the current values of all inputs
//
// Thread-safety: Unsafe
typedef const struct pl_pass_t {
    struct pl_pass_params params;
} *pl_pass;

// Compile a shader and create a render pass. This is a rare/expensive
// operation and may take a significant amount of time, even if a cached
// program is used. Returns NULL on failure.
PL_API pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params);
PL_API void pl_pass_destroy(pl_gpu gpu, pl_pass *pass);
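
// A heavily abbreviated sketch of creating a simple raster pass; the GLSL
// sources (`vert_glsl`, `frag_glsl`), the `attribs` array, the `struct vertex`
// and the target format `target_fmt` are assumed to be defined elsewhere:
//
//   pl_pass pass = pl_pass_create(gpu, pl_pass_params(
//       .type               = PL_PASS_RASTER,
//       .vertex_shader      = vert_glsl,
//       .glsl_shader        = frag_glsl,
//       .vertex_type        = PL_PRIM_TRIANGLE_STRIP,
//       .vertex_attribs     = attribs,
//       .num_vertex_attribs = 2,
//       .vertex_stride      = sizeof(struct vertex),
//       .target_format      = target_fmt,
//   ));
//
//   if (!pass) { /* handle the error */ }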

struct pl_desc_binding {
    const void *object; // pl_* object with type corresponding to pl_desc_type

    // For PL_DESC_SAMPLED_TEX, this can be used to configure the sampler.
    enum pl_tex_address_mode address_mode;
    enum pl_tex_sample_mode sample_mode;
};

struct pl_var_update {
    int index;        // index into params.variables[]
    const void *data; // pointer to raw byte data corresponding to pl_var_host_layout()
};

struct pl_pass_run_params {
    pl_pass pass;

    // If present, the shader will be re-specialized with the new constants
    // provided. This is a significantly cheaper operation than recompiling a
    // brand new shader, but should still be avoided if possible.
    //
    // Leaving it as NULL re-uses the existing specialization values. Ignored
    // if the shader has no specialization constants. Guaranteed to be a no-op
    // if the values have not changed since the last invocation.
    void *constant_data;

    // This list only contains descriptors/variables which have changed
    // since the previous invocation. All non-mentioned variables implicitly
    // preserve their state from the last invocation.
    struct pl_var_update *var_updates;
    int num_var_updates;

    // This list contains all descriptors used by this pass. It must
    // always be filled, even if the descriptors haven't changed. The order
    // must match that of pass->params.descriptors
    struct pl_desc_binding *desc_bindings;

    // The push constants for this invocation. This must always be set and
    // fully defined for every invocation if params.push_constants_size > 0.
    void *push_constants;

    // An optional timer to report the approximate runtime of this shader pass
    // invocation to. Note that this is only an approximation, since shaders
    // may overlap their execution times and contend for GPU time.
    pl_timer timer;

    // --- pass->params.type==PL_PASS_RASTER only

    // Target must be a 2D texture, `target->params.renderable` must be true,
    // and `target->params.format->signature` must match the signature provided
    // in `pass->params.target_format`.
    //
    // If the viewport or scissors are left blank, they are inferred from
    // target->params.
    //
    // WARNING: Rendering to a target that is being read from by the same
    // shader is undefined behavior. In general, trying to bind the same
    // resource multiple times to the same shader is undefined behavior.
    pl_tex target;
    pl_rect2d viewport; // screen space viewport (must be normalized)
    pl_rect2d scissors; // target render scissors (must be normalized)

    // Number of vertices to render
    int vertex_count;

    // Vertex data may be provided in one of two forms:
    //
    // 1. Drawing from host memory directly
    const void *vertex_data;
    // 2. Drawing from a vertex buffer (requires `vertex_buf->params.drawable`)
    pl_buf vertex_buf;
    size_t buf_offset;

    // (Optional) Index data may be provided in the form given by `index_fmt`.
    // These will be used for indexed rendering. Similar to vertex data, this
    // can be provided in two forms:
    // 1. From host memory
    const void *index_data;
    enum pl_index_format index_fmt;
    // 2. From an index buffer (requires `index_buf->params.drawable`)
    pl_buf index_buf;
    size_t index_offset;
    // Note: Drawing from an index buffer requires vertex data to also be
    // present in buffer form, i.e. it's forbidden to mix `index_buf` with
    // `vertex_data` (though vice versa is allowed).

    // --- pass->params.type==PL_PASS_COMPUTE only

    // Number of work groups to dispatch per dimension (X/Y/Z). Must be <= the
    // corresponding index of limits.max_dispatch
    int compute_groups[3];
};

#define pl_pass_run_params(...) (&(struct pl_pass_run_params) { __VA_ARGS__ })
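
// Continuing the raster pass sketch from `pl_pass_create` above, a single quad
// could be drawn from host memory like this (`target_tex` is assumed to be a
// renderable 2D texture with a format compatible to `target_fmt`):
//
//   const struct vertex verts[4] = {
//       { .pos = {-1, -1}, .color = {1, 0, 0, 1} },
//       { .pos = { 1, -1}, .color = {0, 1, 0, 1} },
//       { .pos = {-1,  1}, .color = {0, 0, 1, 1} },
//       { .pos = { 1,  1}, .color = {1, 1, 1, 1} },
//   };
//
//   pl_pass_run(gpu, pl_pass_run_params(
//       .pass         = pass,
//       .target       = target_tex,
//       .vertex_count = 4,
//       .vertex_data  = verts,
//   ));
//
// If the pass declares any descriptors, `desc_bindings` must additionally list
// a binding for every one of them, in the order of pass->params.descriptors.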

// Execute a render pass.
PL_API void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params);

// This is semantically a no-op, but it provides a hint that you want to flush
// any partially queued up commands and begin execution. There is normally no
// need to call this, because queued commands will always be implicitly flushed
// whenever necessary to make forward progress on commands like `pl_buf_poll`,
// or when submitting a frame to a swapchain for display. In fact, calling this
// function can negatively impact performance, because some GPUs rely on being
// able to re-order and modify queued commands in order to enable optimizations
// retroactively.
//
// The only time this might be beneficial to call explicitly is if you're doing
// lots of offline processing, i.e. you aren't rendering to a swapchain but to
// textures that you download from again. In that case you should call this
// function after each "work item" to ensure good parallelism between them.
//
// It's worth noting that this function may block if you're over-feeding the
// GPU without waiting for existing results to finish.
PL_API void pl_gpu_flush(pl_gpu gpu);
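
// A rough sketch of the offline processing case described above, where
// `process_item` and `download_results` are application-specific placeholders:
//
//   for (int i = 0; i < num_items; i++) {
//       process_item(gpu, i); // queues up pl_pass_run / pl_tex_* commands
//       pl_gpu_flush(gpu);    // hint: start executing this work item now
//   }
//   download_results(gpu);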

// This is like `pl_gpu_flush` but also blocks until the GPU is fully idle
// before returning. Using this in your rendering loop is strongly discouraged,
// and almost never the right solution. The intended use case is for deinit
// logic, where users may want to force all pending GPU operations to
// finish so they can clean up their state more easily.
//
// After this operation is called, it's guaranteed that all pending buffer
// operations are complete - i.e. `pl_buf_poll` is guaranteed to return false.
// It's also guaranteed that any outstanding timer query results are available.
//
// Note: If you only care about buffer operations, you can accomplish this more
// easily by using `pl_buf_poll` with the timeout set to `UINT64_MAX`. But if
// you have many buffers it may be more convenient to call this function
// instead. The difference is that this function will also affect e.g. renders
// to a `pl_swapchain`.
PL_API void pl_gpu_finish(pl_gpu gpu);
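
// A typical deinit sketch, assuming `bufs` and `texs` are the application's
// own arrays of remaining resources:
//
//   pl_gpu_finish(gpu);
//   for (int i = 0; i < num_bufs; i++)
//       pl_buf_destroy(gpu, &bufs[i]);
//   for (int i = 0; i < num_texs; i++)
//       pl_tex_destroy(gpu, &texs[i]);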

// Returns true if the GPU is considered to be in a "failed" state, which
// during normal operation is typically the result of things like the device
// being lost (due to e.g. power management).
//
// If this returns true, users *should* destroy and recreate the `pl_gpu`,
// including all associated resources, via the appropriate mechanism.
PL_API bool pl_gpu_is_failed(pl_gpu gpu);


// Deprecated objects and functions:

// A generic synchronization object intended for use with an external API. This
// is not required when solely using libplacebo API functions, as all required
// synchronization is done internally. This comes in the form of a pair of
// semaphores - one to synchronize access in each direction.
//
// Thread-safety: Unsafe
typedef const struct pl_sync_t {
    enum pl_handle_type handle_type;

    // This handle is signalled by the `pl_gpu`, and waited on by the user. It
    // fires when it is safe for the user to access the shared resource.
    union pl_handle wait_handle;

    // This handle is signalled by the user, and waited on by the `pl_gpu`. It
    // must fire when the user has finished accessing the shared resource.
    union pl_handle signal_handle;
} *pl_sync;

// Create a synchronization object. Returns NULL on failure.
//
// `handle_type` must be exactly *one* of the bits set in `pl_gpu.export_caps.sync`,
// and indicates which type of handle to generate for sharing this sync object.
//
// Deprecated in favor of API-specific semaphore creation operations such as
// `pl_vulkan_sem_create`.
PL_DEPRECATED PL_API pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type);

// Destroy a `pl_sync`. Note that this invalidates the externally imported
// semaphores. Users should therefore make sure that all operations that
// wait on or signal any of the semaphores have been fully submitted and
// processed by the external API before destroying the `pl_sync`.
//
// Despite this, it's safe to destroy a `pl_sync` if the only pending
// operations that involve it are internal to libplacebo.
PL_DEPRECATED PL_API void pl_sync_destroy(pl_gpu gpu, pl_sync *sync);

// Initiates a texture export operation, allowing a texture to be accessed by
// an external API. Returns whether successful. After this operation
// successfully returns, it is guaranteed that `sync->wait_handle` will
// eventually be signalled. For APIs where this is relevant, the image layout
// should be specified as "general", e.g. `GL_LAYOUT_GENERAL_EXT` for OpenGL.
//
// There is no corresponding "import" operation - the next operation that uses
// a texture will implicitly import the texture. Valid API usage requires that
// the user *must* submit a semaphore signal operation on `sync->signal_handle`
// before doing so. Not doing so is undefined behavior and may very well
// deadlock the calling process and/or the graphics card!
//
// Note that despite this restriction, it is always valid to call
// `pl_tex_destroy`, even if the texture is in an exported state, without
// having to signal the corresponding sync object first.
//
// Deprecated in favor of API-specific synchronization mechanisms such as
// `pl_vulkan_hold/release_ex`.
PL_DEPRECATED PL_API bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync);


PL_API_END

#endif // LIBPLACEBO_GPU_H_