1 files changed, 1464 insertions, 0 deletions
diff --git a/src/include/libplacebo/gpu.h b/src/include/libplacebo/gpu.h
new file mode 100644
index 0000000..a63fdf7
--- /dev/null
+++ b/src/include/libplacebo/gpu.h
@@ -0,0 +1,1464 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_GPU_H_
+#define LIBPLACEBO_GPU_H_
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <libplacebo/common.h>
+#include <libplacebo/cache.h>
+#include <libplacebo/log.h>
+
+PL_API_BEGIN
+
+// Debug tags are not memory managed, and should be compile-time constants
+typedef const char *pl_debug_tag;
+#define PL_DEBUG_TAG (__FILE__ ":" PL_TOSTRING(__LINE__))
+
+// Type of a shader input descriptor.
+enum pl_desc_type {
+    PL_DESC_INVALID = 0,
+    PL_DESC_SAMPLED_TEX,    // C: pl_tex*    GLSL: combined texture sampler
+                            // (`pl_tex->params.sampleable` must be set)
+    PL_DESC_STORAGE_IMG,    // C: pl_tex*    GLSL: storage image
+                            // (`pl_tex->params.storable` must be set)
+    PL_DESC_BUF_UNIFORM,    // C: pl_buf*    GLSL: uniform buffer
+                            // (`pl_buf->params.uniform` must be set)
+    PL_DESC_BUF_STORAGE,    // C: pl_buf*    GLSL: storage buffer
+                            // (`pl_buf->params.storable` must be set)
+    PL_DESC_BUF_TEXEL_UNIFORM,// C: pl_buf*  GLSL: uniform samplerBuffer
+                              // (`pl_buf->params.uniform` and `format` must be set)
+    PL_DESC_BUF_TEXEL_STORAGE,// C: pl_buf*  GLSL: uniform imageBuffer
+                              // (`pl_buf->params.storable` and `format` must be set)
+    PL_DESC_TYPE_COUNT      // number of enumerators above
+};
+
+// This file contains the definition of an API which is designed to abstract
+// away from platform-specific APIs like the various OpenGL variants, Direct3D
+// and Vulkan in a common way. It is a much more limited API than those APIs,
+// since it tries targeting a very small common subset of features that is
+// needed to implement libplacebo's rendering.
+//
+// NOTE: Most, but not all, parameter conditions (phrases such as "must" or
+// "valid usage") are explicitly tested and result in error messages followed
+// by graceful failure. Exceptions are noted where they exist.
+
+// Structure which wraps metadata describing GLSL capabilities.
+struct pl_glsl_version {
+    int version;        // GLSL version (e.g. 450), for #version
+    bool gles;          // GLSL ES semantics (ESSL)
+    bool vulkan;        // GL_KHR_vulkan_glsl semantics
+
+    // Compute shader support and limits. If `compute` is false, then all
+    // of the remaining fields in this section are {0}.
+    bool compute;
+    size_t max_shmem_size;      // maximum compute shader shared memory size
+    uint32_t max_group_threads; // maximum number of local threads per work group
+    uint32_t max_group_size[3]; // maximum work group size per dimension
+
+    // If nonzero, signals availability of shader subgroups. This guarantees
+    // availability of all of the following extensions:
+    // - GL_KHR_shader_subgroup_basic
+    // - GL_KHR_shader_subgroup_vote
+    // - GL_KHR_shader_subgroup_arithmetic
+    // - GL_KHR_shader_subgroup_ballot
+    // - GL_KHR_shader_subgroup_shuffle
+    uint32_t subgroup_size;
+
+    // Miscellaneous shader limits
+    int16_t min_gather_offset;  // minimum `textureGatherOffset` offset
+    int16_t max_gather_offset;  // maximum `textureGatherOffset` offset
+};
+
+// Backwards compatibility alias for `pl_glsl_version`
+#define pl_glsl_desc pl_glsl_version
+
+// Structure defining the physical limits and capabilities of this GPU
+// instance. If a limit is given as 0, that means that feature is unsupported.
+struct pl_gpu_limits {
+    // --- pl_gpu
+    bool thread_safe;           // `pl_gpu` calls are thread-safe
+    bool callbacks;             // supports asynchronous GPU callbacks
+
+    // --- pl_buf
+    size_t max_buf_size;        // maximum size of any buffer, in bytes
+    size_t max_ubo_size;        // maximum size of a `uniform` buffer
+    size_t max_ssbo_size;       // maximum size of a `storable` buffer
+    size_t max_vbo_size;        // maximum size of a `drawable` buffer
+    size_t max_mapped_size;     // maximum size of a `host_mapped` buffer
+    uint64_t max_buffer_texels; // maximum number of texels in a texel buffer
+    bool host_cached;           // if true, PL_BUF_MEM_HOST buffers are cached
+
+    // Required alignment for PL_HANDLE_HOST_PTR imports. This is provided
+    // merely as a hint to the user. If the host pointer being imported is
+    // misaligned, libplacebo will internally round (over-map) the region.
+    size_t align_host_ptr;
+
+    // --- pl_tex
+    uint32_t max_tex_1d_dim;    // maximum width for a 1D texture
+    uint32_t max_tex_2d_dim;    // maximum width/height for a 2D texture (required)
+    uint32_t max_tex_3d_dim;    // maximum width/height/depth for a 3D texture
+    bool blittable_1d_3d;       // supports blittable 1D/3D textures
+    bool buf_transfer;          // supports `pl_tex_transfer_params.buf`
+
+    // These don't represent hard limits but indicate performance hints for
+    // optimal alignment. For best performance, the corresponding field
+    // should be aligned to a multiple of these. Each will always be a power
+    // of two.
+    size_t align_tex_xfer_pitch;    // optimal `pl_tex_transfer_params.row_pitch`
+    size_t align_tex_xfer_offset;   // optimal `pl_tex_transfer_params.buf_offset`
+
+    // --- pl_pass
+    size_t max_variable_comps;  // maximum components passed in variables
+    size_t max_constants;       // maximum `pl_pass_params.num_constants`
+    bool array_size_constants;  // push constants can be used to size arrays
+    size_t max_pushc_size;      // maximum `push_constants_size`
+    size_t align_vertex_stride; // alignment of `pl_pass_params.vertex_stride`
+    uint32_t max_dispatch[3];   // maximum dispatch size per dimension
+
+    // Note: At least one of `max_variable_comps` or `max_ubo_size` is
+    // guaranteed to be nonzero.
+
+    // As a performance hint, the GPU may signal the number of command queues
+    // it has for fragment and compute shaders, respectively. Users may use
+    // this information to decide the appropriate type of shader to dispatch.
+    uint32_t fragment_queues;   // number of queues for fragment shaders
+    uint32_t compute_queues;    // number of queues for compute shaders
+};
+
+// Backwards compatibility aliases for fields of `struct pl_gpu_limits`
+#define max_xfer_size max_buf_size
+#define align_tex_xfer_stride align_tex_xfer_pitch
+
+// Some `pl_gpu` operations allow sharing GPU resources with external APIs -
+// examples include interop with other graphics APIs such as CUDA, and also
+// various hardware decoding APIs. This defines the mechanism underpinning the
+// communication of such an interoperation.
+typedef uint64_t pl_handle_caps; // bitmask of `enum pl_handle_type` values
+enum pl_handle_type {
+    PL_HANDLE_FD        = (1 << 0), // `int fd` for POSIX-style APIs
+    PL_HANDLE_WIN32     = (1 << 1), // `HANDLE` for win32 API
+    PL_HANDLE_WIN32_KMT = (1 << 2), // `HANDLE` for pre-Windows-8 win32 API
+    PL_HANDLE_DMA_BUF   = (1 << 3), // `int fd` for a dma_buf fd
+    PL_HANDLE_HOST_PTR  = (1 << 4), // `void *` for a host-allocated pointer
+    PL_HANDLE_MTL_TEX   = (1 << 5), // `MTLTexture*` for Apple platforms
+    PL_HANDLE_IOSURFACE = (1 << 6), // `IOSurfaceRef` for Apple platforms
+};
+
+struct pl_gpu_handle_caps { // each field is a mask of `enum pl_handle_type`
+    pl_handle_caps tex;  // supported handles for `pl_tex` + `pl_shared_mem`
+    pl_handle_caps buf;  // supported handles for `pl_buf` + `pl_shared_mem`
+    pl_handle_caps sync; // supported handles for `pl_sync` / semaphores
+};
+
+// Wrapper for the handle used to communicate a shared resource externally.
+// This handle is owned by the `pl_gpu` - if a user wishes to use it in a way
+// that takes over ownership (e.g. importing into some APIs), they must clone
+// the handle before doing so (e.g. using `dup` for fds). It is important to
+// read the external API documentation _very_ carefully as different handle
+// types may be managed in different ways. (e.g. CUDA takes ownership of an fd,
+// but does not take ownership of a win32 handle.)
+union pl_handle {
+    int fd;         // PL_HANDLE_FD / PL_HANDLE_DMA_BUF
+    void *handle;   // PL_HANDLE_WIN32 / PL_HANDLE_WIN32_KMT / PL_HANDLE_MTL_TEX / PL_HANDLE_IOSURFACE
+    void *ptr;      // PL_HANDLE_HOST_PTR
+};
+
+// Structure encapsulating memory that is shared between libplacebo and the
+// user. This memory can be imported into external APIs using the handle.
+//
+// If the object a `pl_shared_mem` belongs to is destroyed (e.g. via
+// `pl_buf_destroy`), the handle becomes undefined, as do the contents of the
+// memory it points to, as well as any external API objects imported from it.
+struct pl_shared_mem {
+    union pl_handle handle; // valid member depends on the handle type in use
+    size_t size;   // the total size of the memory referenced by this handle
+    size_t offset; // the offset of the object within the referenced memory
+
+    // Note: `size` is optional for some APIs and handle types, in particular
+    // when importing DMABUFs or D3D11 textures.
+
+    // For PL_HANDLE_DMA_BUF, this specifies the DRM format modifier that
+    // describes this resource. Note that when importing `pl_buf`, this must
+    // be DRM_FORMAT_MOD_LINEAR. For importing `pl_tex`, it can be any
+    // format modifier supported by the implementation.
+    uint64_t drm_format_mod;
+
+    // When importing a `pl_tex` of type PL_HANDLE_DMA_BUF, these can be used
+    // to set the image stride (AKA pitch) in memory. If left as 0, they
+    // default to the image width/height, respectively.
+    size_t stride_w;
+    size_t stride_h;
+
+    // When importing a `pl_tex` of type PL_HANDLE_MTL_TEX, this determines
+    // which plane is imported (0 - 2).
+    unsigned plane;
+};
+
+// Structure grouping the PCI bus address fields of a GPU device
+struct pl_gpu_pci_address {
+    uint32_t domain;
+    uint32_t bus;
+    uint32_t device;
+    uint32_t function;
+};
+
+typedef const struct pl_fmt_t *pl_fmt; // pointer to a const `struct pl_fmt_t`
+
+// Abstract device context which wraps an underlying graphics context and can
+// be used to dispatch rendering commands.
+//
+// Thread-safety: Depends on `pl_gpu_limits.thread_safe`
+typedef const struct pl_gpu_t {
+    pl_log log;
+
+    struct pl_glsl_version glsl; // GLSL features supported by this GPU
+    struct pl_gpu_limits limits; // physical device limits and capabilities
+
+    // Fields relevant to external API interop. If the underlying device does
+    // not support interop with other APIs, these will all be {0}.
+    struct pl_gpu_handle_caps export_caps; // supported handles for exporting
+    struct pl_gpu_handle_caps import_caps; // supported handles for importing
+    uint8_t uuid[16];                      // underlying device UUID
+
+    // Supported texture formats, in preference order. (If there are multiple
+    // similar formats, the "better" ones come first)
+    pl_fmt *formats;
+    int num_formats;             // number of entries in `formats`
+
+    // PCI bus address of the underlying device, to help with interop.
+    // This will only be filled in if interop is supported.
+    struct pl_gpu_pci_address pci;
+} *pl_gpu; // note: `pl_gpu` is a pointer to a const struct
+
+// Attach a `pl_cache` object to this GPU instance. This cache will be
+// used to cache all compiled shaders, as well as several other shader objects
+// (e.g. cached 3DLUTs). Calling this with `cache = NULL` disables the cache.
+//
+// Note: Calling this after shaders have already been compiled will not
+// retroactively add those shaders to the cache, so it's recommended to set
+// this early, before creating any passes.
+PL_API void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache);
+
+enum pl_fmt_type {
+    PL_FMT_UNKNOWN = 0, // also used for inconsistent multi-component formats
+    PL_FMT_UNORM,       // unsigned, normalized integer format (sampled as float)
+    PL_FMT_SNORM,       // signed, normalized integer format (sampled as float)
+    PL_FMT_UINT,        // unsigned integer format (sampled as integer)
+    PL_FMT_SINT,        // signed integer format (sampled as integer)
+    PL_FMT_FLOAT,       // (signed) float formats, any bit size
+    PL_FMT_TYPE_COUNT,  // number of enumerators above
+};
+
+enum pl_fmt_caps {
+    PL_FMT_CAP_SAMPLEABLE    = 1 << 0,  // may be sampled from (PL_DESC_SAMPLED_TEX)
+    PL_FMT_CAP_STORABLE      = 1 << 1,  // may be used as storage image (PL_DESC_STORAGE_IMG)
+    PL_FMT_CAP_LINEAR        = 1 << 2,  // may be linearly sampled from (PL_TEX_SAMPLE_LINEAR)
+    PL_FMT_CAP_RENDERABLE    = 1 << 3,  // may be rendered to (pl_pass_params.target_fmt)
+    PL_FMT_CAP_BLENDABLE     = 1 << 4,  // may be blended to (pl_pass_params.enable_blend)
+    PL_FMT_CAP_BLITTABLE     = 1 << 5,  // may be blitted from/to (pl_tex_blit)
+    PL_FMT_CAP_VERTEX        = 1 << 6,  // may be used as a vertex attribute
+    PL_FMT_CAP_TEXEL_UNIFORM = 1 << 7,  // may be used as a texel uniform buffer
+    PL_FMT_CAP_TEXEL_STORAGE = 1 << 8,  // may be used as a texel storage buffer
+    PL_FMT_CAP_HOST_READABLE = 1 << 9,  // may be used with `host_readable` textures
+    PL_FMT_CAP_READWRITE     = 1 << 10, // may be used with PL_DESC_ACCESS_READWRITE
+
+    // Notes:
+    // - PL_FMT_CAP_LINEAR also implies PL_FMT_CAP_SAMPLEABLE
+    // - PL_FMT_CAP_STORABLE also implies `pl_gpu.glsl.compute`
+    // - PL_FMT_CAP_BLENDABLE implies PL_FMT_CAP_RENDERABLE
+    // - PL_FMT_CAP_VERTEX implies that the format is non-opaque
+    // - PL_FMT_CAP_HOST_READABLE implies that the format is non-opaque
+};
+
+struct pl_fmt_plane { // one sub-plane of a planar format (`pl_fmt_t.planes`)
+    // Underlying format of this particular sub-plane. This describes the
+    // components, texel size and host representation for the purpose of
+    // e.g. transfers, blits, and sampling.
+    pl_fmt format;
+
+    // X/Y subsampling shift factor for this plane.
+    uint8_t shift_x, shift_y;
+};
+
+// Structure describing a texel/vertex format.
+struct pl_fmt_t {
+    const char *name;       // symbolic name for this format (e.g. rgba32f)
+    uint64_t signature;     // unique but stable signature (for pass reusability)
+
+    enum pl_fmt_type type;  // the format's data type and interpretation
+    enum pl_fmt_caps caps;  // the features supported by this format (bitmask)
+    int num_components;     // number of components for this format
+    int component_depth[4]; // meaningful bits per component, texture precision
+    size_t internal_size;   // internal texel size (for blit compatibility)
+
+    // For planar formats, this provides a description of each sub-plane.
+    //
+    // Note on planar formats: Planar formats are always opaque and typically
+    // support only a limited subset of capabilities (or none at all). Access
+    // should be done via sub-planes. (See `pl_tex.planes`)
+    struct pl_fmt_plane planes[4];
+    int num_planes;         // or 0 for non-planar textures
+
+    // This controls the relationship between the data as seen by the host and
+    // the way it's interpreted by the texture. The host representation is
+    // always tightly packed (no padding bits in between each component).
+    //
+    // This representation assumes little endian ordering, i.e. components
+    // being ordered from LSB to MSB in memory. Note that for oddly packed
+    // formats like rgb10a2 or rgb565, this is inconsistent with the naming.
+    // (That is to say, rgb565 has sample order {2, 1, 0} under this convention
+    // - because rgb565 treats the R channel as the *most* significant bits)
+    //
+    // If `opaque` is true, then there's no meaningful correspondence between
+    // the two, and all of the remaining fields in this section are unset.
+    //
+    // If `emulated` is true, then this format doesn't actually exist on the
+    // GPU as an uploadable texture format - and any apparent support is being
+    // emulated (typically using compute shaders in the upload path).
+    bool opaque;
+    bool emulated;
+    size_t texel_size;      // total size in bytes per texel
+    size_t texel_align;     // texel alignment requirements (bytes)
+    int host_bits[4];       // number of meaningful bits in host memory
+    int sample_order[4];    // sampled index for each component, e.g.
+                            // {2, 1, 0, 3} for BGRA textures
+
+    // For sampleable formats, this bool indicates whether or not the format
+    // is compatible with `textureGather()`
+    bool gatherable;
+
+    // If usable as a vertex or texel buffer format, this gives the GLSL type
+    // corresponding to the data. (e.g. vec4)
+    const char *glsl_type;
+
+    // If usable as a storage image or texel storage buffer
+    // (PL_FMT_CAP_STORABLE / PL_FMT_CAP_TEXEL_STORAGE), this gives the GLSL
+    // texel format corresponding to the format (e.g. rgba16ui), if any. This
+    // field may be NULL, in which case the format modifier may be left
+    // unspecified.
+    const char *glsl_format;
+
+    // If available, this gives the fourcc associated with the host
+    // representation. In particular, this is intended for use with
+    // PL_HANDLE_DMA_BUF, where this field will match the DRM format from
+    // <drm_fourcc.h>. May be 0, for formats without matching DRM fourcc.
+    uint32_t fourcc;
+
+    // If `fourcc` is set, this contains the list of supported DRM format
+    // modifiers for this format.
+    const uint64_t *modifiers;
+    int num_modifiers;      // number of entries in `modifiers`
+};
+
+// Returns whether or not a `pl_fmt`'s components are ordered sequentially
+// in memory in the order RGBA.
+PL_API bool pl_fmt_is_ordered(pl_fmt fmt);
+
+// Returns whether or not a `pl_fmt` is sampled as a float (e.g. UNORM)
+PL_API bool pl_fmt_is_float(pl_fmt fmt);
+
+// Returns whether or not a `pl_fmt` supports a given DRM format modifier.
+PL_API bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier);
+
+// Helper function to find a format with a given number of components and
+// minimum effective precision per component. If `host_bits` is set, then the
+// format will always be non-opaque, unpadded, ordered and have exactly this
+// bit depth for each component. Finally, all `caps` must be supported.
+PL_API pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components,
+                          int min_depth, int host_bits, enum pl_fmt_caps caps);
+
+// Finds a vertex format for a given configuration. The resulting format will
+// have a component depth equal to the sizeof() of the corresponding host type.
+// (e.g. PL_FMT_FLOAT will always have sizeof(float))
+PL_API pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components);
+
+// Find a format based on its name. (cf. `pl_fmt_t.name`)
+PL_API pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name);
+
+// Find a format based on its fourcc. (cf. `pl_fmt_t.fourcc`)
+PL_API pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc);
+
+// A generic 'timer query' object. These can be used to measure an
+// approximation of the GPU execution time of a given operation. Due to the
+// highly asynchronous nature of GPUs, the actual results of any individual
+// timer query may be delayed by quite a bit. As such, users should avoid
+// trying to pair any particular GPU command with any particular timer query
+// result, and only reuse `pl_timer` objects with identical operations. The
+// results of timer queries are guaranteed to be in-order, but individual
+// queries may be dropped, and some operations might not record timer results
+// at all. (For example, if the underlying hardware does not support timer
+// queries for a given operation type)
+//
+// Thread-safety: Unsafe
+typedef struct pl_timer_t *pl_timer;
+
+// Creates a new timer object. This may return NULL, for example if the
+// implementation does not support timers, but since passing NULL to
+// `pl_timer_destroy` and `pl_timer_query` is safe, users generally need not
+// concern themselves with handling this.
+PL_API pl_timer pl_timer_create(pl_gpu gpu);
+PL_API void pl_timer_destroy(pl_gpu gpu, pl_timer *timer);
+
+// Queries any results that have been measured since the last execution of
+// `pl_timer_query`. There may be more than one result, in which case the user
+// should simply call the function again to get the subsequent values. This
+// function returns a value of 0 in the event that there are no more
+// unprocessed results.
+//
+// The results are reported in nanoseconds, but the actual precision of the
+// timestamp queries may be significantly lower.
+//
+// Note: Results do not queue up indefinitely. Generally, the implementation
+// will only keep track of a small, fixed number of results internally. Make
+// sure to include this function as part of your main rendering loop to process
+// all of its results, or older results will be overwritten by newer ones.
+PL_API uint64_t pl_timer_query(pl_gpu gpu, pl_timer timer);
+
+enum pl_buf_mem_type {
+    PL_BUF_MEM_AUTO = 0, // use whatever seems most appropriate
+    PL_BUF_MEM_HOST,     // try allocating from host memory (RAM)
+    PL_BUF_MEM_DEVICE,   // try allocating from device memory (VRAM)
+    PL_BUF_MEM_TYPE_COUNT, // number of enumerators above
+
+    // Note: The host/device distinction only matters for discrete GPUs
+};
+
+// Structure describing a buffer.
+struct pl_buf_params {
+    size_t size;        // size in bytes (must be <= `pl_gpu_limits.max_buf_size`)
+    bool host_writable; // contents may be updated via pl_buf_write()
+    bool host_readable; // contents may be read back via pl_buf_read()
+    bool host_mapped;   // create a persistent, RW mapping (pl_buf.data)
+
+    // May be used as PL_DESC_BUF_UNIFORM or PL_DESC_BUF_TEXEL_UNIFORM.
+    // Requires `size <= pl_gpu_limits.max_ubo_size`
+    bool uniform;
+
+    // May be used as PL_DESC_BUF_STORAGE or PL_DESC_BUF_TEXEL_STORAGE.
+    // Requires `size <= pl_gpu_limits.max_ssbo_size`
+    bool storable;
+
+    // May be used as the source of vertex data for `pl_pass_run`.
+    bool drawable;      // (see also `pl_gpu_limits.max_vbo_size`)
+
+    // Provide a hint for the memory type you want to use when allocating
+    // this buffer's memory.
+    //
+    // Note: Restrictions may apply depending on the usage flags. In
+    // particular, allocating buffers with `uniform` or `storable` enabled from
+    // non-device memory will almost surely fail.
+    enum pl_buf_mem_type memory_type;
+
+    // Setting this to a format with the `PL_FMT_CAP_TEXEL_*` capability allows
+    // this buffer to be used as a `PL_DESC_BUF_TEXEL_*`, when `uniform` and
+    // `storable` are respectively also enabled.
+    pl_fmt format;
+
+    // At most one of `export_handle` and `import_handle` can be set for a
+    // buffer.
+
+    // Setting this indicates that the memory backing this buffer should be
+    // shared with external APIs. If so, this must be exactly *one* of
+    // `pl_gpu.export_caps.buf`.
+    enum pl_handle_type export_handle;
+
+    // Setting this indicates that the memory backing this buffer will be
+    // imported from an external API. If so, this must be exactly *one* of
+    // `pl_gpu.import_caps.buf`.
+    enum pl_handle_type import_handle;
+
+    // If the shared memory is being imported, the import handle must be
+    // specified here. Otherwise, this is ignored.
+    struct pl_shared_mem shared_mem;
+
+    // If non-NULL, the buffer will be created with these contents. Otherwise,
+    // the initial data is undefined. Using this does *not* require setting
+    // `host_writable`.
+    const void *initial_data;
+
+    // Arbitrary user data. libplacebo does not use this at all.
+    void *user_data;
+
+    // Arbitrary identifying tag. Used only for debugging purposes.
+    pl_debug_tag debug_tag;
+};
+
+#define pl_buf_params(...) (&(struct pl_buf_params) {   \
+        .debug_tag = PL_DEBUG_TAG,                      \
+        __VA_ARGS__                                     \
+    })
+
+// A generic buffer, which can be used for multiple purposes (texture transfer,
+// storage buffer, uniform buffer, etc.)
+//
+// Note on efficiency: A pl_buf does not necessarily represent a true "buffer"
+// object on the underlying graphics API. It may also refer to a sub-slice of
+// a larger buffer, depending on the implementation details of the GPU. The
+// bottom line is that users do not need to worry about the efficiency of using
+// many small pl_buf objects. Having many small pl_bufs, even lots of few-byte
+// vertex buffers, is designed to be completely fine.
+//
+// Thread-safety: Unsafe
+typedef const struct pl_buf_t {
+    struct pl_buf_params params;
+    uint8_t *data; // for persistently mapped buffers, points to the first byte
+
+    // If `params.export_handle` or `params.import_handle` is set, this
+    // structure references the shared memory backing this buffer, via the
+    // requested handle type.
+    // While this buffer is not in an "exported" state, the contents of the
+    // memory are undefined. (See: `pl_buf_export`)
+    struct pl_shared_mem shared_mem;
+} *pl_buf;
+
+// Create a buffer. The type of buffer depends on the parameters. The buffer
+// parameters must adhere to the restrictions imposed by `pl_gpu_limits`.
+// Returns NULL on failure.
+//
+// For buffers with shared memory, the buffer is considered to be in an
+// "exported" state by default, and may be used directly by the external API
+// after being created (until the first libplacebo operation on the buffer).
+PL_API pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params);
+PL_API void pl_buf_destroy(pl_gpu gpu, pl_buf *buf);
+
+// This behaves like `pl_buf_create`, but if the buffer already exists and has
+// incompatible parameters, it will get destroyed first. A buffer is considered
+// "compatible" if it has the same buffer type and texel format, a size greater
+// than or equal to the requested size, and it has a superset of the features
+// the user requested. After this operation, the contents of the buffer are
+// undefined.
+//
+// Note: Due to its unpredictability, it's not allowed to use this with
+// `params->initial_data` being set. Similarly, it's not allowed on a buffer
+// with `params->export_handle`, since this may invalidate the corresponding
+// external API's handle. Conversely, it *is* allowed on a buffer with
+// `params->host_mapped`, and the corresponding `buf->data` pointer *may*
+// change as a result of doing so.
+//
+// Note: If the `user_data` alone changes, this does not trigger a buffer
+// recreation. In theory, this can be used to detect when the buffer ended
+// up being recreated.
+PL_API bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params);
+
+// Update the contents of a buffer, starting at a given offset (must be a
+// multiple of 4) and up to a given size, with the contents of `*data`.
+//
+// This function will block until the buffer is no longer in use. Use
+// `pl_buf_poll` to perform non-blocking queries of buffer availability.
+//
+// Note: This function can incur synchronization overhead, so it shouldn't be
+// used in tight loops. If you do need to loop (e.g. to perform a strided
+// write), consider using host-mapped buffers, or fixing the memory in RAM,
+// before calling this function.
+PL_API void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset,
+                         const void *data, size_t size);
+
+// Read back `size` bytes of a buffer, starting at a given offset, storing the
+// data into `*dest`. Returns whether successful.
+//
+// This function will block until the buffer is no longer in use. Use
+// `pl_buf_poll` to perform non-blocking queries of buffer availability.
+PL_API bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset,
+                        void *dest, size_t size);
+
+// Copy `size` bytes from `src` to `dst`, reading from and writing to
+// the respective offsets.
+PL_API void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+                        pl_buf src, size_t src_offset, size_t size);
+
+// Initiates a buffer export, allowing a buffer to be accessed by an external
+// API. Only valid for buffers with `params.export_handle` / `import_handle`.
+// Calling this twice in a row is a harmless no-op. Returns whether successful.
+//
+// There is no corresponding "buffer import" operation, the next libplacebo
+// operation that touches the buffer (e.g. pl_tex_upload, but also pl_buf_write
+// and pl_buf_read) will implicitly import the buffer back to libplacebo. Users
+// must ensure that all pending operations made by the external API are fully
+// completed before using it in libplacebo again. (Otherwise, the behaviour
+// is undefined)
+//
+// Please note that this function returning does not mean the memory is
+// immediately available as such. In general, it will mark a buffer as "in use"
+// in the same way any other buffer operation would, and it is the user's
+// responsibility to wait until `pl_buf_poll` returns false before accessing
+// the memory from the external API.
+//
+// In terms of the access performed by this operation, it is not considered a
+// "read" or "write" and therefore does not technically conflict with reads or
+// writes to the buffer performed by the host (via mapped memory - any use of
+// `pl_buf_read` or `pl_buf_write` would defeat the purpose of the export).
+// However, restrictions made by the external API may apply that prevent this.
+//
+// The recommended use pattern is something like this:
+//
+// while (loop) {
+//    pl_buf buf = get_free_buffer(); // or block on pl_buf_poll
+//    // write to the buffer using the external API
+//    pl_tex_upload(gpu, /* ... buf ... */); // implicitly imports
+//    pl_buf_export(gpu, buf);
+// }
+//
+// i.e. perform an external API operation, then use and immediately export the
+// buffer in libplacebo, and finally wait until `pl_buf_poll` is false before
+// re-using it in the external API. (Or get a new buffer in the meantime)
+PL_API bool pl_buf_export(pl_gpu gpu, pl_buf buf);
+
+// Returns whether or not a buffer is currently "in use". This can either be
+// because of a pending read operation, a pending write operation or a pending
+// buffer export operation. Any access to the buffer by external APIs or via
+// the host pointer (for host-mapped buffers) is forbidden while a buffer is
+// "in use". The only exception to this rule is multiple reads, for example
+// reading from a buffer with `pl_tex_upload` while simultaneously reading from
+// it using mapped memory.
+//
+// The `timeout`, specified in nanoseconds, indicates how long to block for
+// before returning. If set to 0, this function will never block, and only
+// returns the current status of the buffer. The actual precision of the
+// timeout may be significantly coarser than one nanosecond, and has no upper
+// bound. This function does not provide hard latency guarantees. This function
+// may also return at any time, even if the buffer is still in use. If the user
+// wishes to block until the buffer is definitely no longer in use, the
+// recommended usage is:
+//
+// while (pl_buf_poll(gpu, buf, UINT64_MAX))
+//      ; // do nothing
+//
+// Note: libplacebo operations on buffers are always internally synchronized,
+// so this is only needed for host-mapped or externally exported buffers.
+// However, it may be used to do non-blocking queries before calling blocking
+// functions such as `pl_buf_read`.
+//
+// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly
+// synchronized, meaning it can safely be called on a `pl_buf` that is in use
+// by another thread.
+PL_API bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout);
+
+enum pl_tex_sample_mode {
+    PL_TEX_SAMPLE_NEAREST,  // nearest neighbour sampling
+    PL_TEX_SAMPLE_LINEAR,   // linear filtering, requires PL_FMT_CAP_LINEAR
+    PL_TEX_SAMPLE_MODE_COUNT,
+};
+
+enum pl_tex_address_mode {
+    PL_TEX_ADDRESS_CLAMP,  // clamp to the nearest edge texel
+    PL_TEX_ADDRESS_REPEAT, // repeat (tile) the texture
+    PL_TEX_ADDRESS_MIRROR, // repeat (mirror) the texture
+    PL_TEX_ADDRESS_MODE_COUNT,
+};
+
+// Structure describing a texture.
+struct pl_tex_params {
+    int w, h, d;            // physical dimension; unused dimensions must be 0
+    pl_fmt format;
+
+    // The following bools describe what operations can be performed. The
+    // corresponding pl_fmt capability must be set for every enabled
+    // operation type.
+    //
+    // Note: For planar formats, it is also possible to set capabilities only
+    // supported by sub-planes. In this case, the corresponding functionality
+    // will be available for the sub-plane, but not the planar texture itself.
+    bool sampleable;    // usable as a PL_DESC_SAMPLED_TEX
+    bool renderable;    // usable as a render target (pl_pass_run)
+                        // (must only be used with 2D textures)
+    bool storable;      // usable as a storage image (PL_DESC_STORAGE_IMG)
+    bool blit_src;      // usable as a blit source
+    bool blit_dst;      // usable as a blit destination
+    bool host_writable; // may be updated with pl_tex_upload()
+    bool host_readable; // may be fetched with pl_tex_download()
+
+    // Note: For `blit_src`, `blit_dst`, the texture must either be
+    // 2-dimensional or `pl_gpu_limits.blittable_1d_3d` must be set.
+
+    // At most one of `export_handle` and `import_handle` can be set for a
+    // texture.
+
+    // Setting this indicates that the memory backing this texture should be
+    // shared with external APIs. If so, this must be exactly *one* of
+    // `pl_gpu.export_caps.tex`.
+    enum pl_handle_type export_handle;
+
+    // Setting this indicates that the memory backing this texture will be
+    // imported from an external API. If so, this must be exactly *one* of
+    // `pl_gpu.import_caps.tex`. Mutually exclusive with `initial_data`.
+    enum pl_handle_type import_handle;
+
+    // If the shared memory is being imported, the import handle must be
+    // specified here. Otherwise, this is ignored.
+    struct pl_shared_mem shared_mem;
+
+    // If non-NULL, the texture will be created with these contents (tightly
+    // packed). Using this does *not* require setting host_writable. Otherwise,
+    // the initial data is undefined. Mutually exclusive with `import_handle`.
+    const void *initial_data;
+
+    // Arbitrary user data. libplacebo does not use this at all.
+    void *user_data;
+
+    // Arbitrary identifying tag. Used only for debugging purposes.
+    pl_debug_tag debug_tag;
+};
+
+#define pl_tex_params(...) (&(struct pl_tex_params) {   \
+        .debug_tag = PL_DEBUG_TAG,                      \
+        __VA_ARGS__                                     \
+    })
+
+static inline int pl_tex_params_dimension(const struct pl_tex_params params)
+{
+    return !params.d ? (params.h ? 2 : 1) : 3; // d set: 3D, else h set: 2D, else 1D
+}
+
+enum pl_sampler_type {
+    PL_SAMPLER_NORMAL,      // gsampler2D, gsampler3D etc.
+    PL_SAMPLER_RECT,        // gsampler2DRect
+    PL_SAMPLER_EXTERNAL,    // gsamplerExternalOES
+    PL_SAMPLER_TYPE_COUNT,
+};
+
+// Conflates the following typical GPU API concepts:
+// - texture itself
+// - sampler state
+// - staging buffers for texture upload
+// - framebuffer objects
+// - wrappers for swapchain framebuffers
+// - synchronization needed for upload/rendering/etc.
+//
+// Essentially a pl_tex can be anything ranging from a normal texture, a wrapped
+// external/real framebuffer, a framebuffer object + texture pair, a mapped
+// texture (via pl_hwdec), or other sorts of things that can be sampled from
+// and/or rendered to.
+//
+// Thread-safety: Unsafe
+typedef const struct pl_tex_t *pl_tex;
+struct pl_tex_t {
+    struct pl_tex_params params;
+
+    // If `params.format` is a planar format, this contains `pl_tex` handles
+    // encapsulating individual texture planes. Conversely, if this is a
+    // sub-plane of a planar texture, `parent` points to the planar texture.
+    //
+    // Note: Calling `pl_tex_destroy` on sub-planes is undefined behavior.
+    pl_tex planes[4];
+    pl_tex parent;
+
+    // If `params.export_handle` is set, this structure references the shared
+    // memory backing this texture, via the requested handle type.
+    //
+    // While this texture is not in an "exported" state, the contents of the
+    // memory are undefined. (See: `pl_tex_export`)
+    //
+    // Note: Due to vulkan driver limitations, `shared_mem.drm_format_mod` will
+    // currently always be set to DRM_FORMAT_MOD_INVALID. No guarantee can be
+    // made about the cross-driver compatibility of textures exported this way.
+    struct pl_shared_mem shared_mem;
+
+    // If `params.sampleable` is true, this indicates the correct sampler type
+    // to use when sampling from this texture.
+    enum pl_sampler_type sampler_type;
+};
+
+// Create a texture (with undefined contents). Returns NULL on failure. This is
+// assumed to be an expensive/rare operation, and may need to perform memory
+// allocation or framebuffer creation.
+PL_API pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params);
+PL_API void pl_tex_destroy(pl_gpu gpu, pl_tex *tex);
+
+// This works like `pl_tex_create`, but if the texture already exists and has
+// incompatible texture parameters, it will get destroyed first. A texture is
+// considered "compatible" if it has the same texture format and sample/address
+// mode and it supports a superset of the features the user requested.
+//
+// Even if the texture is not recreated, calling this function will still
+// invalidate the contents of the texture. (Note: Because of this,
+// `initial_data` may not be used with `pl_tex_recreate`. Doing so is an error)
+//
+// Note: If the `user_data` alone changes, this does not trigger a texture
+// recreation. In theory, this can be used to detect when the texture ended
+// up being recreated.
+PL_API bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params);
+
+// Invalidates the contents of a texture. After this, the contents are fully
+// undefined.
+PL_API void pl_tex_invalidate(pl_gpu gpu, pl_tex tex);
+
+union pl_clear_color {
+    float f[4];
+    int32_t i[4];
+    uint32_t u[4];
+};
+
+// Clear the dst texture with the given color (rgba). This is functionally
+// identical to a blit operation, which means `dst->params.blit_dst` must be
+// set.
+PL_API void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color);
+
+// Wrapper for `pl_tex_clear_ex` which only works for floating point textures.
+PL_API void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]);
+
+struct pl_tex_blit_params {
+    // The texture to blit from. Must have `params.blit_src` enabled.
+    pl_tex src;
+
+    // The texture to blit to. Must have `params.blit_dst` enabled, and a
+    // format that is loosely compatible with `src`. This essentially means
+    // that they must have the same `internal_size`. Additionally, UINT
+    // textures can only be blitted to other UINT textures, and SINT textures
+    // can only be blitted to other SINT textures.
+    pl_tex dst;
+
+    // The region of the source texture to blit. Must be within the texture
+    // bounds of `src`. May be flipped. (Optional)
+    pl_rect3d src_rc;
+
+    // The region of the destination texture to blit into. Must be within the
+    // texture bounds of `dst`. May be flipped. Areas outside of `dst_rc` in
+    // `dst` are preserved. (Optional)
+    pl_rect3d dst_rc;
+
+    // If `src_rc` and `dst_rc` have different sizes, the texture will be
+    // scaled using the given texture sampling mode.
+    enum pl_tex_sample_mode sample_mode;
+};
+
+#define pl_tex_blit_params(...) (&(struct pl_tex_blit_params) { __VA_ARGS__ })
+
+// Copy a sub-rectangle from one texture to another.
+PL_API void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params);
+
+// Structure describing a texture transfer operation.
+struct pl_tex_transfer_params {
+    // Texture to transfer to/from. Depending on the type of the operation,
+    // this must have params.host_writable (uploads) or params.host_readable
+    // (downloads) set, respectively.
+    pl_tex tex;
+
+    // Note: Superfluous parameters are ignored, i.e. for a 1D texture, the y
+    // and z fields of `rc`, as well as the corresponding pitches, are ignored.
+    // In all other cases, the pitch must be large enough to contain the
+    // corresponding dimension of `rc`, and the `rc` must be normalized and
+    // fully contained within the image dimensions. Missing fields in the `rc`
+    // are inferred from the image size. If unset, the pitch is inferred
+    // from `rc` (that is, it's assumed that the data is tightly packed in the
+    // buffer). Otherwise, `row_pitch` *must* be a multiple of
+    // `tex->params.format->texel_align`, and `depth_pitch` must be a multiple
+    // of `row_pitch`.
+    pl_rect3d rc;       // region of the texture to transfer
+    size_t row_pitch;   // the number of bytes separating image rows
+    size_t depth_pitch; // the number of bytes separating image planes
+
+    // An optional timer to report the approximate duration of the texture
+    // transfer to. Note that this is only an approximation, since the actual
+    // texture transfer may happen entirely in the background (in particular,
+    // for implementations with asynchronous transfer capabilities). It's also
+    // not guaranteed that all GPUs support this.
+    pl_timer timer;
+
+    // An optional callback to fire after the operation completes. If this is
+    // specified, then the operation is performed asynchronously. Note that
+    // transfers to/from buffers are always asynchronous, even without this
+    // field, so it's more useful for `ptr` transfers. (Though it can still be
+    // helpful to avoid having to manually poll buffers all the time)
+    //
+    // When this is *not* specified, uploads from `ptr` are still asynchronous
+    // but require a host memcpy, while downloads to `ptr` are blocking. As
+    // such, it's recommended to always try using asynchronous texture
+    // transfers wherever possible.
+    //
+    // Note: Requires `pl_gpu_limits.callbacks`
+    //
+    // Note: Callbacks are implicitly synchronized, meaning that callbacks are
+    // guaranteed to never execute concurrently with other callbacks. However,
+    // they may execute from any thread that the `pl_gpu` is used on.
+    void (*callback)(void *priv);
+    void *priv; // arbitrary user data
+
+    // For the data source/target of a transfer operation, there are two valid
+    // options:
+    //
+    // 1. Transferring to/from a buffer: (requires `pl_gpu_limits.buf_transfer`)
+    pl_buf buf;         // buffer to use
+    size_t buf_offset;  // offset of data within buffer, should be a
+                        // multiple of `tex->params.format->texel_size`
+    // 2. Transferring to/from host memory directly:
+    void *ptr;          // address of data
+    bool no_import;     // always use memcpy, bypassing host ptr import
+
+    // Note: The contents of the memory region / buffer must exactly match the
+    // texture format; i.e. there is no explicit conversion between formats.
+};
+
+#define pl_tex_transfer_params(...) (&(struct pl_tex_transfer_params) { __VA_ARGS__ })
+
+// Upload data to a texture. Returns whether successful.
+PL_API bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+
+// Download data from a texture. Returns whether successful.
+PL_API bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+
+// Returns whether or not a texture is currently "in use". This can either be
+// because of a pending read operation, a pending write operation or a pending
+// texture export operation. Note that this function's usefulness is extremely
+// limited under ordinary circumstances. In practically all cases, textures do
+// not need to be directly synchronized by the user, except when interfacing
+// with external libraries. This function should NOT, however, be used as a
+// crutch to avoid having to implement semaphore-based synchronization. Use
+// the API-specific functions such as `pl_vulkan_hold/release` for that.
+//
+// A good example of a use case in which this function is required is when
+// interoperating with external memory management that needs to know when an
+// imported texture is safe to free / reclaim internally, in which case
+// semaphores are insufficient because memory management is a host operation.
+//
+// The `timeout`, specified in nanoseconds, indicates how long to block for
+// before returning. If set to 0, this function will never block, and only
+// returns the current status of the texture. The actual precision of the
+// timeout may be significantly longer than one nanosecond, and has no upper
+// bound. This function does not provide hard latency guarantees. This function
+// may also return at any time, even if the texture is still in use. If the
+// user wishes to block until the texture is definitely no longer in use, the
+// recommended usage is:
+//
+// while (pl_tex_poll(gpu, tex, UINT64_MAX))
+//      ; // do nothing
+//
+// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly
+// synchronized, meaning it can safely be called on a `pl_tex` that is in use
+// by another thread.
+PL_API bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout);
+
+// Data type of a shader input variable (e.g. uniform, or UBO member)
+enum pl_var_type {
+    PL_VAR_INVALID = 0,
+    PL_VAR_SINT,        // C: int           GLSL: int/ivec
+    PL_VAR_UINT,        // C: unsigned int  GLSL: uint/uvec
+    PL_VAR_FLOAT,       // C: float         GLSL: float/vec/mat
+    PL_VAR_TYPE_COUNT
+};
+
+// Returns the host size (in bytes) of a pl_var_type.
+PL_API size_t pl_var_type_size(enum pl_var_type type);
+
+// Represents a shader input variable (concrete data, e.g. vector, matrix)
+struct pl_var {
+    const char *name;       // name as used in the shader
+    enum pl_var_type type;
+    // The total number of values is given by dim_v * dim_m. For example, a
+    // vec2 would have dim_v = 2 and dim_m = 1. A mat3x4 would have dim_v = 4
+    // and dim_m = 3.
+    int dim_v;              // vector dimension
+    int dim_m;              // matrix dimension (number of columns, see below)
+    int dim_a;              // array dimension
+};
+
+// Helper functions for constructing the most common pl_vars, named after
+// their corresponding GLSL built-in types.
+PL_API struct pl_var pl_var_float(const char *name);
+PL_API struct pl_var pl_var_vec2(const char *name);
+PL_API struct pl_var pl_var_vec3(const char *name);
+PL_API struct pl_var pl_var_vec4(const char *name);
+PL_API struct pl_var pl_var_mat2(const char *name);
+PL_API struct pl_var pl_var_mat2x3(const char *name);
+PL_API struct pl_var pl_var_mat2x4(const char *name);
+PL_API struct pl_var pl_var_mat3(const char *name);
+PL_API struct pl_var pl_var_mat3x4(const char *name);
+PL_API struct pl_var pl_var_mat4x2(const char *name);
+PL_API struct pl_var pl_var_mat4x3(const char *name);
+PL_API struct pl_var pl_var_mat4(const char *name);
+PL_API struct pl_var pl_var_int(const char *name);
+PL_API struct pl_var pl_var_ivec2(const char *name);
+PL_API struct pl_var pl_var_ivec3(const char *name);
+PL_API struct pl_var pl_var_ivec4(const char *name);
+PL_API struct pl_var pl_var_uint(const char *name);
+PL_API struct pl_var pl_var_uvec2(const char *name);
+PL_API struct pl_var pl_var_uvec3(const char *name);
+PL_API struct pl_var pl_var_uvec4(const char *name);
+
+struct pl_named_var {
+    const char *glsl_name;
+    struct pl_var var;
+};
+
+// The same list as above, tagged by name and terminated with a {0} entry.
+PL_API extern const struct pl_named_var pl_var_glsl_types[];
+
+// Efficient helper function for performing a lookup in the above array.
+// Returns NULL if the variable is not legal. Note that the array dimension is
+// ignored, since it's usually part of the variable name and not the type name.
+PL_API const char *pl_var_glsl_type_name(struct pl_var var);
+
+// Converts a pl_fmt to an "equivalent" pl_var. Equivalent in this sense means
+// that the pl_var's type will be the same as the vertex's sampled type (e.g.
+// PL_FMT_UNORM gets turned into PL_VAR_FLOAT).
+PL_API struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name);
+
+// Describes the memory layout of a variable, relative to some starting location
+// (typically the offset within a uniform/storage/pushconstant buffer)
+//
+// Note on matrices: All GPUs expect column major matrices, for both buffers and
+// input variables. Care needs to be taken to avoid trying to use e.g. a
+// pl_matrix3x3 (which is row major) directly as a pl_var_update.data!
+//
+// In terms of the host layout, a column-major matrix (e.g. matCxR) with C
+// columns and R rows is treated like an array vecR[C]. The `stride` here refers
+// to the separation between these array elements, i.e. the separation between
+// the individual columns.
+//
+// Visualization of a mat4x3:
+//
+//       0   1   2   3  <- columns
+// 0  [ (A) (D) (G) (J) ]
+// 1  [ (B) (E) (H) (K) ]
+// 2  [ (C) (F) (I) (L) ]
+// ^ rows
+//
+// Layout in GPU memory: (stride=16, size=60)
+//
+// [ A B C ] X <- column 0, offset +0
+// [ D E F ] X <- column 1, offset +16
+// [ G H I ] X <- column 2, offset +32
+// [ J K L ]   <- column 3, offset +48
+//
+// Note the lack of padding on the last column in this example.
+// In general: size <= stride * dim_m
+//
+// C representation: (stride=12, size=48)
+//
+// { { A, B, C },
+//   { D, E, F },
+//   { G, H, I },
+//   { J, K, L } }
+//
+// Note on arrays: `stride` represents both the stride between elements of a
+// matrix, and the stride between elements of an array. That is, there is no
+// distinction between the columns of a matrix and the rows of an array. For
+// example, a mat2[10] and a vec2[20] share the same pl_var_layout - the stride
+// would be sizeof(vec2) and the size would be sizeof(vec2) * 2 * 10.
+//
+// For non-array/matrix types, `stride` is equal to `size`.
+
+struct pl_var_layout {
+    size_t offset; // the starting offset of the first byte
+    size_t stride; // the delta between two elements of an array/matrix
+    size_t size;   // the total size of the input
+};
+
+// Returns the host layout of an input variable as required for a
+// tightly-packed, byte-aligned C data type, given a starting offset.
+PL_API struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var);
+
+// Returns the GLSL std140 layout of an input variable given a current buffer
+// offset, as required for a buffer descriptor of type PL_DESC_BUF_UNIFORM
+//
+// The normal way to use this function is when calculating the size and offset
+// requirements of a uniform buffer in an incremental fashion, to calculate the
+// new offset of the next variable in this buffer.
+PL_API struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var);
+
+// Returns the GLSL std430 layout of an input variable given a current buffer
+// offset, as required for a buffer descriptor of type PL_DESC_BUF_STORAGE, and
+// for push constants.
+PL_API struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var);
+
+// Convenience definitions / friendly names for these
+#define pl_buf_uniform_layout pl_std140_layout
+#define pl_buf_storage_layout pl_std430_layout
+#define pl_push_constant_layout pl_std430_layout
+
+// Like memcpy, but copies bytes from `src` to `dst` in a manner governed by
+// the stride and size of `dst_layout` as well as `src_layout`. Also takes
+// into account the respective `offset`.
+PL_API void memcpy_layout(void *dst, struct pl_var_layout dst_layout,
+                          const void *src, struct pl_var_layout src_layout);
+
+// Represents a compile-time constant.
+struct pl_constant {
+    enum pl_var_type type;  // constant data type
+    uint32_t id;            // GLSL `constant_id`
+    size_t offset;          // byte offset in `constant_data`
+};
+
+// Represents a vertex attribute.
+struct pl_vertex_attrib {
+    const char *name;   // name as used in the shader
+    pl_fmt fmt;         // data format (must have PL_FMT_CAP_VERTEX)
+    size_t offset;      // byte offset into the vertex struct
+    int location;       // vertex location (as used in the shader)
+};
+
+// Returns an abstract namespace index for a given descriptor type. This will
+// always be a value >= 0 and < PL_DESC_TYPE_COUNT. Implementations can use
+// this to figure out which descriptors may share the same value of `binding`.
+// Bindings must only be unique for all descriptors within the same namespace.
+PL_API int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type);
+
+// Access mode of a shader input descriptor.
+enum pl_desc_access {
+    PL_DESC_ACCESS_READWRITE,
+    PL_DESC_ACCESS_READONLY,
+    PL_DESC_ACCESS_WRITEONLY,
+    PL_DESC_ACCESS_COUNT,
+};
+
+// Returns the GLSL syntax for a given access mode (e.g. "readonly").
+PL_API const char *pl_desc_access_glsl_name(enum pl_desc_access mode);
+
+// Represents a shader descriptor (e.g. texture or buffer binding)
+struct pl_desc {
+    const char *name;       // name as used in the shader
+    enum pl_desc_type type;
+
+    // The binding of this descriptor, as used in the shader. All bindings
+    // within a namespace must be unique. (see: pl_desc_namespace)
+    int binding;
+
+    // For storage images and storage buffers, this can be used to restrict
+    // the type of access that may be performed on the descriptor. Ignored for
+    // the other descriptor types (uniform buffers and sampled textures are
+    // always read-only).
+    enum pl_desc_access access;
+};
+
+// Framebuffer blending mode (for raster passes)
+enum pl_blend_mode {
+    PL_BLEND_ZERO,
+    PL_BLEND_ONE,
+    PL_BLEND_SRC_ALPHA,
+    PL_BLEND_ONE_MINUS_SRC_ALPHA,
+    PL_BLEND_MODE_COUNT,
+};
+
+struct pl_blend_params {
+    enum pl_blend_mode src_rgb;
+    enum pl_blend_mode dst_rgb;
+    enum pl_blend_mode src_alpha;
+    enum pl_blend_mode dst_alpha;
+};
+
+#define pl_blend_params(...) (&(struct pl_blend_params) { __VA_ARGS__ })
+
+// Typical alpha compositing
+PL_API extern const struct pl_blend_params pl_alpha_overlay;
+
+enum pl_prim_type {
+    PL_PRIM_TRIANGLE_LIST,
+    PL_PRIM_TRIANGLE_STRIP,
+    PL_PRIM_TYPE_COUNT,
+};
+
+enum pl_index_format {
+    PL_INDEX_UINT16 = 0,
+    PL_INDEX_UINT32,
+    PL_INDEX_FORMAT_COUNT,
+};
+
+enum pl_pass_type {
+    PL_PASS_INVALID = 0,
+    PL_PASS_RASTER,  // vertex+fragment shader
+    PL_PASS_COMPUTE, // compute shader (requires `pl_gpu.glsl.compute`)
+    PL_PASS_TYPE_COUNT,
+};
+
+// Description of a rendering pass. It conflates the following:
+//  - GLSL shader(s) and its list of inputs
+//  - target parameters (for raster passes)
+struct pl_pass_params {
+    enum pl_pass_type type;
+
+    // Input variables.
+    struct pl_var *variables;
+    int num_variables;
+
+    // Input descriptors.
+    struct pl_desc *descriptors;
+    int num_descriptors;
+
+    // Compile-time specialization constants.
+    struct pl_constant *constants;
+    int num_constants;
+
+    // Initial data for the specialization constants. Optional. If NULL,
+    // specialization constants receive the values from the shader text.
+    void *constant_data;
+
+    // Push constant region. Must be a multiple of 4 <= limits.max_pushc_size
+    size_t push_constants_size;
+
+    // The shader text in GLSL. For PL_PASS_RASTER, this is interpreted
+    // as a fragment shader. For PL_PASS_COMPUTE, this is interpreted as
+    // a compute shader.
+    const char *glsl_shader;
+
+    // --- type==PL_PASS_RASTER only
+
+    // Describes the interpretation and layout of the vertex data.
+    enum pl_prim_type vertex_type;
+    struct pl_vertex_attrib *vertex_attribs;
+    int num_vertex_attribs;
+    size_t vertex_stride; // must be a multiple of limits.align_vertex_stride
+
+    // The vertex shader itself.
+    const char *vertex_shader;
+
+    // Target format. The format must support PL_FMT_CAP_RENDERABLE. The
+    // resulting pass may only be used on textures that have a format with a
+    // `pl_fmt.signature` compatible to this format.
+    pl_fmt target_format;
+
+    // Target blending mode. If this is NULL, blending is disabled. Otherwise,
+    // the `target_format` must also support PL_FMT_CAP_BLENDABLE.
+    const struct pl_blend_params *blend_params;
+
+    // If false, the target's existing contents will be discarded before the
+    // pass is run. (Semantically equivalent to calling pl_tex_invalidate
+    // before every pl_pass_run, but slightly more efficient)
+    //
+    // Specifying `blend_params` requires `load_target` to be true.
+    bool load_target;
+
+    // --- Deprecated / removed fields.
+    PL_DEPRECATED const uint8_t *cached_program; // Non-functional
+    PL_DEPRECATED size_t cached_program_len;
+};
+
+#define pl_pass_params(...) (&(struct pl_pass_params) { __VA_ARGS__ })
+
+// Conflates the following typical GPU API concepts:
+// - various kinds of shaders
+// - rendering pipelines
+// - descriptor sets, uniforms, other bindings
+// - all synchronization necessary
+// - the current values of all inputs
+//
+// Thread-safety: Unsafe
+typedef const struct pl_pass_t {
+    struct pl_pass_params params;
+} *pl_pass;
+
+// Compile a shader and create a render pass. This is a rare/expensive
+// operation and may take a significant amount of time, even if a cached
+// program is used. Returns NULL on failure.
+PL_API pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params);
+PL_API void pl_pass_destroy(pl_gpu gpu, pl_pass *pass);
+
+struct pl_desc_binding {
+    const void *object; // pl_* object with type corresponding to pl_desc_type
+
+    // For PL_DESC_SAMPLED_TEX, this can be used to configure the sampler.
+    enum pl_tex_address_mode address_mode;
+    enum pl_tex_sample_mode sample_mode;
+};
+
+struct pl_var_update {
+    int index;        // index into params.variables[]
+    const void *data; // pointer to raw byte data corresponding to pl_var_host_layout()
+};
+
+struct pl_pass_run_params {
+    pl_pass pass;
+
+    // If present, the shader will be re-specialized with the new constants
+    // provided. This is a significantly cheaper operation than recompiling a
+    // brand new shader, but should still be avoided if possible.
+    //
+    // Leaving it as NULL re-uses the existing specialization values. Ignored
+    // if the shader has no specialization constants. Guaranteed to be a no-op
+    // if the values have not changed since the last invocation.
+    void *constant_data;
+
+    // This list only contains the variables which have changed since the
+    // previous invocation. All non-mentioned variables implicitly preserve
+    // their state from the last invocation.
+    struct pl_var_update *var_updates;
+    int num_var_updates;
+
+    // This list contains all descriptors used by this pass. It must
+    // always be filled, even if the descriptors haven't changed. The order
+    // must match that of pass->params.descriptors
+    struct pl_desc_binding *desc_bindings;
+
+    // The push constants for this invocation. This must always be set and
+    // fully defined for every invocation if params.push_constants_size > 0.
+    void *push_constants;
+
+    // An optional timer to report the approximate runtime of this shader pass
+    // invocation to. Note that this is only an approximation, since shaders
+    // may overlap their execution times and contend for GPU time.
+    pl_timer timer;
+
+    // --- pass->params.type==PL_PASS_RASTER only
+
+    // Target must be a 2D texture, `target->params.renderable` must be true,
+    // and `target->params.format->signature` must match the signature provided
+    // in `pass->params.target_format`.
+    //
+    // If the viewport or scissors are left blank, they are inferred from
+    // target->params.
+    //
+    // WARNING: Rendering to a `target` that is being read from by the same
+    // shader is undefined behavior. In general, trying to bind the same
+    // resource multiple times to the same shader is undefined behavior.
+    pl_tex target;
+    pl_rect2d viewport; // screen space viewport (must be normalized)
+    pl_rect2d scissors; // target render scissors (must be normalized)
+
+    // Number of vertices to render
+    int vertex_count;
+
+    // Vertex data may be provided in one of two forms:
+    //
+    // 1. Drawing from host memory directly
+    const void *vertex_data;
+    // 2. Drawing from a vertex buffer (requires `vertex_buf->params.drawable`)
+    pl_buf vertex_buf;
+    size_t buf_offset;
+
+    // (Optional) Index data may be provided in the form given by `index_fmt`.
+    // These will be used for indexed rendering. Similar to vertex data, this
+    // can be provided in two forms:
+    // 1. From host memory
+    const void *index_data;
+    enum pl_index_format index_fmt;
+    // 2. From an index buffer (requires `index_buf->params.drawable`)
+    pl_buf index_buf;
+    size_t index_offset;
+    // Note: Drawing from an index buffer requires vertex data to also be
+    // present in buffer form, i.e. it's forbidden to mix `index_buf` with
+    // `vertex_data` (though vice versa is allowed).
+
+    // --- pass->params.type==PL_PASS_COMPUTE only
+
+    // Number of work groups to dispatch per dimension (X/Y/Z). Must be <= the
+    // corresponding index of limits.max_dispatch
+    int compute_groups[3];
+};
+
+#define pl_pass_run_params(...) (&(struct pl_pass_run_params) { __VA_ARGS__ })
+
+// Execute a render pass.
+PL_API void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params);
+
+// This is semantically a no-op, but it provides a hint that you want to flush
+// any partially queued up commands and begin execution. There is normally no
+// need to call this, because queued commands will always be implicitly flushed
+// whenever necessary to make forward progress on commands like `pl_buf_poll`,
+// or when submitting a frame to a swapchain for display. In fact, calling this
+// function can negatively impact performance, because some GPUs rely on being
+// able to re-order and modify queued commands in order to enable optimizations
+// retroactively.
+//
+// The only time this might be beneficial to call explicitly is if you're doing
+// lots of offline processing, i.e. you aren't rendering to a swapchain but to
+// textures that you download from again. In that case you should call this
+// function after each "work item" to ensure good parallelism between them.
+//
+// It's worth noting that this function may block if you're over-feeding the
+// GPU without waiting for existing results to finish.
+PL_API void pl_gpu_flush(pl_gpu gpu);
+
+// This is like `pl_gpu_flush` but also blocks until the GPU is fully idle
+// before returning. Using this in your rendering loop is seriously disadvised,
+// and almost never the right solution. The intended use case is for deinit
+// logic, where users may want to force all pending GPU operations to
+// finish so they can clean up their state more easily.
+//
+// After this operation is called, it's guaranteed that all pending buffer
+// operations are complete - i.e. `pl_buf_poll` is guaranteed to return false.
+// It's also guaranteed that any outstanding timer query results are available.
+//
+// Note: If you only care about buffer operations, you can accomplish this more
+// easily by using `pl_buf_poll` with the timeout set to `UINT64_MAX`. But if
+// you have many buffers it may be more convenient to call this function
+// instead. The difference is that this function will also affect e.g. renders
+// to a `pl_swapchain`.
+PL_API void pl_gpu_finish(pl_gpu gpu);
+
+// Returns true if the GPU is considered to be in a "failed" state. During
+// normal operation, this is typically the result of things like the device
+// being lost (e.g. due to power management).
+//
+// If this returns true, users *should* destroy and recreate the `pl_gpu`,
+// including all associated resources, via the appropriate mechanism.
+PL_API bool pl_gpu_is_failed(pl_gpu gpu);
+
+
+// Deprecated objects and functions:
+
+// A generic synchronization object intended for use with an external API. This
+// is not required when solely using libplacebo API functions, as all required
+// synchronization is done internally. This comes in the form of a pair of
+// semaphores - one to synchronize access in each direction.
+//
+// Thread-safety: Unsafe
+typedef const struct pl_sync_t {
+    // Type of the handles below. NOTE(review): presumably this mirrors the
+    // `handle_type` argument given to `pl_sync_create` - confirm.
+    enum pl_handle_type handle_type;
+
+    // This handle is signalled by the `pl_gpu`, and waited on by the user. It
+    // fires when it is safe for the user to access the shared resource.
+    union pl_handle wait_handle;
+
+    // This handle is signalled by the user, and waited on by the `pl_gpu`. It
+    // must fire when the user has finished accessing the shared resource.
+    union pl_handle signal_handle;
+} *pl_sync;
+
+// Create a synchronization object. Returns NULL on failure. The counterpart
+// for releasing the returned object is `pl_sync_destroy`.
+//
+// `handle_type` must be exactly *one* of `pl_gpu.export_caps.sync`, and
+// indicates which type of handle to generate for sharing this sync object.
+//
+// Deprecated in favor of API-specific semaphore creation operations such as
+// `pl_vulkan_sem_create`.
+PL_DEPRECATED PL_API pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type);
+
+// Destroy a `pl_sync`. Note that this invalidates the externally imported
+// semaphores. Users should therefore make sure that all operations that
+// wait on or signal any of the semaphores have been fully submitted and
+// processed by the external API before destroying the `pl_sync`.
+//
+// Despite this, it's safe to destroy a `pl_sync` if the only pending
+// operations that involve it are internal to libplacebo.
+//
+// Deprecated together with `pl_sync_create`.
+PL_DEPRECATED PL_API void pl_sync_destroy(pl_gpu gpu, pl_sync *sync);
+
+// Initiates a texture export operation, allowing a texture to be accessed by
+// an external API. Returns whether successful. After this operation
+// successfully returns, it is guaranteed that `sync->wait_handle` will
+// eventually be signalled. For APIs where this is relevant, the image layout
+// should be specified as "general", e.g. `GL_LAYOUT_GENERAL_EXT` for OpenGL.
+//
+// There is no corresponding "import" operation - the next operation that uses
+// a texture will implicitly import the texture. Valid API usage requires that
+// the user *must* submit a semaphore signal operation on `sync->signal_handle`
+// before that next use of the texture. Not doing so is undefined behavior and
+// may very well deadlock the calling process and/or the graphics card!
+//
+// Note that despite this restriction, it is always valid to call
+// `pl_tex_destroy`, even if the texture is in an exported state, without
+// having to signal the corresponding sync object first.
+//
+// Deprecated in favor of API-specific synchronization mechanisms such as
+// `pl_vulkan_hold/release_ex`.
+PL_DEPRECATED PL_API bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync);
+
+
+PL_API_END
+
+#endif // LIBPLACEBO_GPU_H_