Diffstat
-rw-r--r--  src/cache.c | 447
-rw-r--r--  src/cache.h | 72
-rw-r--r--  src/colorspace.c | 1609
-rw-r--r--  src/common.c | 500
-rw-r--r--  src/common.h | 191
-rw-r--r--  src/convert.cc | 233
-rw-r--r--  src/d3d11/common.h | 66
-rw-r--r--  src/d3d11/context.c | 488
-rw-r--r--  src/d3d11/formats.c | 293
-rw-r--r--  src/d3d11/formats.h | 36
-rw-r--r--  src/d3d11/gpu.c | 685
-rw-r--r--  src/d3d11/gpu.h | 212
-rw-r--r--  src/d3d11/gpu_buf.c | 310
-rw-r--r--  src/d3d11/gpu_pass.c | 1293
-rw-r--r--  src/d3d11/gpu_tex.c | 745
-rw-r--r--  src/d3d11/meson.build | 41
-rw-r--r--  src/d3d11/stubs.c | 56
-rw-r--r--  src/d3d11/swapchain.c | 667
-rw-r--r--  src/d3d11/utils.c | 500
-rw-r--r--  src/d3d11/utils.h | 88
-rw-r--r--  src/dispatch.c | 1615
-rw-r--r--  src/dispatch.h | 31
-rw-r--r--  src/dither.c | 317
-rw-r--r--  src/dummy.c | 348
-rw-r--r--  src/filters.c | 1015
-rw-r--r--  src/filters.h | 58
-rw-r--r--  src/format.c | 205
-rw-r--r--  src/gamut_mapping.c | 1008
-rw-r--r--  src/glsl/glslang.cc | 121
-rw-r--r--  src/glsl/glslang.h | 57
-rw-r--r--  src/glsl/glslang_resources.c | 132
-rw-r--r--  src/glsl/meson.build | 73
-rw-r--r--  src/glsl/spirv.c | 64
-rw-r--r--  src/glsl/spirv.h | 50
-rw-r--r--  src/glsl/spirv_glslang.c | 112
-rw-r--r--  src/glsl/spirv_shaderc.c | 174
-rw-r--r--  src/glsl/utils.h | 52
-rw-r--r--  src/gpu.c | 1338
-rw-r--r--  src/gpu.h | 207
-rw-r--r--  src/gpu/utils.c | 1288
-rw-r--r--  src/hash.h | 162
-rw-r--r--  src/include/libplacebo/cache.h | 200
-rw-r--r--  src/include/libplacebo/colorspace.h | 719
-rw-r--r--  src/include/libplacebo/common.h | 244
-rw-r--r--  src/include/libplacebo/config.h.in | 102
-rw-r--r--  src/include/libplacebo/d3d11.h | 248
-rw-r--r--  src/include/libplacebo/dispatch.h | 239
-rw-r--r--  src/include/libplacebo/dither.h | 82
-rw-r--r--  src/include/libplacebo/dummy.h | 131
-rw-r--r--  src/include/libplacebo/filters.h | 415
-rw-r--r--  src/include/libplacebo/gamut_mapping.h | 182
-rw-r--r--  src/include/libplacebo/gpu.h | 1464
-rw-r--r--  src/include/libplacebo/log.h | 113
-rw-r--r--  src/include/libplacebo/meson.build | 6
-rw-r--r--  src/include/libplacebo/opengl.h | 230
-rw-r--r--  src/include/libplacebo/options.h | 201
-rw-r--r--  src/include/libplacebo/renderer.h | 847
-rw-r--r--  src/include/libplacebo/shaders.h | 273
-rw-r--r--  src/include/libplacebo/shaders/colorspace.h | 381
-rw-r--r--  src/include/libplacebo/shaders/custom.h | 341
-rw-r--r--  src/include/libplacebo/shaders/deinterlacing.h | 137
-rw-r--r--  src/include/libplacebo/shaders/dithering.h | 140
-rw-r--r--  src/include/libplacebo/shaders/film_grain.h | 137
-rw-r--r--  src/include/libplacebo/shaders/icc.h | 135
-rw-r--r--  src/include/libplacebo/shaders/lut.h | 78
-rw-r--r--  src/include/libplacebo/shaders/sampling.h | 257
-rw-r--r--  src/include/libplacebo/swapchain.h | 171
-rw-r--r--  src/include/libplacebo/tone_mapping.h | 268
-rw-r--r--  src/include/libplacebo/utils/dav1d.h | 129
-rw-r--r--  src/include/libplacebo/utils/dav1d_internal.h | 613
-rw-r--r--  src/include/libplacebo/utils/dolbyvision.h | 34
-rw-r--r--  src/include/libplacebo/utils/frame_queue.h | 230
-rw-r--r--  src/include/libplacebo/utils/libav.h | 284
-rw-r--r--  src/include/libplacebo/utils/libav_internal.h | 1482
-rw-r--r--  src/include/libplacebo/utils/upload.h | 153
-rw-r--r--  src/include/libplacebo/vulkan.h | 638
-rw-r--r--  src/log.c | 471
-rw-r--r--  src/log.h | 84
-rw-r--r--  src/meson.build | 347
-rw-r--r--  src/opengl/common.h | 66
-rw-r--r--  src/opengl/context.c | 332
-rw-r--r--  src/opengl/formats.c | 485
-rw-r--r--  src/opengl/formats.h | 32
-rw-r--r--  src/opengl/gpu.c | 645
-rw-r--r--  src/opengl/gpu.h | 141
-rw-r--r--  src/opengl/gpu_pass.c | 707
-rw-r--r--  src/opengl/gpu_tex.c | 1078
-rw-r--r--  src/opengl/include/glad/meson.build | 29
-rw-r--r--  src/opengl/loader_egl.c | 2
-rw-r--r--  src/opengl/loader_gl.c | 2
-rw-r--r--  src/opengl/meson.build | 76
-rw-r--r--  src/opengl/stubs.c | 63
-rw-r--r--  src/opengl/swapchain.c | 278
-rw-r--r--  src/opengl/utils.c | 158
-rw-r--r--  src/opengl/utils.h | 57
-rw-r--r--  src/options.c | 1166
-rw-r--r--  src/os.h | 30
-rw-r--r--  src/pl_alloc.c | 313
-rw-r--r--  src/pl_alloc.h | 191
-rw-r--r--  src/pl_assert.h | 37
-rw-r--r--  src/pl_clock.h | 98
-rw-r--r--  src/pl_string.c | 418
-rw-r--r--  src/pl_string.h | 318
-rw-r--r--  src/pl_thread.h | 73
-rw-r--r--  src/pl_thread_pthread.h | 137
-rw-r--r--  src/pl_thread_win32.h | 182
-rw-r--r--  src/renderer.c | 3815
-rw-r--r--  src/shaders.c | 992
-rw-r--r--  src/shaders.h | 387
-rw-r--r--  src/shaders/colorspace.c | 2120
-rw-r--r--  src/shaders/custom.c | 89
-rw-r--r--  src/shaders/custom_mpv.c | 1768
-rw-r--r--  src/shaders/deinterlacing.c | 260
-rw-r--r--  src/shaders/dithering.c | 527
-rw-r--r--  src/shaders/film_grain.c | 65
-rw-r--r--  src/shaders/film_grain.h | 75
-rw-r--r--  src/shaders/film_grain_av1.c | 1001
-rw-r--r--  src/shaders/film_grain_h274.c | 815
-rw-r--r--  src/shaders/icc.c | 781
-rw-r--r--  src/shaders/lut.c | 820
-rw-r--r--  src/shaders/meson.build | 23
-rw-r--r--  src/shaders/sampling.c | 1198
-rw-r--r--  src/swapchain.c | 92
-rw-r--r--  src/swapchain.h | 39
-rw-r--r--  src/tests/bench.c | 550
-rw-r--r--  src/tests/cache.c | 215
-rw-r--r--  src/tests/colorspace.c | 488
-rw-r--r--  src/tests/common.c | 136
-rw-r--r--  src/tests/d3d11.c | 59
-rw-r--r--  src/tests/dav1d.c | 45
-rw-r--r--  src/tests/dither.c | 41
-rw-r--r--  src/tests/dummy.c | 70
-rw-r--r--  src/tests/filters.c | 81
-rw-r--r--  src/tests/fuzz/lut.c | 24
-rw-r--r--  src/tests/fuzz/options.c | 26
-rw-r--r--  src/tests/fuzz/shaders.c | 166
-rw-r--r--  src/tests/fuzz/user_shaders.c | 28
-rw-r--r--  src/tests/gpu_tests.h | 1741
-rw-r--r--  src/tests/icc.c | 106
-rw-r--r--  src/tests/include/include_tmpl.c | 1
-rw-r--r--  src/tests/include/include_tmpl.cpp | 3
-rw-r--r--  src/tests/include/meson.build | 35
-rw-r--r--  src/tests/libav.c | 393
-rw-r--r--  src/tests/lut.c | 86
-rw-r--r--  src/tests/meson.build | 39
-rw-r--r--  src/tests/opengl_surfaceless.c | 247
-rw-r--r--  src/tests/options.c | 123
-rw-r--r--  src/tests/string.c | 147
-rw-r--r--  src/tests/tests.h | 319
-rw-r--r--  src/tests/tone_mapping.c | 181
-rw-r--r--  src/tests/utils.c | 165
-rw-r--r--  src/tests/vulkan.c | 296
-rw-r--r--  src/tone_mapping.c | 775
-rw-r--r--  src/ucrt_math.def | 292
-rw-r--r--  src/utils/dolbyvision.c | 63
-rw-r--r--  src/utils/frame_queue.c | 1030
-rw-r--r--  src/utils/upload.c | 382
-rw-r--r--  src/version.h.in | 1
-rw-r--r--  src/vulkan/command.c | 571
-rw-r--r--  src/vulkan/command.h | 142
-rw-r--r--  src/vulkan/common.h | 234
-rw-r--r--  src/vulkan/context.c | 1704
-rw-r--r--  src/vulkan/formats.c | 616
-rw-r--r--  src/vulkan/formats.h | 34
-rw-r--r--  src/vulkan/gpu.c | 924
-rw-r--r--  src/vulkan/gpu.h | 175
-rw-r--r--  src/vulkan/gpu_buf.c | 470
-rw-r--r--  src/vulkan/gpu_pass.c | 964
-rw-r--r--  src/vulkan/gpu_tex.c | 1453
-rw-r--r--  src/vulkan/malloc.c | 1058
-rw-r--r--  src/vulkan/malloc.h | 72
-rw-r--r--  src/vulkan/meson.build | 59
-rw-r--r--  src/vulkan/stubs.c | 108
-rw-r--r--  src/vulkan/swapchain.c | 911
-rw-r--r--  src/vulkan/utils.c | 181
-rw-r--r--  src/vulkan/utils.h | 136
-rw-r--r--  src/vulkan/utils_gen.c.j2 | 137
-rw-r--r--  src/vulkan/utils_gen.py | 219
178 files changed, 69668 insertions, 0 deletions
diff --git a/src/cache.c b/src/cache.c
new file mode 100644
index 0000000..4f8ed4e
--- /dev/null
+++ b/src/cache.c
@@ -0,0 +1,447 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <limits.h>
+
+#include "common.h"
+#include "cache.h"
+#include "log.h"
+#include "pl_thread.h"
+
+const struct pl_cache_params pl_cache_default_params = {0};
+
+struct priv {
+ pl_log log;
+ pl_mutex lock;
+ PL_ARRAY(pl_cache_obj) objects;
+ size_t total_size;
+};
+
+int pl_cache_objects(pl_cache cache)
+{
+ if (!cache)
+ return 0;
+
+ struct priv *p = PL_PRIV(cache);
+ pl_mutex_lock(&p->lock);
+ int num = p->objects.num;
+ pl_mutex_unlock(&p->lock);
+ return num;
+}
+
+size_t pl_cache_size(pl_cache cache)
+{
+ if (!cache)
+ return 0;
+
+ struct priv *p = PL_PRIV(cache);
+ pl_mutex_lock(&p->lock);
+ size_t size = p->total_size;
+ pl_mutex_unlock(&p->lock);
+ return size;
+}
+
+pl_cache pl_cache_create(const struct pl_cache_params *params)
+{
+ struct pl_cache_t *cache = pl_zalloc_obj(NULL, cache, struct priv);
+ struct priv *p = PL_PRIV(cache);
+ pl_mutex_init(&p->lock);
+ if (params) {
+ cache->params = *params;
+ p->log = params->log;
+ }
+
+ // Sanitize size limits
+ size_t total_size = PL_DEF(cache->params.max_total_size, SIZE_MAX);
+ size_t object_size = PL_DEF(cache->params.max_object_size, SIZE_MAX);
+ object_size = PL_MIN(total_size, object_size);
+ cache->params.max_total_size = total_size;
+ cache->params.max_object_size = object_size;
+
+ return cache;
+}
+
+static void remove_obj(pl_cache cache, pl_cache_obj obj)
+{
+ struct priv *p = PL_PRIV(cache);
+
+ p->total_size -= obj.size;
+ if (obj.free)
+ obj.free(obj.data);
+}
+
+void pl_cache_destroy(pl_cache *pcache)
+{
+ pl_cache cache = *pcache;
+ if (!cache)
+ return;
+
+ struct priv *p = PL_PRIV(cache);
+ for (int i = 0; i < p->objects.num; i++)
+ remove_obj(cache, p->objects.elem[i]);
+
+ pl_assert(p->total_size == 0);
+ pl_mutex_destroy(&p->lock);
+ pl_free((void *) cache);
+ *pcache = NULL;
+}
+
+void pl_cache_reset(pl_cache cache)
+{
+ if (!cache)
+ return;
+
+ struct priv *p = PL_PRIV(cache);
+ pl_mutex_lock(&p->lock);
+ for (int i = 0; i < p->objects.num; i++)
+ remove_obj(cache, p->objects.elem[i]);
+ p->objects.num = 0;
+ pl_assert(p->total_size == 0);
+ pl_mutex_unlock(&p->lock);
+}
+
+static bool try_set(pl_cache cache, pl_cache_obj obj)
+{
+ struct priv *p = PL_PRIV(cache);
+
+ // Remove any existing entry with this key
+ for (int i = p->objects.num - 1; i >= 0; i--) {
+ pl_cache_obj prev = p->objects.elem[i];
+ if (prev.key == obj.key) {
+ PL_TRACE(p, "Removing out-of-date object 0x%"PRIx64, prev.key);
+ remove_obj(cache, prev);
+ PL_ARRAY_REMOVE_AT(p->objects, i);
+ break;
+ }
+ }
+
+ if (!obj.size) {
+ PL_TRACE(p, "Deleted object 0x%"PRIx64, obj.key);
+ return true;
+ }
+
+ if (obj.size > cache->params.max_object_size) {
+ PL_DEBUG(p, "Object 0x%"PRIx64" (size %zu) exceeds max size %zu, discarding",
+ obj.key, obj.size, cache->params.max_object_size);
+ return false;
+ }
+
+ // Make space by deleting old objects
+ while (p->total_size + obj.size > cache->params.max_total_size ||
+ p->objects.num == INT_MAX)
+ {
+ pl_assert(p->objects.num);
+ pl_cache_obj old = p->objects.elem[0];
+ PL_TRACE(p, "Removing object 0x%"PRIx64" (size %zu) to make room",
+ old.key, old.size);
+ remove_obj(cache, old);
+ PL_ARRAY_REMOVE_AT(p->objects, 0);
+ }
+
+ if (!obj.free) {
+ obj.data = pl_memdup(NULL, obj.data, obj.size);
+ obj.free = pl_free;
+ }
+
+ PL_TRACE(p, "Inserting new object 0x%"PRIx64" (size %zu)", obj.key, obj.size);
+ PL_ARRAY_APPEND((void *) cache, p->objects, obj);
+ p->total_size += obj.size;
+ return true;
+}
+
+static pl_cache_obj strip_obj(pl_cache_obj obj)
+{
+ return (pl_cache_obj) { .key = obj.key };
+}
+
+bool pl_cache_try_set(pl_cache cache, pl_cache_obj *pobj)
+{
+ if (!cache)
+ return false;
+
+ pl_cache_obj obj = *pobj;
+ struct priv *p = PL_PRIV(cache);
+ pl_mutex_lock(&p->lock);
+ bool ok = try_set(cache, obj);
+ pl_mutex_unlock(&p->lock);
+ if (ok) {
+ *pobj = strip_obj(obj); // ownership transfers, clear ptr
+ } else {
+ obj = strip_obj(obj); // ownership remains with caller, clear copy
+ }
+ if (cache->params.set)
+ cache->params.set(cache->params.priv, obj);
+ return ok;
+}
+
+void pl_cache_set(pl_cache cache, pl_cache_obj *obj)
+{
+ if (!pl_cache_try_set(cache, obj)) {
+ if (obj->free)
+ obj->free(obj->data);
+ *obj = (pl_cache_obj) { .key = obj->key };
+ }
+}
+
+static void noop(void *ignored)
+{
+ (void) ignored;
+}
+
+bool pl_cache_get(pl_cache cache, pl_cache_obj *out_obj)
+{
+ const uint64_t key = out_obj->key;
+ if (!cache)
+ goto fail;
+
+ struct priv *p = PL_PRIV(cache);
+ pl_mutex_lock(&p->lock);
+
+ // Search backwards to prioritize recently added entries
+ for (int i = p->objects.num - 1; i >= 0; i--) {
+ pl_cache_obj obj = p->objects.elem[i];
+ if (obj.key == key) {
+ PL_ARRAY_REMOVE_AT(p->objects, i);
+ p->total_size -= obj.size;
+ pl_mutex_unlock(&p->lock);
+ pl_assert(obj.free);
+ *out_obj = obj;
+ return true;
+ }
+ }
+
+ pl_mutex_unlock(&p->lock);
+ if (!cache->params.get)
+ goto fail;
+
+ pl_cache_obj obj = cache->params.get(cache->params.priv, key);
+ if (!obj.size)
+ goto fail;
+
+ // Sanitize object
+ obj.key = key;
+ obj.free = PL_DEF(obj.free, noop);
+ *out_obj = obj;
+ return true;
+
+fail:
+ *out_obj = (pl_cache_obj) { .key = key };
+ return false;
+}
+
+void pl_cache_iterate(pl_cache cache,
+ void (*cb)(void *priv, pl_cache_obj obj),
+ void *priv)
+{
+ if (!cache)
+ return;
+
+ struct priv *p = PL_PRIV(cache);
+ pl_mutex_lock(&p->lock);
+ for (int i = 0; i < p->objects.num; i++)
+ cb(priv, p->objects.elem[i]);
+ pl_mutex_unlock(&p->lock);
+}
+
+// --- Saving/loading
+
+#define CACHE_MAGIC "pl_cache"
+#define CACHE_VERSION 1
+#define PAD_ALIGN(x) PL_ALIGN2(x, sizeof(uint32_t))
+
+struct __attribute__((__packed__)) cache_header {
+ char magic[8];
+ uint32_t version;
+ uint32_t num_entries;
+};
+
+struct __attribute__((__packed__)) cache_entry {
+ uint64_t key;
+ uint64_t size;
+ uint64_t hash;
+};
+
+pl_static_assert(sizeof(struct cache_header) % alignof(struct cache_entry) == 0);
+
+int pl_cache_save_ex(pl_cache cache,
+ void (*write)(void *priv, size_t size, const void *ptr),
+ void *priv)
+{
+ if (!cache)
+ return 0;
+
+ struct priv *p = PL_PRIV(cache);
+ pl_mutex_lock(&p->lock);
+ pl_clock_t start = pl_clock_now();
+
+ const int num_objects = p->objects.num;
+ const size_t saved_bytes = p->total_size;
+ write(priv, sizeof(struct cache_header), &(struct cache_header) {
+ .magic = CACHE_MAGIC,
+ .version = CACHE_VERSION,
+ .num_entries = num_objects,
+ });
+
+ for (int i = 0; i < num_objects; i++) {
+ pl_cache_obj obj = p->objects.elem[i];
+ PL_TRACE(p, "Saving object 0x%"PRIx64" (size %zu)", obj.key, obj.size);
+ write(priv, sizeof(struct cache_entry), &(struct cache_entry) {
+ .key = obj.key,
+ .size = obj.size,
+ .hash = pl_mem_hash(obj.data, obj.size),
+ });
+ static const uint8_t padding[PAD_ALIGN(1)] = {0};
+ write(priv, obj.size, obj.data);
+ write(priv, PAD_ALIGN(obj.size) - obj.size, padding);
+ }
+
+ pl_mutex_unlock(&p->lock);
+ pl_log_cpu_time(p->log, start, pl_clock_now(), "saving cache");
+ if (num_objects)
+ PL_DEBUG(p, "Saved %d objects, totalling %zu bytes", num_objects, saved_bytes);
+
+ return num_objects;
+}
+
+int pl_cache_load_ex(pl_cache cache,
+ bool (*read)(void *priv, size_t size, void *ptr),
+ void *priv)
+{
+ if (!cache)
+ return 0;
+
+ struct priv *p = PL_PRIV(cache);
+ struct cache_header header;
+ if (!read(priv, sizeof(header), &header)) {
+ PL_ERR(p, "Failed loading cache: file seems empty or truncated");
+ return -1;
+ }
+ if (memcmp(header.magic, CACHE_MAGIC, sizeof(header.magic)) != 0) {
+ PL_ERR(p, "Failed loading cache: invalid magic bytes");
+ return -1;
+ }
+ if (header.version != CACHE_VERSION) {
+ PL_INFO(p, "Failed loading cache: wrong version... skipping");
+ return 0;
+ }
+ if (header.num_entries > INT_MAX) {
+ PL_ERR(p, "Failed loading cache: %"PRIu32" entries overflows int",
+ header.num_entries);
+ return 0;
+ }
+
+ int num_loaded = 0;
+ size_t loaded_bytes = 0;
+ pl_mutex_lock(&p->lock);
+ pl_clock_t start = pl_clock_now();
+
+ for (int i = 0; i < header.num_entries; i++) {
+ struct cache_entry entry;
+ if (!read(priv, sizeof(entry), &entry)) {
+ PL_WARN(p, "Cache seems truncated, missing objects.. ignoring rest");
+ goto error;
+ }
+
+ if (entry.size > SIZE_MAX) {
+ PL_WARN(p, "Cache object size %"PRIu64" overflows SIZE_MAX.. "
+ "suspect broken file, ignoring rest", entry.size);
+ goto error;
+ }
+
+ void *buf = pl_alloc(NULL, PAD_ALIGN(entry.size));
+ if (!read(priv, PAD_ALIGN(entry.size), buf)) {
+ PL_WARN(p, "Cache seems truncated, missing objects.. ignoring rest");
+ pl_free(buf);
+ goto error;
+ }
+
+ uint64_t checksum = pl_mem_hash(buf, entry.size);
+ if (checksum != entry.hash) {
+ PL_WARN(p, "Cache entry seems corrupt, checksum mismatch.. ignoring rest");
+ pl_free(buf);
+ goto error;
+ }
+
+ pl_cache_obj obj = {
+ .key = entry.key,
+ .size = entry.size,
+ .data = buf,
+ .free = pl_free,
+ };
+
+ PL_TRACE(p, "Loading object 0x%"PRIx64" (size %zu)", obj.key, obj.size);
+ if (try_set(cache, obj)) {
+ num_loaded++;
+ loaded_bytes += entry.size;
+ } else {
+ pl_free(buf);
+ }
+ }
+
+ pl_log_cpu_time(p->log, start, pl_clock_now(), "loading cache");
+ if (num_loaded)
+ PL_DEBUG(p, "Loaded %d objects, totalling %zu bytes", num_loaded, loaded_bytes);
+
+ // fall through
+error:
+ pl_mutex_unlock(&p->lock);
+ return num_loaded;
+}
+
+// Save/load wrappers
+
+struct ptr_ctx {
+ uint8_t *data; // base pointer
+ size_t size; // total size
+ size_t pos; // read/write index
+};
+
+static void write_ptr(void *priv, size_t size, const void *ptr)
+{
+ struct ptr_ctx *ctx = priv;
+ size_t end = PL_MIN(ctx->pos + size, ctx->size);
+ if (end > ctx->pos)
+ memcpy(ctx->data + ctx->pos, ptr, end - ctx->pos);
+ ctx->pos += size;
+}
+
+static bool read_ptr(void *priv, size_t size, void *ptr)
+{
+ struct ptr_ctx *ctx = priv;
+ if (ctx->pos + size > ctx->size)
+ return false;
+ memcpy(ptr, ctx->data + ctx->pos, size);
+ ctx->pos += size;
+ return true;
+}
+
+size_t pl_cache_save(pl_cache cache, uint8_t *data, size_t size)
+{
+ struct ptr_ctx ctx = { data, size };
+ pl_cache_save_ex(cache, write_ptr, &ctx);
+ return ctx.pos;
+}
+
+int pl_cache_load(pl_cache cache, const uint8_t *data, size_t size)
+{
+ return pl_cache_load_ex(cache, read_ptr, &(struct ptr_ctx) {
+ .data = (uint8_t *) data,
+ .size = size,
+ });
+}
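
The cache API added by src/cache.c above can be exercised end-to-end without any GPU state. The following stand-alone sketch is not part of the patch; the key value, buffer sizes, and function name are arbitrary assumptions chosen for illustration. It shows the intended round trip: create a cache, insert an object (duplicated internally because no .free callback is given), serialize the contents to a flat buffer with pl_cache_save, and pop the object back out with pl_cache_get.

#include <string.h>
#include <libplacebo/cache.h>

static void cache_example(void)
{
    // Create a cache with an (arbitrary) 64 KiB total size budget
    pl_cache cache = pl_cache_create(&(struct pl_cache_params) {
        .max_total_size = 64 << 10,
    });

    // Insert an object; with no .free callback, try_set() copies the data
    static const uint8_t payload[4] = {1, 2, 3, 4};
    pl_cache_set(cache, &(pl_cache_obj) {
        .key  = 0x1234,             // hypothetical key
        .data = (void *) payload,
        .size = sizeof(payload),
    });

    // Serialize all cached objects into a caller-provided buffer...
    uint8_t buf[1024];
    size_t used = pl_cache_save(cache, buf, sizeof(buf));
    // ...which pl_cache_load(cache, buf, used) could later restore

    // Look the object up again; this removes it from the cache and
    // transfers ownership of obj.data to the caller
    pl_cache_obj obj = { .key = 0x1234 };
    if (pl_cache_get(cache, &obj))
        obj.free(obj.data);

    pl_cache_destroy(&cache);
}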
diff --git a/src/cache.h b/src/cache.h
new file mode 100644
index 0000000..7e0ff2f
--- /dev/null
+++ b/src/cache.h
@@ -0,0 +1,72 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "hash.h"
+
+#include <libplacebo/cache.h>
+
+// Convenience wrapper around pl_cache_set
+static inline void pl_cache_str(pl_cache cache, uint64_t key, pl_str *str)
+{
+ pl_cache_set(cache, &(pl_cache_obj) {
+ .key = key,
+ .data = pl_steal(NULL, str->buf),
+ .size = str->len,
+ .free = pl_free,
+ });
+ *str = (pl_str) {0};
+}
+
+// Steal and insert a cache object
+static inline void pl_cache_steal(pl_cache cache, pl_cache_obj *obj)
+{
+ if (obj->free == pl_free)
+ obj->data = pl_steal(NULL, obj->data);
+ pl_cache_set(cache, obj);
+}
+
+// Resize `obj->data` to a given size, re-using allocated buffers where possible
+static inline void pl_cache_obj_resize(void *alloc, pl_cache_obj *obj, size_t size)
+{
+ if (obj->free != pl_free) {
+ if (obj->free)
+ obj->free(obj->data);
+ obj->data = pl_alloc(alloc, size);
+ obj->free = pl_free;
+ } else if (pl_get_size(obj->data) < size) {
+ obj->data = pl_steal(alloc, obj->data);
+ obj->data = pl_realloc(alloc, obj->data, size);
+ }
+ obj->size = size;
+}
+
+// Internal list of base seeds for different object types, randomly generated
+
+enum {
+ CACHE_KEY_SH_LUT = UINT64_C(0x2206183d320352c6), // sh_lut cache
+ CACHE_KEY_ICC_3DLUT = UINT64_C(0xff703a6dd8a996f6), // ICC 3dlut
+ CACHE_KEY_DITHER = UINT64_C(0x6fed75eb6dce86cb), // dither matrix
+ CACHE_KEY_H274 = UINT64_C(0x2fb9adca04b42c4d), // H.274 film grain DB
+ CACHE_KEY_GAMUT_LUT = UINT64_C(0x6109e47f15d478b1), // gamut mapping 3DLUT
+ CACHE_KEY_SPIRV = UINT64_C(0x32352f6605ff60a7), // bare SPIR-V module
+ CACHE_KEY_VK_PIPE = UINT64_C(0x4bdab2817ad02ad4), // VkPipelineCache
+ CACHE_KEY_GL_PROG = UINT64_C(0x4274c309f4f0477b), // GL_ARB_get_program_binary
+ CACHE_KEY_D3D_DXBC = UINT64_C(0x807668516811d3bc), // DXBC bytecode
+};
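
The helpers in this header are intended for internal producers of cacheable blobs. As a rough illustration of how they compose (a sketch under assumptions, not code from this patch: the function name and the seed-XOR-hash key derivation are hypothetical), a producer might combine one of the CACHE_KEY_* seeds with a content hash, size the object's buffer with pl_cache_obj_resize, and hand the result over with pl_cache_steal:

#include <string.h>
#include "cache.h"

// Hypothetical producer: stores a generated LUT under the shared
// CACHE_KEY_SH_LUT seed. The `seed ^ pl_mem_hash()` scheme is only an
// illustration of combining a base seed with a content hash.
static void store_generated_lut(pl_cache cache, pl_cache_obj *obj,
                                const float *lut, size_t count)
{
    const size_t size = count * sizeof(*lut);
    obj->key = CACHE_KEY_SH_LUT ^ pl_mem_hash(lut, size);

    // Keeps an existing pl_free-owned buffer if it is already large enough,
    // otherwise (re)allocates one owned by pl_free
    pl_cache_obj_resize(NULL, obj, size);
    memcpy(obj->data, lut, size);

    // Re-parents the buffer and inserts it; on return, *obj no longer owns
    // any data (the cache took ownership, or the data was freed on failure)
    pl_cache_steal(cache, obj);
}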
diff --git a/src/colorspace.c b/src/colorspace.c
new file mode 100644
index 0000000..5cef2b5
--- /dev/null
+++ b/src/colorspace.c
@@ -0,0 +1,1609 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "common.h"
+#include "hash.h"
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/tone_mapping.h>
+
+bool pl_color_system_is_ycbcr_like(enum pl_color_system sys)
+{
+ switch (sys) {
+ case PL_COLOR_SYSTEM_UNKNOWN:
+ case PL_COLOR_SYSTEM_RGB:
+ case PL_COLOR_SYSTEM_XYZ:
+ return false;
+ case PL_COLOR_SYSTEM_BT_601:
+ case PL_COLOR_SYSTEM_BT_709:
+ case PL_COLOR_SYSTEM_SMPTE_240M:
+ case PL_COLOR_SYSTEM_BT_2020_NC:
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ case PL_COLOR_SYSTEM_BT_2100_PQ:
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ case PL_COLOR_SYSTEM_DOLBYVISION:
+ case PL_COLOR_SYSTEM_YCGCO:
+ return true;
+ case PL_COLOR_SYSTEM_COUNT: break;
+ };
+
+ pl_unreachable();
+}
+
+bool pl_color_system_is_linear(enum pl_color_system sys)
+{
+ switch (sys) {
+ case PL_COLOR_SYSTEM_UNKNOWN:
+ case PL_COLOR_SYSTEM_RGB:
+ case PL_COLOR_SYSTEM_BT_601:
+ case PL_COLOR_SYSTEM_BT_709:
+ case PL_COLOR_SYSTEM_SMPTE_240M:
+ case PL_COLOR_SYSTEM_BT_2020_NC:
+ case PL_COLOR_SYSTEM_YCGCO:
+ return true;
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ case PL_COLOR_SYSTEM_BT_2100_PQ:
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ case PL_COLOR_SYSTEM_DOLBYVISION:
+ case PL_COLOR_SYSTEM_XYZ:
+ return false;
+ case PL_COLOR_SYSTEM_COUNT: break;
+ };
+
+ pl_unreachable();
+}
+
+enum pl_color_system pl_color_system_guess_ycbcr(int width, int height)
+{
+ if (width >= 1280 || height > 576) {
+ // Typical HD content
+ return PL_COLOR_SYSTEM_BT_709;
+ } else {
+ // Typical SD content
+ return PL_COLOR_SYSTEM_BT_601;
+ }
+}
+
+bool pl_bit_encoding_equal(const struct pl_bit_encoding *b1,
+ const struct pl_bit_encoding *b2)
+{
+ return b1->sample_depth == b2->sample_depth &&
+ b1->color_depth == b2->color_depth &&
+ b1->bit_shift == b2->bit_shift;
+}
+
+const struct pl_color_repr pl_color_repr_unknown = {0};
+
+const struct pl_color_repr pl_color_repr_rgb = {
+ .sys = PL_COLOR_SYSTEM_RGB,
+ .levels = PL_COLOR_LEVELS_FULL,
+};
+
+const struct pl_color_repr pl_color_repr_sdtv = {
+ .sys = PL_COLOR_SYSTEM_BT_601,
+ .levels = PL_COLOR_LEVELS_LIMITED,
+};
+
+const struct pl_color_repr pl_color_repr_hdtv = {
+ .sys = PL_COLOR_SYSTEM_BT_709,
+ .levels = PL_COLOR_LEVELS_LIMITED,
+};
+
+const struct pl_color_repr pl_color_repr_uhdtv = {
+ .sys = PL_COLOR_SYSTEM_BT_2020_NC,
+ .levels = PL_COLOR_LEVELS_LIMITED,
+};
+
+const struct pl_color_repr pl_color_repr_jpeg = {
+ .sys = PL_COLOR_SYSTEM_BT_601,
+ .levels = PL_COLOR_LEVELS_FULL,
+};
+
+bool pl_color_repr_equal(const struct pl_color_repr *c1,
+ const struct pl_color_repr *c2)
+{
+ return c1->sys == c2->sys &&
+ c1->levels == c2->levels &&
+ c1->alpha == c2->alpha &&
+ c1->dovi == c2->dovi &&
+ pl_bit_encoding_equal(&c1->bits, &c2->bits);
+}
+
+static struct pl_bit_encoding pl_bit_encoding_merge(const struct pl_bit_encoding *orig,
+ const struct pl_bit_encoding *new)
+{
+ return (struct pl_bit_encoding) {
+ .sample_depth = PL_DEF(orig->sample_depth, new->sample_depth),
+ .color_depth = PL_DEF(orig->color_depth, new->color_depth),
+ .bit_shift = PL_DEF(orig->bit_shift, new->bit_shift),
+ };
+}
+
+void pl_color_repr_merge(struct pl_color_repr *orig, const struct pl_color_repr *new)
+{
+ *orig = (struct pl_color_repr) {
+ .sys = PL_DEF(orig->sys, new->sys),
+ .levels = PL_DEF(orig->levels, new->levels),
+ .alpha = PL_DEF(orig->alpha, new->alpha),
+ .dovi = PL_DEF(orig->dovi, new->dovi),
+ .bits = pl_bit_encoding_merge(&orig->bits, &new->bits),
+ };
+}
+
+enum pl_color_levels pl_color_levels_guess(const struct pl_color_repr *repr)
+{
+ if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+ return PL_COLOR_LEVELS_FULL;
+
+ if (repr->levels)
+ return repr->levels;
+
+ return pl_color_system_is_ycbcr_like(repr->sys)
+ ? PL_COLOR_LEVELS_LIMITED
+ : PL_COLOR_LEVELS_FULL;
+}
+
+float pl_color_repr_normalize(struct pl_color_repr *repr)
+{
+ float scale = 1.0;
+ struct pl_bit_encoding *bits = &repr->bits;
+
+ if (bits->bit_shift) {
+ scale /= (1LL << bits->bit_shift);
+ bits->bit_shift = 0;
+ }
+
+ // If one of these is set but not the other, use the set one
+ int tex_bits = PL_DEF(bits->sample_depth, 8);
+ int col_bits = PL_DEF(bits->color_depth, tex_bits);
+ tex_bits = PL_DEF(tex_bits, col_bits);
+
+ if (pl_color_levels_guess(repr) == PL_COLOR_LEVELS_LIMITED) {
+ // Limit range is always shifted directly
+ scale *= (float) (1LL << tex_bits) / (1LL << col_bits);
+ } else {
+ // Full range always uses the full range available
+ scale *= ((1LL << tex_bits) - 1.) / ((1LL << col_bits) - 1.);
+ }
+
+ bits->color_depth = bits->sample_depth;
+ return scale;
+}
+
+bool pl_color_primaries_is_wide_gamut(enum pl_color_primaries prim)
+{
+ switch (prim) {
+ case PL_COLOR_PRIM_UNKNOWN:
+ case PL_COLOR_PRIM_BT_601_525:
+ case PL_COLOR_PRIM_BT_601_625:
+ case PL_COLOR_PRIM_BT_709:
+ case PL_COLOR_PRIM_BT_470M:
+ case PL_COLOR_PRIM_EBU_3213:
+ return false;
+ case PL_COLOR_PRIM_BT_2020:
+ case PL_COLOR_PRIM_APPLE:
+ case PL_COLOR_PRIM_ADOBE:
+ case PL_COLOR_PRIM_PRO_PHOTO:
+ case PL_COLOR_PRIM_CIE_1931:
+ case PL_COLOR_PRIM_DCI_P3:
+ case PL_COLOR_PRIM_DISPLAY_P3:
+ case PL_COLOR_PRIM_V_GAMUT:
+ case PL_COLOR_PRIM_S_GAMUT:
+ case PL_COLOR_PRIM_FILM_C:
+ case PL_COLOR_PRIM_ACES_AP0:
+ case PL_COLOR_PRIM_ACES_AP1:
+ return true;
+ case PL_COLOR_PRIM_COUNT: break;
+ }
+
+ pl_unreachable();
+}
+
+enum pl_color_primaries pl_color_primaries_guess(int width, int height)
+{
+ // HD content
+ if (width >= 1280 || height > 576)
+ return PL_COLOR_PRIM_BT_709;
+
+ switch (height) {
+ case 576: // Typical PAL content, including anamorphic/squared
+ return PL_COLOR_PRIM_BT_601_625;
+
+ case 480: // Typical NTSC content, including squared
+ case 486: // NTSC Pro or anamorphic NTSC
+ return PL_COLOR_PRIM_BT_601_525;
+
+ default: // No good metric, just pick BT.709 to minimize damage
+ return PL_COLOR_PRIM_BT_709;
+ }
+}
+
+// HLG 75% value (scene-referred)
+#define HLG_75 3.17955
+
+float pl_color_transfer_nominal_peak(enum pl_color_transfer trc)
+{
+ switch (trc) {
+ case PL_COLOR_TRC_UNKNOWN:
+ case PL_COLOR_TRC_BT_1886:
+ case PL_COLOR_TRC_SRGB:
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_GAMMA18:
+ case PL_COLOR_TRC_GAMMA20:
+ case PL_COLOR_TRC_GAMMA22:
+ case PL_COLOR_TRC_GAMMA24:
+ case PL_COLOR_TRC_GAMMA26:
+ case PL_COLOR_TRC_GAMMA28:
+ case PL_COLOR_TRC_PRO_PHOTO:
+ case PL_COLOR_TRC_ST428:
+ return 1.0;
+ case PL_COLOR_TRC_PQ: return 10000.0 / PL_COLOR_SDR_WHITE;
+ case PL_COLOR_TRC_HLG: return 12.0 / HLG_75;
+ case PL_COLOR_TRC_V_LOG: return 46.0855;
+ case PL_COLOR_TRC_S_LOG1: return 6.52;
+ case PL_COLOR_TRC_S_LOG2: return 9.212;
+ case PL_COLOR_TRC_COUNT: break;
+ }
+
+ pl_unreachable();
+}
+
+const struct pl_hdr_metadata pl_hdr_metadata_empty = {0};
+const struct pl_hdr_metadata pl_hdr_metadata_hdr10 = {
+ .prim = {
+ .red = {0.708, 0.292},
+ .green = {0.170, 0.797},
+ .blue = {0.131, 0.046},
+ .white = {0.31271, 0.32902},
+ },
+ .min_luma = 0,
+ .max_luma = 10000,
+ .max_cll = 10000,
+ .max_fall = 0, // unknown
+};
+
+static const float PQ_M1 = 2610./4096 * 1./4,
+ PQ_M2 = 2523./4096 * 128,
+ PQ_C1 = 3424./4096,
+ PQ_C2 = 2413./4096 * 32,
+ PQ_C3 = 2392./4096 * 32;
+
+float pl_hdr_rescale(enum pl_hdr_scaling from, enum pl_hdr_scaling to, float x)
+{
+ if (from == to)
+ return x;
+ if (!x) // micro-optimization for common value
+ return x;
+
+ x = fmaxf(x, 0.0f);
+
+ // Convert input to PL_SCALE_RELATIVE
+ switch (from) {
+ case PL_HDR_PQ:
+ x = powf(x, 1.0f / PQ_M2);
+ x = fmaxf(x - PQ_C1, 0.0f) / (PQ_C2 - PQ_C3 * x);
+ x = powf(x, 1.0f / PQ_M1);
+ x *= 10000.0f;
+ // fall through
+ case PL_HDR_NITS:
+ x /= PL_COLOR_SDR_WHITE;
+ // fall through
+ case PL_HDR_NORM:
+ goto output;
+ case PL_HDR_SQRT:
+ x *= x;
+ goto output;
+ case PL_HDR_SCALING_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+
+output:
+ // Convert PL_SCALE_RELATIVE to output
+ switch (to) {
+ case PL_HDR_NORM:
+ return x;
+ case PL_HDR_SQRT:
+ return sqrtf(x);
+ case PL_HDR_NITS:
+ return x * PL_COLOR_SDR_WHITE;
+ case PL_HDR_PQ:
+ x *= PL_COLOR_SDR_WHITE / 10000.0f;
+ x = powf(x, PQ_M1);
+ x = (PQ_C1 + PQ_C2 * x) / (1.0f + PQ_C3 * x);
+ x = powf(x, PQ_M2);
+ return x;
+ case PL_HDR_SCALING_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static inline bool pl_hdr_bezier_equal(const struct pl_hdr_bezier *a,
+ const struct pl_hdr_bezier *b)
+{
+ return a->target_luma == b->target_luma &&
+ a->knee_x == b->knee_x &&
+ a->knee_y == b->knee_y &&
+ a->num_anchors == b->num_anchors &&
+ !memcmp(a->anchors, b->anchors, sizeof(a->anchors[0]) * a->num_anchors);
+}
+
+bool pl_hdr_metadata_equal(const struct pl_hdr_metadata *a,
+ const struct pl_hdr_metadata *b)
+{
+ return pl_raw_primaries_equal(&a->prim, &b->prim) &&
+ a->min_luma == b->min_luma &&
+ a->max_luma == b->max_luma &&
+ a->max_cll == b->max_cll &&
+ a->max_fall == b->max_fall &&
+ a->scene_max[0] == b->scene_max[0] &&
+ a->scene_max[1] == b->scene_max[1] &&
+ a->scene_max[2] == b->scene_max[2] &&
+ a->scene_avg == b->scene_avg &&
+ pl_hdr_bezier_equal(&a->ootf, &b->ootf) &&
+ a->max_pq_y == b->max_pq_y &&
+ a->avg_pq_y == b->avg_pq_y;
+}
+
+void pl_hdr_metadata_merge(struct pl_hdr_metadata *orig,
+ const struct pl_hdr_metadata *update)
+{
+ pl_raw_primaries_merge(&orig->prim, &update->prim);
+ if (!orig->min_luma)
+ orig->min_luma = update->min_luma;
+ if (!orig->max_luma)
+ orig->max_luma = update->max_luma;
+ if (!orig->max_cll)
+ orig->max_cll = update->max_cll;
+ if (!orig->max_fall)
+ orig->max_fall = update->max_fall;
+ if (!orig->scene_max[1])
+ memcpy(orig->scene_max, update->scene_max, sizeof(orig->scene_max));
+ if (!orig->scene_avg)
+ orig->scene_avg = update->scene_avg;
+ if (!orig->ootf.target_luma)
+ orig->ootf = update->ootf;
+ if (!orig->max_pq_y)
+ orig->max_pq_y = update->max_pq_y;
+ if (!orig->avg_pq_y)
+ orig->avg_pq_y = update->avg_pq_y;
+}
+
+bool pl_hdr_metadata_contains(const struct pl_hdr_metadata *data,
+ enum pl_hdr_metadata_type type)
+{
+ bool has_hdr10 = data->max_luma;
+ bool has_hdr10plus = data->scene_avg && (data->scene_max[0] ||
+ data->scene_max[1] ||
+ data->scene_max[2]);
+ bool has_cie_y = data->max_pq_y && data->avg_pq_y;
+
+ switch (type) {
+ case PL_HDR_METADATA_NONE: return true;
+ case PL_HDR_METADATA_ANY: return has_hdr10 || has_hdr10plus || has_cie_y;
+ case PL_HDR_METADATA_HDR10: return has_hdr10;
+ case PL_HDR_METADATA_HDR10PLUS: return has_hdr10plus;
+ case PL_HDR_METADATA_CIE_Y: return has_cie_y;
+ case PL_HDR_METADATA_TYPE_COUNT: break;
+ }
+
+ pl_unreachable();
+}
+
+const struct pl_color_space pl_color_space_unknown = {0};
+
+const struct pl_color_space pl_color_space_srgb = {
+ .primaries = PL_COLOR_PRIM_BT_709,
+ .transfer = PL_COLOR_TRC_SRGB,
+};
+
+const struct pl_color_space pl_color_space_bt709 = {
+ .primaries = PL_COLOR_PRIM_BT_709,
+ .transfer = PL_COLOR_TRC_BT_1886,
+};
+
+const struct pl_color_space pl_color_space_hdr10 = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_PQ,
+};
+
+const struct pl_color_space pl_color_space_bt2020_hlg = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_HLG,
+};
+
+const struct pl_color_space pl_color_space_monitor = {
+ .primaries = PL_COLOR_PRIM_BT_709, // sRGB primaries
+ .transfer = PL_COLOR_TRC_UNKNOWN, // unknown SDR response
+};
+
+bool pl_color_space_is_hdr(const struct pl_color_space *csp)
+{
+ return csp->hdr.max_luma > PL_COLOR_SDR_WHITE ||
+ pl_color_transfer_is_hdr(csp->transfer);
+}
+
+bool pl_color_space_is_black_scaled(const struct pl_color_space *csp)
+{
+ switch (csp->transfer) {
+ case PL_COLOR_TRC_UNKNOWN:
+ case PL_COLOR_TRC_SRGB:
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_GAMMA18:
+ case PL_COLOR_TRC_GAMMA20:
+ case PL_COLOR_TRC_GAMMA22:
+ case PL_COLOR_TRC_GAMMA24:
+ case PL_COLOR_TRC_GAMMA26:
+ case PL_COLOR_TRC_GAMMA28:
+ case PL_COLOR_TRC_PRO_PHOTO:
+ case PL_COLOR_TRC_ST428:
+ case PL_COLOR_TRC_HLG:
+ return true;
+
+ case PL_COLOR_TRC_BT_1886:
+ case PL_COLOR_TRC_PQ:
+ case PL_COLOR_TRC_V_LOG:
+ case PL_COLOR_TRC_S_LOG1:
+ case PL_COLOR_TRC_S_LOG2:
+ return false;
+
+ case PL_COLOR_TRC_COUNT: break;
+ }
+
+ pl_unreachable();
+}
+
+void pl_color_space_merge(struct pl_color_space *orig,
+ const struct pl_color_space *new)
+{
+ if (!orig->primaries)
+ orig->primaries = new->primaries;
+ if (!orig->transfer)
+ orig->transfer = new->transfer;
+ pl_hdr_metadata_merge(&orig->hdr, &new->hdr);
+}
+
+bool pl_color_space_equal(const struct pl_color_space *c1,
+ const struct pl_color_space *c2)
+{
+ return c1->primaries == c2->primaries &&
+ c1->transfer == c2->transfer &&
+ pl_hdr_metadata_equal(&c1->hdr, &c2->hdr);
+}
+
+// Estimates luminance from maxRGB by looking at how monochromatic MaxSCL is
+static void luma_from_maxrgb(const struct pl_color_space *csp,
+ enum pl_hdr_scaling scaling,
+ float *out_max, float *out_avg)
+{
+ const float maxscl = PL_MAX3(csp->hdr.scene_max[0],
+ csp->hdr.scene_max[1],
+ csp->hdr.scene_max[2]);
+ if (!maxscl)
+ return;
+
+ struct pl_raw_primaries prim = csp->hdr.prim;
+ pl_raw_primaries_merge(&prim, pl_raw_primaries_get(csp->primaries));
+ const pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(&prim);
+
+ const float max_luma = rgb2xyz.m[1][0] * csp->hdr.scene_max[0] +
+ rgb2xyz.m[1][1] * csp->hdr.scene_max[1] +
+ rgb2xyz.m[1][2] * csp->hdr.scene_max[2];
+
+ const float coef = max_luma / maxscl;
+ *out_max = pl_hdr_rescale(PL_HDR_NITS, scaling, max_luma);
+ *out_avg = pl_hdr_rescale(PL_HDR_NITS, scaling, coef * csp->hdr.scene_avg);
+}
+
+static inline bool metadata_compat(enum pl_hdr_metadata_type metadata,
+ enum pl_hdr_metadata_type compat)
+{
+ return metadata == PL_HDR_METADATA_ANY || metadata == compat;
+}
+
+void pl_color_space_nominal_luma_ex(const struct pl_nominal_luma_params *params)
+{
+ if (!params || (!params->out_min && !params->out_max && !params->out_avg))
+ return;
+
+ const struct pl_color_space *csp = params->color;
+ const enum pl_hdr_scaling scaling = params->scaling;
+
+ float min_luma = 0, max_luma = 0, avg_luma = 0;
+ if (params->metadata != PL_HDR_METADATA_NONE) {
+ // Initialize from static HDR10 metadata, in all cases
+ min_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, csp->hdr.min_luma);
+ max_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, csp->hdr.max_luma);
+ }
+
+ if (metadata_compat(params->metadata, PL_HDR_METADATA_HDR10PLUS) &&
+ pl_hdr_metadata_contains(&csp->hdr, PL_HDR_METADATA_HDR10PLUS))
+ {
+ luma_from_maxrgb(csp, scaling, &max_luma, &avg_luma);
+ }
+
+ if (metadata_compat(params->metadata, PL_HDR_METADATA_CIE_Y) &&
+ pl_hdr_metadata_contains(&csp->hdr, PL_HDR_METADATA_CIE_Y))
+ {
+ max_luma = pl_hdr_rescale(PL_HDR_PQ, scaling, csp->hdr.max_pq_y);
+ avg_luma = pl_hdr_rescale(PL_HDR_PQ, scaling, csp->hdr.avg_pq_y);
+ }
+
+ // Clamp to sane value range
+ const float hdr_min = pl_hdr_rescale(PL_HDR_NITS, scaling, PL_COLOR_HDR_BLACK);
+ const float hdr_max = pl_hdr_rescale(PL_HDR_PQ, scaling, 1.0f);
+ max_luma = max_luma ? PL_CLAMP(max_luma, hdr_min, hdr_max) : 0;
+ min_luma = min_luma ? PL_CLAMP(min_luma, hdr_min, hdr_max) : 0;
+ if ((max_luma && min_luma >= max_luma) || min_luma >= hdr_max)
+ min_luma = max_luma = 0; // sanity
+
+ // PQ is always scaled down to absolute black, ignoring HDR metadata
+ if (csp->transfer == PL_COLOR_TRC_PQ)
+ min_luma = hdr_min;
+
+ // Baseline/fallback metadata, inferred entirely from the colorspace
+ // description and built-in default assumptions
+ if (!max_luma) {
+ if (csp->transfer == PL_COLOR_TRC_HLG) {
+ max_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, PL_COLOR_HLG_PEAK);
+ } else {
+ const float peak = pl_color_transfer_nominal_peak(csp->transfer);
+ max_luma = pl_hdr_rescale(PL_HDR_NORM, scaling, peak);
+ }
+ }
+
+ if (!min_luma) {
+ if (pl_color_transfer_is_hdr(csp->transfer)) {
+ min_luma = hdr_min;
+ } else {
+ const float peak = pl_hdr_rescale(scaling, PL_HDR_NITS, max_luma);
+ min_luma = pl_hdr_rescale(PL_HDR_NITS, scaling,
+ peak / PL_COLOR_SDR_CONTRAST);
+ }
+ }
+
+ if (avg_luma)
+ avg_luma = PL_CLAMP(avg_luma, min_luma, max_luma); // sanity
+
+ if (params->out_min)
+ *params->out_min = min_luma;
+ if (params->out_max)
+ *params->out_max = max_luma;
+ if (params->out_avg)
+ *params->out_avg = avg_luma;
+}
+
+void pl_color_space_nominal_luma(const struct pl_color_space *csp,
+ float *out_min, float *out_max)
+{
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = csp,
+ .metadata = PL_HDR_METADATA_ANY,
+ .scaling = PL_HDR_NORM,
+ .out_min = out_min,
+ .out_max = out_max,
+ ));
+}
+
+void pl_color_space_infer(struct pl_color_space *space)
+{
+ if (!space->primaries)
+ space->primaries = PL_COLOR_PRIM_BT_709;
+ if (!space->transfer)
+ space->transfer = PL_COLOR_TRC_BT_1886;
+
+ // Sanitize the static HDR metadata
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = space,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_NITS,
+ .out_max = &space->hdr.max_luma,
+ // Preserve tagged minimum
+ .out_min = space->hdr.min_luma ? NULL : &space->hdr.min_luma,
+ ));
+
+ // Default the signal color space based on the nominal raw primaries
+ if (!pl_primaries_valid(&space->hdr.prim))
+ space->hdr.prim = *pl_raw_primaries_get(space->primaries);
+}
+
+static void infer_both_ref(struct pl_color_space *space,
+ struct pl_color_space *ref)
+{
+ pl_color_space_infer(ref);
+
+ if (!space->primaries) {
+ if (pl_color_primaries_is_wide_gamut(ref->primaries)) {
+ space->primaries = PL_COLOR_PRIM_BT_709;
+ } else {
+ space->primaries = ref->primaries;
+ }
+ }
+
+ if (!space->transfer) {
+ switch (ref->transfer) {
+ case PL_COLOR_TRC_UNKNOWN:
+ case PL_COLOR_TRC_COUNT:
+ pl_unreachable();
+ case PL_COLOR_TRC_BT_1886:
+ case PL_COLOR_TRC_SRGB:
+ case PL_COLOR_TRC_GAMMA22:
+ // Re-use input transfer curve to avoid small adaptations
+ space->transfer = ref->transfer;
+ break;
+ case PL_COLOR_TRC_PQ:
+ case PL_COLOR_TRC_HLG:
+ case PL_COLOR_TRC_V_LOG:
+ case PL_COLOR_TRC_S_LOG1:
+ case PL_COLOR_TRC_S_LOG2:
+ // Pick BT.1886 model because it models SDR contrast accurately,
+ // and we need contrast information for tone mapping
+ space->transfer = PL_COLOR_TRC_BT_1886;
+ break;
+ case PL_COLOR_TRC_PRO_PHOTO:
+ // ProPhotoRGB and sRGB are both piecewise with linear slope
+ space->transfer = PL_COLOR_TRC_SRGB;
+ break;
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_GAMMA18:
+ case PL_COLOR_TRC_GAMMA20:
+ case PL_COLOR_TRC_GAMMA24:
+ case PL_COLOR_TRC_GAMMA26:
+ case PL_COLOR_TRC_GAMMA28:
+ case PL_COLOR_TRC_ST428:
+ // Pick pure power output curve to avoid introducing black crush
+ space->transfer = PL_COLOR_TRC_GAMMA22;
+ break;
+ }
+ }
+
+ // Infer the remaining fields after making the above choices
+ pl_color_space_infer(space);
+}
+
+void pl_color_space_infer_ref(struct pl_color_space *space,
+ const struct pl_color_space *refp)
+{
+ // Make a copy of `refp` to infer missing values first
+ struct pl_color_space ref = *refp;
+ infer_both_ref(space, &ref);
+}
+
+void pl_color_space_infer_map(struct pl_color_space *src,
+ struct pl_color_space *dst)
+{
+ bool unknown_src_contrast = !src->hdr.min_luma;
+ bool unknown_dst_contrast = !dst->hdr.min_luma;
+
+ infer_both_ref(dst, src);
+
+ // If the src has an unspecified gamma curve with dynamic black scaling,
+ // default it to match the dst colorspace contrast. This does not matter in
+ // most cases, but ensures that BT.1886 is tuned to the appropriate black
+ // point by default.
+ bool dynamic_src_contrast = pl_color_space_is_black_scaled(src) ||
+ src->transfer == PL_COLOR_TRC_BT_1886;
+ if (unknown_src_contrast && dynamic_src_contrast)
+ src->hdr.min_luma = dst->hdr.min_luma;
+
+ // Do the same in reverse if both src and dst are SDR curves
+ bool src_is_sdr = !pl_color_space_is_hdr(src);
+ bool dst_is_sdr = !pl_color_space_is_hdr(dst);
+ if (unknown_dst_contrast && src_is_sdr && dst_is_sdr)
+ dst->hdr.min_luma = src->hdr.min_luma;
+
+ // If the src is HLG and the output is HDR, tune the HLG peak to the output
+ if (src->transfer == PL_COLOR_TRC_HLG && pl_color_space_is_hdr(dst))
+ src->hdr.max_luma = dst->hdr.max_luma;
+}
+
+const struct pl_color_adjustment pl_color_adjustment_neutral = {
+ PL_COLOR_ADJUSTMENT_NEUTRAL
+};
+
+void pl_chroma_location_offset(enum pl_chroma_location loc, float *x, float *y)
+{
+ *x = *y = 0;
+
+ // This is the majority of subsampled chroma content out there
+ loc = PL_DEF(loc, PL_CHROMA_LEFT);
+
+ switch (loc) {
+ case PL_CHROMA_LEFT:
+ case PL_CHROMA_TOP_LEFT:
+ case PL_CHROMA_BOTTOM_LEFT:
+ *x = -0.5;
+ break;
+ default: break;
+ }
+
+ switch (loc) {
+ case PL_CHROMA_TOP_LEFT:
+ case PL_CHROMA_TOP_CENTER:
+ *y = -0.5;
+ break;
+ default: break;
+ }
+
+ switch (loc) {
+ case PL_CHROMA_BOTTOM_LEFT:
+ case PL_CHROMA_BOTTOM_CENTER:
+ *y = 0.5;
+ break;
+ default: break;
+ }
+}
+
+struct pl_cie_xy pl_white_from_temp(float temp)
+{
+ temp = PL_CLAMP(temp, 2500, 25000);
+
+ double ti = 1000.0 / temp, ti2 = ti * ti, ti3 = ti2 * ti, x;
+ if (temp <= 7000) {
+ x = -4.6070 * ti3 + 2.9678 * ti2 + 0.09911 * ti + 0.244063;
+ } else {
+ x = -2.0064 * ti3 + 1.9018 * ti2 + 0.24748 * ti + 0.237040;
+ }
+
+ return (struct pl_cie_xy) {
+ .x = x,
+ .y = -3 * (x*x) + 2.87 * x - 0.275,
+ };
+}
+
+bool pl_raw_primaries_equal(const struct pl_raw_primaries *a,
+ const struct pl_raw_primaries *b)
+{
+ return pl_cie_xy_equal(&a->red, &b->red) &&
+ pl_cie_xy_equal(&a->green, &b->green) &&
+ pl_cie_xy_equal(&a->blue, &b->blue) &&
+ pl_cie_xy_equal(&a->white, &b->white);
+}
+
+bool pl_raw_primaries_similar(const struct pl_raw_primaries *a,
+ const struct pl_raw_primaries *b)
+{
+ float delta = fabsf(a->red.x - b->red.x) +
+ fabsf(a->red.y - b->red.y) +
+ fabsf(a->green.x - b->green.x) +
+ fabsf(a->green.y - b->green.y) +
+ fabsf(a->blue.x - b->blue.x) +
+ fabsf(a->blue.y - b->blue.y) +
+ fabsf(a->white.x - b->white.x) +
+ fabsf(a->white.y - b->white.y);
+
+ return delta < 0.001;
+}
+
+void pl_raw_primaries_merge(struct pl_raw_primaries *orig,
+ const struct pl_raw_primaries *update)
+{
+ union {
+ struct pl_raw_primaries prim;
+ float raw[8];
+ } *pa = (void *) orig,
+ *pb = (void *) update;
+
+ pl_static_assert(sizeof(*pa) == sizeof(*orig));
+ for (int i = 0; i < PL_ARRAY_SIZE(pa->raw); i++)
+ pa->raw[i] = PL_DEF(pa->raw[i], pb->raw[i]);
+}
+
+const struct pl_raw_primaries *pl_raw_primaries_get(enum pl_color_primaries prim)
+{
+ /*
+ Values from: ITU-R Recommendations BT.470-6, BT.601-7, BT.709-5, BT.2020-0
+
+ https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.470-6-199811-S!!PDF-E.pdf
+ https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.601-7-201103-I!!PDF-E.pdf
+ https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.709-5-200204-I!!PDF-E.pdf
+ https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2020-0-201208-I!!PDF-E.pdf
+
+ Other colorspaces from https://en.wikipedia.org/wiki/RGB_color_space#Specifications
+ */
+
+ // CIE standard illuminant series
+#define CIE_D50 {0.3457, 0.3585}
+#define CIE_D65 {0.3127, 0.3290}
+#define CIE_C {0.3100, 0.3160}
+#define CIE_E {1.0/3.0, 1.0/3.0}
+#define DCI {0.3140, 0.3510}
+
+ static const struct pl_raw_primaries primaries[] = {
+ [PL_COLOR_PRIM_BT_470M] = {
+ .red = {0.670, 0.330},
+ .green = {0.210, 0.710},
+ .blue = {0.140, 0.080},
+ .white = CIE_C,
+ },
+
+ [PL_COLOR_PRIM_BT_601_525] = {
+ .red = {0.630, 0.340},
+ .green = {0.310, 0.595},
+ .blue = {0.155, 0.070},
+ .white = CIE_D65,
+ },
+ [PL_COLOR_PRIM_BT_601_625] = {
+ .red = {0.640, 0.330},
+ .green = {0.290, 0.600},
+ .blue = {0.150, 0.060},
+ .white = CIE_D65,
+ },
+ [PL_COLOR_PRIM_BT_709] = {
+ .red = {0.640, 0.330},
+ .green = {0.300, 0.600},
+ .blue = {0.150, 0.060},
+ .white = CIE_D65,
+ },
+ [PL_COLOR_PRIM_BT_2020] = {
+ .red = {0.708, 0.292},
+ .green = {0.170, 0.797},
+ .blue = {0.131, 0.046},
+ .white = CIE_D65,
+ },
+ [PL_COLOR_PRIM_APPLE] = {
+ .red = {0.625, 0.340},
+ .green = {0.280, 0.595},
+ .blue = {0.115, 0.070},
+ .white = CIE_D65,
+ },
+ [PL_COLOR_PRIM_ADOBE] = {
+ .red = {0.640, 0.330},
+ .green = {0.210, 0.710},
+ .blue = {0.150, 0.060},
+ .white = CIE_D65,
+ },
+ [PL_COLOR_PRIM_PRO_PHOTO] = {
+ .red = {0.7347, 0.2653},
+ .green = {0.1596, 0.8404},
+ .blue = {0.0366, 0.0001},
+ .white = CIE_D50,
+ },
+ [PL_COLOR_PRIM_CIE_1931] = {
+ .red = {0.7347, 0.2653},
+ .green = {0.2738, 0.7174},
+ .blue = {0.1666, 0.0089},
+ .white = CIE_E,
+ },
+ // From SMPTE RP 431-2
+ [PL_COLOR_PRIM_DCI_P3] = {
+ .red = {0.680, 0.320},
+ .green = {0.265, 0.690},
+ .blue = {0.150, 0.060},
+ .white = DCI,
+ },
+ [PL_COLOR_PRIM_DISPLAY_P3] = {
+ .red = {0.680, 0.320},
+ .green = {0.265, 0.690},
+ .blue = {0.150, 0.060},
+ .white = CIE_D65,
+ },
+ // From Panasonic VARICAM reference manual
+ [PL_COLOR_PRIM_V_GAMUT] = {
+ .red = {0.730, 0.280},
+ .green = {0.165, 0.840},
+ .blue = {0.100, -0.03},
+ .white = CIE_D65,
+ },
+ // From Sony S-Log reference manual
+ [PL_COLOR_PRIM_S_GAMUT] = {
+ .red = {0.730, 0.280},
+ .green = {0.140, 0.855},
+ .blue = {0.100, -0.05},
+ .white = CIE_D65,
+ },
+ // From FFmpeg source code
+ [PL_COLOR_PRIM_FILM_C] = {
+ .red = {0.681, 0.319},
+ .green = {0.243, 0.692},
+ .blue = {0.145, 0.049},
+ .white = CIE_C,
+ },
+ [PL_COLOR_PRIM_EBU_3213] = {
+ .red = {0.630, 0.340},
+ .green = {0.295, 0.605},
+ .blue = {0.155, 0.077},
+ .white = CIE_D65,
+ },
+ // From Wikipedia
+ [PL_COLOR_PRIM_ACES_AP0] = {
+ .red = {0.7347, 0.2653},
+ .green = {0.0000, 1.0000},
+ .blue = {0.0001, -0.0770},
+ .white = {0.32168, 0.33767},
+ },
+ [PL_COLOR_PRIM_ACES_AP1] = {
+ .red = {0.713, 0.293},
+ .green = {0.165, 0.830},
+ .blue = {0.128, 0.044},
+ .white = {0.32168, 0.33767},
+ },
+ };
+
+ // This is the default assumption if no colorspace information could
+ // be determined, eg. for files which have no video channel.
+ if (!prim)
+ prim = PL_COLOR_PRIM_BT_709;
+
+ pl_assert(prim < PL_ARRAY_SIZE(primaries));
+ return &primaries[prim];
+}
+
+// Compute the RGB/XYZ matrix as described here:
+// http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
+pl_matrix3x3 pl_get_rgb2xyz_matrix(const struct pl_raw_primaries *prim)
+{
+ pl_matrix3x3 out = {{{0}}};
+ float S[3], X[4], Z[4];
+
+ X[0] = pl_cie_X(prim->red);
+ X[1] = pl_cie_X(prim->green);
+ X[2] = pl_cie_X(prim->blue);
+ X[3] = pl_cie_X(prim->white);
+
+ Z[0] = pl_cie_Z(prim->red);
+ Z[1] = pl_cie_Z(prim->green);
+ Z[2] = pl_cie_Z(prim->blue);
+ Z[3] = pl_cie_Z(prim->white);
+
+ // S = XYZ^-1 * W
+ for (int i = 0; i < 3; i++) {
+ out.m[0][i] = X[i];
+ out.m[1][i] = 1;
+ out.m[2][i] = Z[i];
+ }
+
+ pl_matrix3x3_invert(&out);
+
+ for (int i = 0; i < 3; i++)
+ S[i] = out.m[i][0] * X[3] + out.m[i][1] * 1 + out.m[i][2] * Z[3];
+
+ // M = [Sc * XYZc]
+ for (int i = 0; i < 3; i++) {
+ out.m[0][i] = S[i] * X[i];
+ out.m[1][i] = S[i] * 1;
+ out.m[2][i] = S[i] * Z[i];
+ }
+
+ return out;
+}
+
+pl_matrix3x3 pl_get_xyz2rgb_matrix(const struct pl_raw_primaries *prim)
+{
+ // For simplicity, just invert the rgb2xyz matrix
+ pl_matrix3x3 out = pl_get_rgb2xyz_matrix(prim);
+ pl_matrix3x3_invert(&out);
+ return out;
+}
+
+// LMS<-XYZ revised matrix from CIECAM97, based on a linear transform and
+// normalized for equal energy on monochrome inputs
+static const pl_matrix3x3 m_cat97 = {{
+ { 0.8562, 0.3372, -0.1934 },
+ { -0.8360, 1.8327, 0.0033 },
+ { 0.0357, -0.0469, 1.0112 },
+}};
+
+// M := M * XYZd<-XYZs
+static void apply_chromatic_adaptation(struct pl_cie_xy src,
+ struct pl_cie_xy dest,
+ pl_matrix3x3 *mat)
+{
+ // If the white points are nearly identical, this is a wasteful identity
+ // operation.
+ if (fabs(src.x - dest.x) < 1e-6 && fabs(src.y - dest.y) < 1e-6)
+ return;
+
+ // XYZd<-XYZs = Ma^-1 * (I*[Cd/Cs]) * Ma
+ // http://www.brucelindbloom.com/index.html?Eqn_ChromAdapt.html
+ // For Ma, we use the CIECAM97 revised (linear) matrix
+ float C[3][2];
+
+ for (int i = 0; i < 3; i++) {
+ // source cone
+ C[i][0] = m_cat97.m[i][0] * pl_cie_X(src)
+ + m_cat97.m[i][1] * 1
+ + m_cat97.m[i][2] * pl_cie_Z(src);
+
+ // dest cone
+ C[i][1] = m_cat97.m[i][0] * pl_cie_X(dest)
+ + m_cat97.m[i][1] * 1
+ + m_cat97.m[i][2] * pl_cie_Z(dest);
+ }
+
+ // tmp := I * [Cd/Cs] * Ma
+ pl_matrix3x3 tmp = {0};
+ for (int i = 0; i < 3; i++)
+ tmp.m[i][i] = C[i][1] / C[i][0];
+
+ pl_matrix3x3_mul(&tmp, &m_cat97);
+
+ // M := M * Ma^-1 * tmp
+ pl_matrix3x3 ma_inv = m_cat97;
+ pl_matrix3x3_invert(&ma_inv);
+ pl_matrix3x3_mul(mat, &ma_inv);
+ pl_matrix3x3_mul(mat, &tmp);
+}
+
+pl_matrix3x3 pl_get_adaptation_matrix(struct pl_cie_xy src, struct pl_cie_xy dst)
+{
+ // Use BT.709 primaries (with chosen white point) as an XYZ reference
+ struct pl_raw_primaries csp = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709);
+ csp.white = src;
+
+ pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(&csp);
+ pl_matrix3x3 xyz2rgb = rgb2xyz;
+ pl_matrix3x3_invert(&xyz2rgb);
+
+ apply_chromatic_adaptation(src, dst, &xyz2rgb);
+ pl_matrix3x3_mul(&xyz2rgb, &rgb2xyz);
+ return xyz2rgb;
+}
+
+pl_matrix3x3 pl_ipt_rgb2lms(const struct pl_raw_primaries *prim)
+{
+ static const pl_matrix3x3 hpe = {{ // HPE XYZ->LMS (D65) method
+ { 0.40024f, 0.70760f, -0.08081f },
+ { -0.22630f, 1.16532f, 0.04570f },
+ { 0.00000f, 0.00000f, 0.91822f },
+ }};
+
+ const float c = 0.04; // 4% crosstalk
+ pl_matrix3x3 m = {{
+ { 1 - 2*c, c, c },
+ { c, 1 - 2*c, c },
+ { c, c, 1 - 2*c },
+ }};
+
+ pl_matrix3x3_mul(&m, &hpe);
+
+ // Apply chromatic adaptation to D65 if the input white point differs
+ static const struct pl_cie_xy d65 = CIE_D65;
+ apply_chromatic_adaptation(prim->white, d65, &m);
+
+ const pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(prim);
+ pl_matrix3x3_mul(&m, &rgb2xyz);
+ return m;
+}
+
+pl_matrix3x3 pl_ipt_lms2rgb(const struct pl_raw_primaries *prim)
+{
+ pl_matrix3x3 m = pl_ipt_rgb2lms(prim);
+ pl_matrix3x3_invert(&m);
+ return m;
+}
+
+// As standardized in Ebner & Fairchild IPT (1998)
+const pl_matrix3x3 pl_ipt_lms2ipt = {{
+ { 0.4000, 0.4000, 0.2000 },
+ { 4.4550, -4.8510, 0.3960 },
+ { 0.8056, 0.3572, -1.1628 },
+}};
+
+// Numerically inverted from the matrix above
+const pl_matrix3x3 pl_ipt_ipt2lms = {{
+ { 1.0, 0.0975689, 0.205226 },
+ { 1.0, -0.1138760, 0.133217 },
+ { 1.0, 0.0326151, -0.676887 },
+}};
+
+const struct pl_cone_params pl_vision_normal = {PL_CONE_NONE, 1.0};
+const struct pl_cone_params pl_vision_protanomaly = {PL_CONE_L, 0.5};
+const struct pl_cone_params pl_vision_protanopia = {PL_CONE_L, 0.0};
+const struct pl_cone_params pl_vision_deuteranomaly = {PL_CONE_M, 0.5};
+const struct pl_cone_params pl_vision_deuteranopia = {PL_CONE_M, 0.0};
+const struct pl_cone_params pl_vision_tritanomaly = {PL_CONE_S, 0.5};
+const struct pl_cone_params pl_vision_tritanopia = {PL_CONE_S, 0.0};
+const struct pl_cone_params pl_vision_monochromacy = {PL_CONE_LM, 0.0};
+const struct pl_cone_params pl_vision_achromatopsia = {PL_CONE_LMS, 0.0};
+
+pl_matrix3x3 pl_get_cone_matrix(const struct pl_cone_params *params,
+ const struct pl_raw_primaries *prim)
+{
+ // LMS<-RGB := LMS<-XYZ * XYZ<-RGB
+ pl_matrix3x3 rgb2lms = m_cat97;
+ pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(prim);
+ pl_matrix3x3_mul(&rgb2lms, &rgb2xyz);
+
+ // LMS versions of the two opposing primaries, plus neutral
+ float lms_r[3] = {1.0, 0.0, 0.0},
+ lms_b[3] = {0.0, 0.0, 1.0},
+ lms_w[3] = {1.0, 1.0, 1.0};
+
+ pl_matrix3x3_apply(&rgb2lms, lms_r);
+ pl_matrix3x3_apply(&rgb2lms, lms_b);
+ pl_matrix3x3_apply(&rgb2lms, lms_w);
+
+ float a, b, c = params->strength;
+ pl_matrix3x3 distort;
+
+ switch (params->cones) {
+ case PL_CONE_NONE:
+ return pl_matrix3x3_identity;
+
+ case PL_CONE_L:
+ // Solve to preserve neutral and blue
+ a = (lms_b[0] - lms_b[2] * lms_w[0] / lms_w[2]) /
+ (lms_b[1] - lms_b[2] * lms_w[1] / lms_w[2]);
+ b = (lms_b[0] - lms_b[1] * lms_w[0] / lms_w[1]) /
+ (lms_b[2] - lms_b[1] * lms_w[2] / lms_w[1]);
+ assert(fabs(a * lms_w[1] + b * lms_w[2] - lms_w[0]) < 1e-6);
+
+ distort = (pl_matrix3x3) {{
+ { c, (1.0 - c) * a, (1.0 - c) * b},
+ { 0.0, 1.0, 0.0},
+ { 0.0, 0.0, 1.0},
+ }};
+ break;
+
+ case PL_CONE_M:
+ // Solve to preserve neutral and blue
+ a = (lms_b[1] - lms_b[2] * lms_w[1] / lms_w[2]) /
+ (lms_b[0] - lms_b[2] * lms_w[0] / lms_w[2]);
+ b = (lms_b[1] - lms_b[0] * lms_w[1] / lms_w[0]) /
+ (lms_b[2] - lms_b[0] * lms_w[2] / lms_w[0]);
+ assert(fabs(a * lms_w[0] + b * lms_w[2] - lms_w[1]) < 1e-6);
+
+ distort = (pl_matrix3x3) {{
+ { 1.0, 0.0, 0.0},
+ {(1.0 - c) * a, c, (1.0 - c) * b},
+ { 0.0, 0.0, 1.0},
+ }};
+ break;
+
+ case PL_CONE_S:
+ // Solve to preserve neutral and red
+ a = (lms_r[2] - lms_r[1] * lms_w[2] / lms_w[1]) /
+ (lms_r[0] - lms_r[1] * lms_w[0] / lms_w[1]);
+ b = (lms_r[2] - lms_r[0] * lms_w[2] / lms_w[0]) /
+ (lms_r[1] - lms_r[0] * lms_w[1] / lms_w[0]);
+ assert(fabs(a * lms_w[0] + b * lms_w[1] - lms_w[2]) < 1e-6);
+
+ distort = (pl_matrix3x3) {{
+ { 1.0, 0.0, 0.0},
+ { 0.0, 1.0, 0.0},
+ {(1.0 - c) * a, (1.0 - c) * b, c},
+ }};
+ break;
+
+ case PL_CONE_LM:
+ // Solve to preserve neutral
+ a = lms_w[0] / lms_w[2];
+ b = lms_w[1] / lms_w[2];
+
+ distort = (pl_matrix3x3) {{
+ { c, 0.0, (1.0 - c) * a},
+ { 0.0, c, (1.0 - c) * b},
+ { 0.0, 0.0, 1.0},
+ }};
+ break;
+
+ case PL_CONE_MS:
+ // Solve to preserve neutral
+ a = lms_w[1] / lms_w[0];
+ b = lms_w[2] / lms_w[0];
+
+ distort = (pl_matrix3x3) {{
+ { 1.0, 0.0, 0.0},
+ {(1.0 - c) * a, c, 0.0},
+ {(1.0 - c) * b, 0.0, c},
+ }};
+ break;
+
+ case PL_CONE_LS:
+ // Solve to preserve neutral
+ a = lms_w[0] / lms_w[1];
+ b = lms_w[2] / lms_w[1];
+
+ distort = (pl_matrix3x3) {{
+ { c, (1.0 - c) * a, 0.0},
+ { 0.0, 1.0, 0.0},
+ { 0.0, (1.0 - c) * b, c},
+ }};
+ break;
+
+ case PL_CONE_LMS: {
+ // Rod cells only, which can be modelled somewhat as a combination of
+        // L and M cones. Either way, this is pushing the limits of our
+ // color model, so this is only a rough approximation.
+ const float w[3] = {0.3605, 0.6415, -0.002};
+ assert(fabs(w[0] + w[1] + w[2] - 1.0) < 1e-6);
+
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ distort.m[i][j] = (1.0 - c) * w[j] * lms_w[i] / lms_w[j];
+ if (i == j)
+ distort.m[i][j] += c;
+ }
+ }
+ break;
+ }
+
+ default:
+ pl_unreachable();
+ }
+
+ // out := RGB<-LMS * distort * LMS<-RGB
+ pl_matrix3x3 out = rgb2lms;
+ pl_matrix3x3_invert(&out);
+ pl_matrix3x3_mul(&out, &distort);
+ pl_matrix3x3_mul(&out, &rgb2lms);
+
+ return out;
+}
+
+pl_matrix3x3 pl_get_color_mapping_matrix(const struct pl_raw_primaries *src,
+ const struct pl_raw_primaries *dst,
+ enum pl_rendering_intent intent)
+{
+ // In saturation mapping, we don't care about accuracy and just want
+ // primaries to map to primaries, making this an identity transformation.
+ if (intent == PL_INTENT_SATURATION)
+ return pl_matrix3x3_identity;
+
+ // RGBd<-RGBs = RGBd<-XYZd * XYZd<-XYZs * XYZs<-RGBs
+ // Equations from: http://www.brucelindbloom.com/index.html?Math.html
+ // Note: Perceptual is treated like relative colorimetric. There's no
+ // definition for perceptual other than "make it look good".
+
+ // RGBd<-XYZd matrix
+ pl_matrix3x3 xyz2rgb_d = pl_get_xyz2rgb_matrix(dst);
+
+ // Chromatic adaptation, except in absolute colorimetric intent
+ if (intent != PL_INTENT_ABSOLUTE_COLORIMETRIC)
+ apply_chromatic_adaptation(src->white, dst->white, &xyz2rgb_d);
+
+ // XYZs<-RGBs
+ pl_matrix3x3 rgb2xyz_s = pl_get_rgb2xyz_matrix(src);
+ pl_matrix3x3_mul(&xyz2rgb_d, &rgb2xyz_s);
+ return xyz2rgb_d;
+}
+
+// Test the sign of 'p' relative to the line 'ab' (barycentric coordinates)
+static float test_point_line(const struct pl_cie_xy p,
+ const struct pl_cie_xy a,
+ const struct pl_cie_xy b)
+{
+ return (p.x - b.x) * (a.y - b.y) - (a.x - b.x) * (p.y - b.y);
+}
+
+// Test if a point is entirely inside a gamut
+static float test_point_gamut(struct pl_cie_xy point,
+ const struct pl_raw_primaries *prim)
+{
+ float d1 = test_point_line(point, prim->red, prim->green),
+ d2 = test_point_line(point, prim->green, prim->blue),
+ d3 = test_point_line(point, prim->blue, prim->red);
+
+ bool has_neg = d1 < -1e-6f || d2 < -1e-6f || d3 < -1e-6f,
+ has_pos = d1 > 1e-6f || d2 > 1e-6f || d3 > 1e-6f;
+
+ return !(has_neg && has_pos);
+}
+
+bool pl_primaries_superset(const struct pl_raw_primaries *a,
+ const struct pl_raw_primaries *b)
+{
+ return test_point_gamut(b->red, a) &&
+ test_point_gamut(b->green, a) &&
+ test_point_gamut(b->blue, a);
+}
+
+bool pl_primaries_valid(const struct pl_raw_primaries *prim)
+{
+ // Test to see if the primaries form a valid triangle (nonzero area)
+ float area = (prim->blue.x - prim->green.x) * (prim->red.y - prim->green.y)
+ - (prim->red.x - prim->green.x) * (prim->blue.y - prim->green.y);
+
+ return fabs(area) > 1e-6 && test_point_gamut(prim->white, prim);
+}
+
+static inline float xy_dist2(struct pl_cie_xy a, struct pl_cie_xy b)
+{
+ const float dx = a.x - b.x, dy = a.y - b.y;
+ return dx * dx + dy * dy;
+}
+
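+// Heuristic check: the primaries are considered compatible when each matched
+// pair (red/red, green/green, blue/blue) lies closer together than the
+// mismatched pairs it is compared against, i.e. the primaries have not been
+// permuted or wildly displaced relative to each other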
+bool pl_primaries_compatible(const struct pl_raw_primaries *a,
+ const struct pl_raw_primaries *b)
+{
+ float RR = xy_dist2(a->red, b->red), RG = xy_dist2(a->red, b->green),
+ RB = xy_dist2(a->red, b->blue), GG = xy_dist2(a->green, b->green),
+ GB = xy_dist2(a->green, b->blue), BB = xy_dist2(a->blue, b->blue);
+ return RR < RG && RR < RB && GG < RG && GG < GB && BB < RB && BB < GB;
+}
+
+// returns the intersection of the two lines defined by ab and cd
+static struct pl_cie_xy intersection(struct pl_cie_xy a, struct pl_cie_xy b,
+ struct pl_cie_xy c, struct pl_cie_xy d)
+{
+ float det = (a.x - b.x) * (c.y - d.y) - (a.y - b.y) * (c.x - d.x);
+ float t = ((a.x - c.x) * (c.y - d.y) - (a.y - c.y) * (c.x - d.x)) / det;
+ return (struct pl_cie_xy) {
+ .x = t ? a.x + t * (b.x - a.x) : 0.0f,
+ .y = t ? a.y + t * (b.y - a.y) : 0.0f,
+ };
+}
+
+// x, y, z specified in clockwise order, with a, b, c being the enclosing gamut
+static struct pl_cie_xy
+clip_point(struct pl_cie_xy x, struct pl_cie_xy y, struct pl_cie_xy z,
+ struct pl_cie_xy a, struct pl_cie_xy b, struct pl_cie_xy c)
+{
+ const float d1 = test_point_line(y, a, b);
+ const float d2 = test_point_line(y, b, c);
+ if (d1 <= 0.0f && d2 <= 0.0f) {
+ return y; // already inside triangle
+ } else if (d1 > 0.0f && d2 > 0.0f) {
+ return b; // target vertex fully enclosed
+ } else if (d1 > 0.0f) {
+ return intersection(a, b, y, z);
+ } else {
+ return intersection(x, y, b, c);
+ }
+}
+
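+// Clips each source primary against the destination gamut. The vertices are
+// passed to clip_point in rotated order, so that the primary being clipped is
+// always the middle argument and the matching destination vertex is 'b'.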
+struct pl_raw_primaries pl_primaries_clip(const struct pl_raw_primaries *src,
+ const struct pl_raw_primaries *dst)
+{
+ return (struct pl_raw_primaries) {
+ .red = clip_point(src->green, src->red, src->blue,
+ dst->green, dst->red, dst->blue),
+ .green = clip_point(src->blue, src->green, src->red,
+ dst->blue, dst->green, dst->red),
+ .blue = clip_point(src->red, src->blue, src->green,
+ dst->red, dst->blue, dst->green),
+ .white = src->white,
+ };
+}
+
+/* Fill in the Y, U, V vectors of a yuv-to-rgb conversion matrix
+ * based on the given luma weights of the R, G and B components (lr, lg, lb).
+ * lr+lg+lb is assumed to equal 1.
+ * This function is meant for colorspaces satisfying the following
+ * conditions (which are true for common YUV colorspaces):
+ * - The mapping from input [Y, U, V] to output [R, G, B] is linear.
+ * - Y is the vector [1, 1, 1]. (meaning input Y component maps to 1R+1G+1B)
+ * - U maps to a value with zero R and positive B ([0, x, y], y > 0;
+ * i.e. blue and green only).
+ * - V maps to a value with zero B and positive R ([x, y, 0], x > 0;
+ * i.e. red and green only).
+ * - U and V are orthogonal to the luma vector [lr, lg, lb].
+ * - The magnitudes of the vectors U and V are the minimal ones for which
+ * the image of the set Y=[0...1],U=[-0.5...0.5],V=[-0.5...0.5] under the
+ * conversion function will cover the set R=[0...1],G=[0...1],B=[0...1]
+ * (the resulting matrix can be converted for other input/output ranges
+ * outside this function).
+ * Under these conditions the given parameters lr, lg, lb uniquely
+ * determine the mapping of Y, U, V to R, G, B.
+ */
+static pl_matrix3x3 luma_coeffs(float lr, float lg, float lb)
+{
+ pl_assert(fabs(lr+lg+lb - 1) < 1e-6);
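+    // The U -> B coefficient 2*(1-lb) and V -> R coefficient 2*(1-lr) are the
+    // minimal magnitudes that let U and V (each spanning [-0.5, 0.5]) cover
+    // the full RGB range; the green-row entries then follow from U and V
+    // being orthogonal to the luma vector [lr, lg, lb].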
+ return (pl_matrix3x3) {{
+ {1, 0, 2 * (1-lr) },
+ {1, -2 * (1-lb) * lb/lg, -2 * (1-lr) * lr/lg },
+ {1, 2 * (1-lb), 0 },
+ }};
+}
+
+// Applies hue and saturation controls to a YCbCr->RGB matrix
+static inline void apply_hue_sat(pl_matrix3x3 *m,
+ const struct pl_color_adjustment *params)
+{
+ // Hue is equivalent to rotating input [U, V] subvector around the origin.
+ // Saturation scales [U, V].
+ float huecos = params->saturation * cos(params->hue);
+ float huesin = params->saturation * sin(params->hue);
+ for (int i = 0; i < 3; i++) {
+ float u = m->m[i][1], v = m->m[i][2];
+ m->m[i][1] = huecos * u - huesin * v;
+ m->m[i][2] = huesin * u + huecos * v;
+ }
+}
+
+pl_transform3x3 pl_color_repr_decode(struct pl_color_repr *repr,
+ const struct pl_color_adjustment *params)
+{
+ params = PL_DEF(params, &pl_color_adjustment_neutral);
+
+ pl_matrix3x3 m;
+ switch (repr->sys) {
+ case PL_COLOR_SYSTEM_BT_709: m = luma_coeffs(0.2126, 0.7152, 0.0722); break;
+ case PL_COLOR_SYSTEM_BT_601: m = luma_coeffs(0.2990, 0.5870, 0.1140); break;
+ case PL_COLOR_SYSTEM_SMPTE_240M: m = luma_coeffs(0.2122, 0.7013, 0.0865); break;
+ case PL_COLOR_SYSTEM_BT_2020_NC: m = luma_coeffs(0.2627, 0.6780, 0.0593); break;
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ // Note: This outputs into the [-0.5,0.5] range for chroma information.
+ m = (pl_matrix3x3) {{
+ {0, 0, 1},
+ {1, 0, 0},
+ {0, 1, 0},
+ }};
+ break;
+ case PL_COLOR_SYSTEM_BT_2100_PQ: {
+ // Reversed from the matrix in the spec, hard-coded for efficiency
+ // and precision reasons. Exact values truncated from ITU-T H-series
+ // Supplement 18.
+ static const float lm_t = 0.008609, lm_p = 0.111029625;
+ m = (pl_matrix3x3) {{
+ {1.0, lm_t, lm_p},
+ {1.0, -lm_t, -lm_p},
+ {1.0, 0.560031, -0.320627},
+ }};
+ break;
+ }
+ case PL_COLOR_SYSTEM_BT_2100_HLG: {
+ // Similar to BT.2100 PQ, exact values truncated from WolframAlpha
+ static const float lm_t = 0.01571858011, lm_p = 0.2095810681;
+ m = (pl_matrix3x3) {{
+ {1.0, lm_t, lm_p},
+ {1.0, -lm_t, -lm_p},
+ {1.0, 1.02127108, -0.605274491},
+ }};
+ break;
+ }
+ case PL_COLOR_SYSTEM_DOLBYVISION:
+ m = repr->dovi->nonlinear;
+ break;
+ case PL_COLOR_SYSTEM_YCGCO:
+ m = (pl_matrix3x3) {{
+ {1, -1, 1},
+ {1, 1, 0},
+ {1, -1, -1},
+ }};
+ break;
+ case PL_COLOR_SYSTEM_UNKNOWN: // fall through
+ case PL_COLOR_SYSTEM_RGB:
+ m = pl_matrix3x3_identity;
+ break;
+ case PL_COLOR_SYSTEM_XYZ: {
+        // For lack of anything saner to do, just assume the caller wants
+        // DCI-P3 primaries, which is a reasonable default.
+ const struct pl_raw_primaries *dst = pl_raw_primaries_get(PL_COLOR_PRIM_DCI_P3);
+ m = pl_get_xyz2rgb_matrix(dst);
+        // DCDM X'Y'Z' is expected to have an equal-energy white point (EG 432-1 Annex H)
+ apply_chromatic_adaptation((struct pl_cie_xy)CIE_E, dst->white, &m);
+ break;
+ }
+ case PL_COLOR_SYSTEM_COUNT:
+ pl_unreachable();
+ }
+
+ // Apply hue and saturation in the correct way depending on the colorspace.
+ if (pl_color_system_is_ycbcr_like(repr->sys)) {
+ apply_hue_sat(&m, params);
+ } else if (params->saturation != 1.0 || params->hue != 0.0) {
+ // Arbitrarily simulate hue shifts using the BT.709 YCbCr model
+ pl_matrix3x3 yuv2rgb = luma_coeffs(0.2126, 0.7152, 0.0722);
+ pl_matrix3x3 rgb2yuv = yuv2rgb;
+ pl_matrix3x3_invert(&rgb2yuv);
+ apply_hue_sat(&yuv2rgb, params);
+ // M := RGB<-YUV * YUV<-RGB * M
+ pl_matrix3x3_rmul(&rgb2yuv, &m);
+ pl_matrix3x3_rmul(&yuv2rgb, &m);
+ }
+
+ // Apply color temperature adaptation, relative to BT.709 primaries
+ if (params->temperature) {
+ struct pl_cie_xy src = pl_white_from_temp(6500);
+ struct pl_cie_xy dst = pl_white_from_temp(6500 + 3500 * params->temperature);
+ pl_matrix3x3 adapt = pl_get_adaptation_matrix(src, dst);
+ pl_matrix3x3_rmul(&adapt, &m);
+ }
+
+ pl_transform3x3 out = { .mat = m };
+ int bit_depth = PL_DEF(repr->bits.sample_depth,
+ PL_DEF(repr->bits.color_depth, 8));
+
+ double ymax, ymin, cmax, cmid;
+ double scale = (1LL << bit_depth) / ((1LL << bit_depth) - 1.0);
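+    // The constants below are expressed as fractions of 2^bits; multiplying
+    // by 'scale' renormalizes them to fractions of (2^bits - 1), i.e. the
+    // usual UNORM texture normalization (e.g. 8-bit limited range yields
+    // ymin = 16/255 and ymax = 235/255)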
+
+ switch (pl_color_levels_guess(repr)) {
+ case PL_COLOR_LEVELS_LIMITED: {
+ ymax = 235 / 256. * scale;
+ ymin = 16 / 256. * scale;
+ cmax = 240 / 256. * scale;
+ cmid = 128 / 256. * scale;
+ break;
+ }
+ case PL_COLOR_LEVELS_FULL:
+ // Note: For full-range YUV, there are multiple, subtly inconsistent
+ // standards. So just pick the sanest implementation, which is to
+ // assume MAX_INT == 1.0.
+ ymax = 1.0;
+ ymin = 0.0;
+ cmax = 1.0;
+ cmid = 128 / 256. * scale; // *not* exactly 0.5
+ break;
+ default:
+ pl_unreachable();
+ }
+
+ double ymul = 1.0 / (ymax - ymin);
+ double cmul = 0.5 / (cmax - cmid);
+
+ double mul[3] = { ymul, ymul, ymul };
+ double black[3] = { ymin, ymin, ymin };
+
+#ifdef PL_HAVE_DOVI
+ if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION) {
+ // The RPU matrix already includes levels normalization, but in this
+ // case we also have to respect the signalled color offsets
+ for (int i = 0; i < 3; i++) {
+ mul[i] = 1.0;
+ black[i] = repr->dovi->nonlinear_offset[i] * scale;
+ }
+ } else
+#endif
+ if (pl_color_system_is_ycbcr_like(repr->sys)) {
+ mul[1] = mul[2] = cmul;
+ black[1] = black[2] = cmid;
+ }
+
+    // Contrast scales the output value range (gain)
+    // Brightness adds a constant output bias (black lift/boost)
+ for (int i = 0; i < 3; i++) {
+ mul[i] *= params->contrast;
+ out.c[i] += params->brightness;
+ }
+
+ // Multiply in the texture multiplier and adjust `c` so that black[j] keeps
+ // on mapping to RGB=0 (black to black)
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ out.mat.m[i][j] *= mul[j];
+ out.c[i] -= out.mat.m[i][j] * black[j];
+ }
+ }
+
+ // Finally, multiply in the scaling factor required to get the color up to
+ // the correct representation.
+ pl_matrix3x3_scale(&out.mat, pl_color_repr_normalize(repr));
+
+ // Update the metadata to reflect the change.
+ repr->sys = PL_COLOR_SYSTEM_RGB;
+ repr->levels = PL_COLOR_LEVELS_FULL;
+
+ return out;
+}
+
+bool pl_icc_profile_equal(const struct pl_icc_profile *p1,
+ const struct pl_icc_profile *p2)
+{
+ if (p1->len != p2->len)
+ return false;
+
+ // Ignore signatures on length-0 profiles, as a special case
+ return !p1->len || p1->signature == p2->signature;
+}
+
+void pl_icc_profile_compute_signature(struct pl_icc_profile *profile)
+{
+    if (!profile->len) {
+        profile->signature = 0;
+        return;
+    }
+
+ // In theory, we could get this value from the profile header itself if
+ // lcms is available, but I'm not sure if it's even worth the trouble. Just
+ // hard-code this to a pl_mem_hash(), which is decently fast anyway.
+ profile->signature = pl_mem_hash(profile->data, profile->len);
+}
diff --git a/src/common.c b/src/common.c
new file mode 100644
index 0000000..8c8a4f0
--- /dev/null
+++ b/src/common.c
@@ -0,0 +1,500 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "common.h"
+#include "version.h"
+
+#include <libplacebo/common.h>
+
+int pl_fix_ver(void)
+{
+ return BUILD_FIX_VER;
+}
+
+const char *pl_version(void)
+{
+ return BUILD_VERSION;
+}
+
+void pl_rect2d_normalize(pl_rect2d *rc)
+{
+ *rc = (pl_rect2d) {
+ .x0 = PL_MIN(rc->x0, rc->x1),
+ .x1 = PL_MAX(rc->x0, rc->x1),
+ .y0 = PL_MIN(rc->y0, rc->y1),
+ .y1 = PL_MAX(rc->y0, rc->y1),
+ };
+}
+
+void pl_rect3d_normalize(pl_rect3d *rc)
+{
+ *rc = (pl_rect3d) {
+ .x0 = PL_MIN(rc->x0, rc->x1),
+ .x1 = PL_MAX(rc->x0, rc->x1),
+ .y0 = PL_MIN(rc->y0, rc->y1),
+ .y1 = PL_MAX(rc->y0, rc->y1),
+ .z0 = PL_MIN(rc->z0, rc->z1),
+ .z1 = PL_MAX(rc->z0, rc->z1),
+ };
+}
+
+void pl_rect2df_normalize(pl_rect2df *rc)
+{
+ *rc = (pl_rect2df) {
+ .x0 = PL_MIN(rc->x0, rc->x1),
+ .x1 = PL_MAX(rc->x0, rc->x1),
+ .y0 = PL_MIN(rc->y0, rc->y1),
+ .y1 = PL_MAX(rc->y0, rc->y1),
+ };
+}
+
+void pl_rect3df_normalize(pl_rect3df *rc)
+{
+ *rc = (pl_rect3df) {
+ .x0 = PL_MIN(rc->x0, rc->x1),
+ .x1 = PL_MAX(rc->x0, rc->x1),
+ .y0 = PL_MIN(rc->y0, rc->y1),
+ .y1 = PL_MAX(rc->y0, rc->y1),
+ .z0 = PL_MIN(rc->z0, rc->z1),
+ .z1 = PL_MAX(rc->z0, rc->z1),
+ };
+}
+
+pl_rect2d pl_rect2df_round(const pl_rect2df *rc)
+{
+ return (pl_rect2d) {
+ .x0 = roundf(rc->x0),
+ .x1 = roundf(rc->x1),
+ .y0 = roundf(rc->y0),
+ .y1 = roundf(rc->y1),
+ };
+}
+
+pl_rect3d pl_rect3df_round(const pl_rect3df *rc)
+{
+ return (pl_rect3d) {
+ .x0 = roundf(rc->x0),
+ .x1 = roundf(rc->x1),
+ .y0 = roundf(rc->y0),
+ .y1 = roundf(rc->y1),
+ .z0 = roundf(rc->z0),
+ .z1 = roundf(rc->z1),
+ };
+}
+
+const pl_matrix3x3 pl_matrix3x3_identity = {{
+ { 1, 0, 0 },
+ { 0, 1, 0 },
+ { 0, 0, 1 },
+}};
+
+void pl_matrix3x3_apply(const pl_matrix3x3 *mat, float vec[3])
+{
+ float x = vec[0], y = vec[1], z = vec[2];
+
+ for (int i = 0; i < 3; i++)
+ vec[i] = mat->m[i][0] * x + mat->m[i][1] * y + mat->m[i][2] * z;
+}
+
+void pl_matrix3x3_apply_rc(const pl_matrix3x3 *mat, pl_rect3df *rc)
+{
+ float x0 = rc->x0, x1 = rc->x1,
+ y0 = rc->y0, y1 = rc->y1,
+ z0 = rc->z0, z1 = rc->z1;
+
+ rc->x0 = mat->m[0][0] * x0 + mat->m[0][1] * y0 + mat->m[0][2] * z0;
+ rc->y0 = mat->m[1][0] * x0 + mat->m[1][1] * y0 + mat->m[1][2] * z0;
+ rc->z0 = mat->m[2][0] * x0 + mat->m[2][1] * y0 + mat->m[2][2] * z0;
+
+ rc->x1 = mat->m[0][0] * x1 + mat->m[0][1] * y1 + mat->m[0][2] * z1;
+ rc->y1 = mat->m[1][0] * x1 + mat->m[1][1] * y1 + mat->m[1][2] * z1;
+ rc->z1 = mat->m[2][0] * x1 + mat->m[2][1] * y1 + mat->m[2][2] * z1;
+}
+
+void pl_matrix3x3_scale(pl_matrix3x3 *mat, float scale)
+{
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++)
+ mat->m[i][j] *= scale;
+ }
+}
+
+void pl_matrix3x3_invert(pl_matrix3x3 *mat)
+{
+ double m00 = mat->m[0][0], m01 = mat->m[0][1], m02 = mat->m[0][2],
+ m10 = mat->m[1][0], m11 = mat->m[1][1], m12 = mat->m[1][2],
+ m20 = mat->m[2][0], m21 = mat->m[2][1], m22 = mat->m[2][2];
+
+ // calculate the adjoint
+ double a00 = (m11 * m22 - m21 * m12);
+ double a01 = -(m01 * m22 - m21 * m02);
+ double a02 = (m01 * m12 - m11 * m02);
+ double a10 = -(m10 * m22 - m20 * m12);
+ double a11 = (m00 * m22 - m20 * m02);
+ double a12 = -(m00 * m12 - m10 * m02);
+ double a20 = (m10 * m21 - m20 * m11);
+ double a21 = -(m00 * m21 - m20 * m01);
+ double a22 = (m00 * m11 - m10 * m01);
+
+ // calculate the determinant (as inverse == 1/det * adjoint,
+ // adjoint * m == identity * det, so this calculates the det)
+ double det = m00 * a00 + m10 * a01 + m20 * a02;
+ det = 1.0 / det;
+
+ mat->m[0][0] = det * a00;
+ mat->m[0][1] = det * a01;
+ mat->m[0][2] = det * a02;
+ mat->m[1][0] = det * a10;
+ mat->m[1][1] = det * a11;
+ mat->m[1][2] = det * a12;
+ mat->m[2][0] = det * a20;
+ mat->m[2][1] = det * a21;
+ mat->m[2][2] = det * a22;
+}
+
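+// Computes a := a * b; when transforming vectors, 'b' is applied first.
+// (pl_matrix3x3_rmul below instead computes b := a * b.)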
+void pl_matrix3x3_mul(pl_matrix3x3 *a, const pl_matrix3x3 *b)
+{
+ float a00 = a->m[0][0], a01 = a->m[0][1], a02 = a->m[0][2],
+ a10 = a->m[1][0], a11 = a->m[1][1], a12 = a->m[1][2],
+ a20 = a->m[2][0], a21 = a->m[2][1], a22 = a->m[2][2];
+
+ for (int i = 0; i < 3; i++) {
+ a->m[0][i] = a00 * b->m[0][i] + a01 * b->m[1][i] + a02 * b->m[2][i];
+ a->m[1][i] = a10 * b->m[0][i] + a11 * b->m[1][i] + a12 * b->m[2][i];
+ a->m[2][i] = a20 * b->m[0][i] + a21 * b->m[1][i] + a22 * b->m[2][i];
+ }
+}
+
+void pl_matrix3x3_rmul(const pl_matrix3x3 *a, pl_matrix3x3 *b)
+{
+ pl_matrix3x3 m = *a;
+ pl_matrix3x3_mul(&m, b);
+ *b = m;
+}
+
+const pl_transform3x3 pl_transform3x3_identity = {
+ .mat = {{
+ { 1, 0, 0 },
+ { 0, 1, 0 },
+ { 0, 0, 1 },
+ }},
+};
+
+void pl_transform3x3_apply(const pl_transform3x3 *t, float vec[3])
+{
+ pl_matrix3x3_apply(&t->mat, vec);
+
+ for (int i = 0; i < 3; i++)
+ vec[i] += t->c[i];
+}
+
+void pl_transform3x3_apply_rc(const pl_transform3x3 *t, pl_rect3df *rc)
+{
+ pl_matrix3x3_apply_rc(&t->mat, rc);
+
+ rc->x0 += t->c[0];
+ rc->x1 += t->c[0];
+ rc->y0 += t->c[1];
+ rc->y1 += t->c[1];
+ rc->z0 += t->c[2];
+ rc->z1 += t->c[2];
+}
+
+void pl_transform3x3_scale(pl_transform3x3 *t, float scale)
+{
+ pl_matrix3x3_scale(&t->mat, scale);
+
+ for (int i = 0; i < 3; i++)
+ t->c[i] *= scale;
+}
+
+// based on DarkPlaces engine (relicensed from GPL to LGPL)
+void pl_transform3x3_invert(pl_transform3x3 *t)
+{
+ pl_matrix3x3_invert(&t->mat);
+
+ float m00 = t->mat.m[0][0], m01 = t->mat.m[0][1], m02 = t->mat.m[0][2],
+ m10 = t->mat.m[1][0], m11 = t->mat.m[1][1], m12 = t->mat.m[1][2],
+ m20 = t->mat.m[2][0], m21 = t->mat.m[2][1], m22 = t->mat.m[2][2];
+
+ // fix the constant coefficient
+ // rgb = M * yuv + C
+ // M^-1 * rgb = yuv + M^-1 * C
+ // yuv = M^-1 * rgb - M^-1 * C
+ // ^^^^^^^^^^
+ float c0 = t->c[0], c1 = t->c[1], c2 = t->c[2];
+ t->c[0] = -(m00 * c0 + m01 * c1 + m02 * c2);
+ t->c[1] = -(m10 * c0 + m11 * c1 + m12 * c2);
+ t->c[2] = -(m20 * c0 + m21 * c1 + m22 * c2);
+}
+
+const pl_matrix2x2 pl_matrix2x2_identity = {{
+ { 1, 0 },
+ { 0, 1 },
+}};
+
+pl_matrix2x2 pl_matrix2x2_rotation(float a)
+{
+ return (pl_matrix2x2) {{
+ { cosf(a), -sinf(a) },
+ { sinf(a), cosf(a) },
+ }};
+}
+
+void pl_matrix2x2_apply(const pl_matrix2x2 *mat, float vec[2])
+{
+ float x = vec[0], y = vec[1];
+
+ for (int i = 0; i < 2; i++)
+ vec[i] = mat->m[i][0] * x + mat->m[i][1] * y;
+}
+
+void pl_matrix2x2_apply_rc(const pl_matrix2x2 *mat, pl_rect2df *rc)
+{
+ float x0 = rc->x0, x1 = rc->x1,
+ y0 = rc->y0, y1 = rc->y1;
+
+ rc->x0 = mat->m[0][0] * x0 + mat->m[0][1] * y0;
+ rc->y0 = mat->m[1][0] * x0 + mat->m[1][1] * y0;
+
+ rc->x1 = mat->m[0][0] * x1 + mat->m[0][1] * y1;
+ rc->y1 = mat->m[1][0] * x1 + mat->m[1][1] * y1;
+}
+
+void pl_matrix2x2_mul(pl_matrix2x2 *a, const pl_matrix2x2 *b)
+{
+ float a00 = a->m[0][0], a01 = a->m[0][1],
+ a10 = a->m[1][0], a11 = a->m[1][1];
+
+ for (int i = 0; i < 2; i++) {
+ a->m[0][i] = a00 * b->m[0][i] + a01 * b->m[1][i];
+ a->m[1][i] = a10 * b->m[0][i] + a11 * b->m[1][i];
+ }
+}
+
+void pl_matrix2x2_rmul(const pl_matrix2x2 *a, pl_matrix2x2 *b)
+{
+ pl_matrix2x2 m = *a;
+ pl_matrix2x2_mul(&m, b);
+ *b = m;
+}
+
+void pl_matrix2x2_scale(pl_matrix2x2 *mat, float scale)
+{
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++)
+ mat->m[i][j] *= scale;
+ }
+}
+
+void pl_matrix2x2_invert(pl_matrix2x2 *mat)
+{
+ float m00 = mat->m[0][0], m01 = mat->m[0][1],
+ m10 = mat->m[1][0], m11 = mat->m[1][1];
+ float invdet = 1.0f / (m11 * m00 - m10 * m01);
+
+ mat->m[0][0] = m11 * invdet;
+ mat->m[0][1] = -m01 * invdet;
+ mat->m[1][0] = -m10 * invdet;
+ mat->m[1][1] = m00 * invdet;
+}
+
+const pl_transform2x2 pl_transform2x2_identity = {
+ .mat = {{
+ { 1, 0 },
+ { 0, 1 },
+ }},
+};
+
+void pl_transform2x2_apply(const pl_transform2x2 *t, float vec[2])
+{
+ pl_matrix2x2_apply(&t->mat, vec);
+
+ for (int i = 0; i < 2; i++)
+ vec[i] += t->c[i];
+}
+
+void pl_transform2x2_apply_rc(const pl_transform2x2 *t, pl_rect2df *rc)
+{
+ pl_matrix2x2_apply_rc(&t->mat, rc);
+
+ rc->x0 += t->c[0];
+ rc->x1 += t->c[0];
+ rc->y0 += t->c[1];
+ rc->y1 += t->c[1];
+}
+
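+// Composes two affine transforms: a := a * b, i.e. 'b' is applied first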
+void pl_transform2x2_mul(pl_transform2x2 *a, const pl_transform2x2 *b)
+{
+ float c[2] = { b->c[0], b->c[1] };
+ pl_transform2x2_apply(a, c);
+ memcpy(a->c, c, sizeof(c));
+ pl_matrix2x2_mul(&a->mat, &b->mat);
+}
+
+void pl_transform2x2_rmul(const pl_transform2x2 *a, pl_transform2x2 *b)
+{
+ pl_transform2x2_apply(a, b->c);
+ pl_matrix2x2_rmul(&a->mat, &b->mat);
+}
+
+void pl_transform2x2_scale(pl_transform2x2 *t, float scale)
+{
+ pl_matrix2x2_scale(&t->mat, scale);
+
+ for (int i = 0; i < 2; i++)
+ t->c[i] *= scale;
+}
+
+void pl_transform2x2_invert(pl_transform2x2 *t)
+{
+ pl_matrix2x2_invert(&t->mat);
+
+ float m00 = t->mat.m[0][0], m01 = t->mat.m[0][1],
+ m10 = t->mat.m[1][0], m11 = t->mat.m[1][1];
+ float c0 = t->c[0], c1 = t->c[1];
+ t->c[0] = -(m00 * c0 + m01 * c1);
+ t->c[1] = -(m10 * c0 + m11 * c1);
+}
+
+pl_rect2df pl_transform2x2_bounds(const pl_transform2x2 *t, const pl_rect2df *rc)
+{
+ float p[4][2] = {
+ { rc->x0, rc->y0 },
+ { rc->x0, rc->y1 },
+ { rc->x1, rc->y0 },
+ { rc->x1, rc->y1 },
+ };
+ for (int i = 0; i < PL_ARRAY_SIZE(p); i++)
+ pl_transform2x2_apply(t, p[i]);
+
+ return (pl_rect2df) {
+ .x0 = fminf(fminf(p[0][0], p[1][0]), fminf(p[2][0], p[3][0])),
+ .x1 = fmaxf(fmaxf(p[0][0], p[1][0]), fmaxf(p[2][0], p[3][0])),
+ .y0 = fminf(fminf(p[0][1], p[1][1]), fminf(p[2][1], p[3][1])),
+ .y1 = fmaxf(fmaxf(p[0][1], p[1][1]), fmaxf(p[2][1], p[3][1])),
+ };
+}
+
+float pl_rect2df_aspect(const pl_rect2df *rc)
+{
+ float w = fabsf(pl_rect_w(*rc)), h = fabsf(pl_rect_h(*rc));
+ return h ? (w / h) : 0.0;
+}
+
+void pl_rect2df_aspect_set(pl_rect2df *rc, float aspect, float panscan)
+{
+ pl_assert(aspect >= 0);
+ float orig_aspect = pl_rect2df_aspect(rc);
+ if (!aspect || !orig_aspect)
+ return;
+
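+    // panscan = 0.0 letterboxes (shrinks the mismatched axis), panscan = 1.0
+    // pans & scans (grows the other axis to crop); intermediate values blend
+    // between the two via the fractional exponents below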
+ float scale_x, scale_y;
+ if (aspect > orig_aspect) {
+ // New aspect is wider than the original, so we need to either grow in
+ // scale_x (panscan=1) or shrink in scale_y (panscan=0)
+ scale_x = powf(aspect / orig_aspect, panscan);
+ scale_y = powf(aspect / orig_aspect, panscan - 1.0);
+ } else if (aspect < orig_aspect) {
+ // New aspect is taller, so either grow in scale_y (panscan=1) or
+ // shrink in scale_x (panscan=0)
+ scale_x = powf(orig_aspect / aspect, panscan - 1.0);
+ scale_y = powf(orig_aspect / aspect, panscan);
+ } else {
+ return; // No change in aspect
+ }
+
+ pl_rect2df_stretch(rc, scale_x, scale_y);
+}
+
+void pl_rect2df_aspect_fit(pl_rect2df *rc, const pl_rect2df *src, float panscan)
+{
+ float orig_w = fabs(pl_rect_w(*rc)),
+ orig_h = fabs(pl_rect_h(*rc));
+ if (!orig_w || !orig_h)
+ return;
+
+ // If either one of these is larger than 1, then we need to shrink to fit,
+ // otherwise we can just directly stretch the rect.
+ float scale_x = fabs(pl_rect_w(*src)) / orig_w,
+ scale_y = fabs(pl_rect_h(*src)) / orig_h;
+
+ if (scale_x > 1.0 || scale_y > 1.0) {
+ pl_rect2df_aspect_copy(rc, src, panscan);
+ } else {
+ pl_rect2df_stretch(rc, scale_x, scale_y);
+ }
+}
+
+void pl_rect2df_stretch(pl_rect2df *rc, float stretch_x, float stretch_y)
+{
+ float midx = (rc->x0 + rc->x1) / 2.0,
+ midy = (rc->y0 + rc->y1) / 2.0;
+
+ rc->x0 = rc->x0 * stretch_x + midx * (1.0 - stretch_x);
+ rc->x1 = rc->x1 * stretch_x + midx * (1.0 - stretch_x);
+ rc->y0 = rc->y0 * stretch_y + midy * (1.0 - stretch_y);
+ rc->y1 = rc->y1 * stretch_y + midy * (1.0 - stretch_y);
+}
+
+void pl_rect2df_offset(pl_rect2df *rc, float offset_x, float offset_y)
+{
+ if (rc->x1 < rc->x0)
+ offset_x = -offset_x;
+ if (rc->y1 < rc->y0)
+ offset_y = -offset_y;
+
+ rc->x0 += offset_x;
+ rc->x1 += offset_x;
+ rc->y0 += offset_y;
+ rc->y1 += offset_y;
+}
+
+void pl_rect2df_rotate(pl_rect2df *rc, pl_rotation rot)
+{
+ if (!(rot = pl_rotation_normalize(rot)))
+ return;
+
+ float x0 = rc->x0, y0 = rc->y0, x1 = rc->x1, y1 = rc->y1;
+ if (rot >= PL_ROTATION_180) {
+ rot -= PL_ROTATION_180;
+ PL_SWAP(x0, x1);
+ PL_SWAP(y0, y1);
+ }
+
+ switch (rot) {
+ case PL_ROTATION_0:
+ *rc = (pl_rect2df) {
+ .x0 = x0,
+ .y0 = y0,
+ .x1 = x1,
+ .y1 = y1,
+ };
+ return;
+ case PL_ROTATION_90:
+ *rc = (pl_rect2df) {
+ .x0 = y1,
+ .y0 = x0,
+ .x1 = y0,
+ .y1 = x1,
+ };
+ return;
+ default: pl_unreachable();
+ }
+}
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..0cac24d
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,191 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#define __STDC_FORMAT_MACROS
+
+#ifdef __cplusplus
+#include <version>
+#endif
+
+#if !defined(__cplusplus) || defined(__cpp_lib_stdatomic_h)
+#define PL_HAVE_STDATOMIC
+#endif
+
+#ifdef PL_HAVE_STDATOMIC
+#include <stdatomic.h>
+#endif
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#if defined(__MINGW32__) && !defined(__clang__)
+#define PL_PRINTF(fmt, va) __attribute__ ((format(gnu_printf, fmt, va))) \
+ __attribute__ ((nonnull(fmt)))
+#elif defined(__GNUC__)
+#define PL_PRINTF(fmt, va) __attribute__ ((format(printf, fmt, va))) \
+ __attribute__ ((nonnull(fmt)))
+#else
+#define PL_PRINTF(fmt, va)
+#endif
+
+#define PL_NOINLINE __attribute__((noinline))
+
+#include "os.h"
+
+#include "config_internal.h"
+
+#define PL_DEPRECATED
+
+#include <libplacebo/config.h>
+
+#include "pl_assert.h"
+#include "pl_alloc.h"
+#include "pl_clock.h"
+#include "pl_string.h"
+
+#if PL_API_VER != BUILD_API_VER
+#error Header mismatch? <libplacebo/config.h> pulled from elsewhere!
+#endif
+
+// Divide a number while rounding up (careful: double-eval)
+#define PL_DIV_UP(x, y) (((x) + (y) - 1) / (y))
+
+// Align up to the nearest multiple of an arbitrary alignment, which may also
+// be 0 to signal no alignment requirements.
+#define PL_ALIGN(x, align) ((align) ? PL_DIV_UP(x, align) * (align) : (x))
+
+// This is faster, but only works when 'align' is a positive power of two.
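+// e.g. PL_ALIGN2(37, 16) == 48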
+#define PL_ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
+
+// Returns the floor of the log base 2 of an unsigned long long (must be nonzero)
+#define PL_LOG2(x) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((x)) - 1))
+
+// Rounds a number up to the next power of two (exact powers of two are unchanged)
+#define PL_ALIGN_POT(x) (0x1LLU << (PL_LOG2((x) - 1) + 1))
+
+// Right shift a number while rounding up
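+// (e.g. PL_RSHIFT_UP(5, 1) == 3, whereas a plain 5 >> 1 would give 2)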
+#define PL_RSHIFT_UP(x, s) -((-(x)) >> (s))
+
+// Returns whether or not a number is a power of two (or zero)
+#define PL_ISPOT(x) (((x) & ((x) - 1)) == 0)
+
+// Returns the size of a static array with known size.
+#define PL_ARRAY_SIZE(s) (sizeof(s) / sizeof((s)[0]))
+
+// Swaps two variables
+#define PL_SWAP(a, b) \
+ do { \
+ __typeof__ (a) _tmp = (a); \
+ (a) = (b); \
+ (b) = _tmp; \
+ } while (0)
+
+// Helper macros for transposing a matrix. The transposed copy is written to a
+// temporary compound literal, which is returned; the source is not modified.
+#define PL_TRANSPOSE_DIM(d, m) \
+ pl_transpose((d), (float[(d)*(d)]){0}, (const float *)(m))
+
+#define PL_TRANSPOSE_2X2(m) PL_TRANSPOSE_DIM(2, m)
+#define PL_TRANSPOSE_3X3(m) PL_TRANSPOSE_DIM(3, m)
+#define PL_TRANSPOSE_4X4(m) PL_TRANSPOSE_DIM(4, m)
+
+static inline float *pl_transpose(int dim, float *out, const float *in)
+{
+ for (int i = 0; i < dim; i++) {
+ for (int j = 0; j < dim; j++)
+ out[i * dim + j] = in[j * dim + i];
+ }
+
+ return out;
+}
+
+// Helper functions for some common numeric operations (careful: double-eval)
+#define PL_MAX(x, y) ((x) > (y) ? (x) : (y))
+#define PL_MAX3(x, y, z) PL_MAX(PL_MAX(x, y), z)
+#define PL_MIN(x, y) ((x) < (y) ? (x) : (y))
+#define PL_CLAMP(x, l, h) ((x) < (l) ? (l) : (x) > (h) ? (h) : (x))
+#define PL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define PL_DEF(x, d) ((x) ? (x) : (d))
+#define PL_SQUARE(x) ((x) * (x))
+#define PL_CUBE(x) ((x) * (x) * (x))
+#define PL_MIX(a, b, x) ((x) * (b) + (1 - (x)) * (a))
+
+static inline float pl_smoothstep(float edge0, float edge1, float x)
+{
+ if (edge0 == edge1)
+ return x >= edge0;
+ x = (x - edge0) / (edge1 - edge0);
+ x = PL_CLAMP(x, 0.0f, 1.0f);
+ return x * x * (3.0f - 2.0f * x);
+}
+
+// Helpers for doing alignment calculations
+static inline size_t pl_gcd(size_t x, size_t y)
+{
+ assert(x && y);
+ while (y) {
+ size_t tmp = y;
+ y = x % y;
+ x = tmp;
+ }
+
+ return x;
+}
+
+static inline size_t pl_lcm(size_t x, size_t y)
+{
+ assert(x && y);
+ return x * (y / pl_gcd(x, y));
+}
+
+// Conditional abort() macro that depends on the configuration option
+#ifdef PL_DEBUG_ABORT
+# define pl_debug_abort() do { \
+ fprintf(stderr, "pl_debug_abort() triggered!\n"); \
+ abort(); \
+} while (0)
+#else
+# define pl_debug_abort() do {} while (0)
+#endif
+
+#ifdef PL_HAVE_STDATOMIC
+
+// Refcounting helpers
+typedef atomic_uint_fast32_t pl_rc_t;
+#define pl_rc_init(rc) atomic_init(rc, 1)
+#define pl_rc_ref(rc) ((void) atomic_fetch_add_explicit(rc, 1, memory_order_acquire))
+#define pl_rc_deref(rc) (atomic_fetch_sub_explicit(rc, 1, memory_order_release) == 1)
+#define pl_rc_count(rc) atomic_load(rc)
+
+#endif
+
+#define pl_unreachable() (assert(!"unreachable"), __builtin_unreachable())
+
+// Helper for parameter validation
+#define pl_require(ctx, expr) \
+ do { \
+ if (!(expr)) { \
+ PL_ERR(ctx, "Validation failed: %s (%s:%d)", \
+ #expr, __FILE__, __LINE__); \
+ pl_log_stack_trace(ctx->log, PL_LOG_ERR); \
+ pl_debug_abort(); \
+ goto error; \
+ } \
+ } while (0)
diff --git a/src/convert.cc b/src/convert.cc
new file mode 100644
index 0000000..05c9dd0
--- /dev/null
+++ b/src/convert.cc
@@ -0,0 +1,233 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <charconv>
+#include <limits>
+#include <system_error>
+
+#if __has_include(<fast_float/fast_float.h>)
+# include <fast_float/fast_float.h>
+#endif
+
+#include "pl_string.h"
+
+[[maybe_unused]]
+static int ccStrPrintDouble( char *str, int bufsize, int decimals, double value );
+
+namespace {
+
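+// Detection idiom: true when std::to_chars provides an overload for T. Some
+// standard libraries still lack the floating-point overloads, in which case
+// the fallbacks below are used instead.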
+template <typename T>
+struct has_std_to_chars_impl {
+ template <typename CT>
+ static auto _(CT s) -> decltype(std::to_chars(s, s, std::declval<T>()), std::true_type{});
+ static auto _(...) -> std::false_type;
+ static constexpr bool value = decltype(_((char *){}))::value;
+};
+
+template <typename T>
+constexpr bool has_std_to_chars = has_std_to_chars_impl<T>::value;
+
+template <typename T, typename... Args>
+static inline int to_chars(char *buf, size_t len, T n, Args ...args)
+{
+ if constexpr (has_std_to_chars<T>) {
+ auto [ptr, ec] = std::to_chars(buf, buf + len, n, args...);
+ return ec == std::errc() ? ptr - buf : 0;
+ } else {
+ static_assert(std::is_same_v<float, T> || std::is_same_v<double, T>,
+ "Not implemented!");
+ // FIXME: Fallback for GCC <= 10 currently required for MinGW-w64 on
+ // Ubuntu 22.04. Remove this when Ubuntu 24.04 is released, as it will
+ // provide newer MinGW-w64 GCC and it will be safe to require it.
+ return ccStrPrintDouble(buf, len, std::numeric_limits<T>::max_digits10, n);
+ }
+}
+
+template <typename T>
+struct has_std_from_chars_impl {
+ template <typename CT>
+ static auto _(CT s) -> decltype(std::from_chars(s, s, std::declval<T&>()), std::true_type{});
+ static auto _(...) -> std::false_type;
+ static constexpr bool value = decltype(_((const char *){}))::value;
+};
+
+template <typename T>
+constexpr bool has_std_from_chars = has_std_from_chars_impl<T>::value;
+
+template <typename T, typename... Args>
+static inline bool from_chars(pl_str str, T &n, Args ...args)
+{
+ if constexpr (has_std_from_chars<T>) {
+ auto [ptr, ec] = std::from_chars((const char *) str.buf,
+ (const char *) str.buf + str.len,
+ n, args...);
+ return ec == std::errc();
+ } else {
+ constexpr bool is_fp = std::is_same_v<float, T> || std::is_same_v<double, T>;
+ static_assert(is_fp, "Not implemented!");
+#if !__has_include(<fast_float/fast_float.h>)
+ static_assert(!is_fp, "<fast_float/fast_float.h> is required, but not " \
+ "found. Please run `git submodule update --init`" \
+ " or provide <fast_float/fast_float.h>");
+#else
+        // FIXME: Fallback for libc++, as it does not implement the
+        // floating-point variants of std::from_chars. Remove this when appropriate.
+ auto [ptr, ec] = fast_float::from_chars((const char *) str.buf,
+ (const char *) str.buf + str.len,
+ n, args...);
+ return ec == std::errc();
+#endif
+ }
+}
+
+}
+
+#define CHAR_CONVERT(name, type, ...) \
+ int pl_str_print_##name(char *buf, size_t len, type n) \
+ { \
+ return to_chars(buf, len, n __VA_OPT__(,) __VA_ARGS__); \
+ } \
+ bool pl_str_parse_##name(pl_str str, type *n) \
+ { \
+ return from_chars(str, *n __VA_OPT__(,) __VA_ARGS__); \
+ }
+
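+// The trailing argument (16) selects base-16 for the hex variant; the other
+// integer variants use the default base 10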
+CHAR_CONVERT(hex, unsigned short, 16)
+CHAR_CONVERT(int, int)
+CHAR_CONVERT(uint, unsigned int)
+CHAR_CONVERT(int64, int64_t)
+CHAR_CONVERT(uint64, uint64_t)
+CHAR_CONVERT(float, float)
+CHAR_CONVERT(double, double)
+
+/* *****************************************************************************
+ *
+ * Copyright (c) 2007-2016 Alexis Naveros.
+ * Modified for use with libplacebo by Niklas Haas
+ * Changes include:
+ * - Replaced a CC_MIN macro dependency with equivalent logic
+ * - Removed CC_ALWAYSINLINE
+ * - Fixed (!seq) check to (!seqlength)
+ * - Added support for scientific notation (e.g. 1.0e10) in ccSeqParseDouble
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * -----------------------------------------------------------------------------
+ */
+
+static int ccStrPrintDouble( char *str, int bufsize, int decimals, double value )
+{
+ int size, offset, index;
+ int32_t frac, accumsub;
+ double muldec;
+ uint32_t u32;
+ uint64_t u64;
+
+ size = 0;
+ if( value < 0.0 )
+ {
+ size = 1;
+ *str++ = '-';
+ bufsize--;
+ value = -value;
+ }
+
+ if( value < 4294967296.0 )
+ {
+ u32 = (uint32_t)value;
+ offset = pl_str_print_uint( str, bufsize, u32 );
+ if (!offset)
+ goto error;
+ size += offset;
+ bufsize -= size;
+ value -= (double)u32;
+ }
+ else if( value < 18446744073709551616.0 )
+ {
+ u64 = (uint64_t)value;
+ offset = pl_str_print_uint64( str, bufsize, u64 );
+ if (!offset)
+ goto error;
+ size += offset;
+ bufsize -= size;
+ value -= (double)u64;
+ }
+ else
+ goto error;
+
+ if (decimals > bufsize - 2)
+ decimals = bufsize - 2;
+ if( decimals <= 0 )
+ return size;
+
+ muldec = 10.0;
+ accumsub = 0;
+ str += offset;
+
+ for( index = 0 ; index < decimals ; index++ )
+ {
+ // Skip printing insignificant decimal digits
+ if (value * muldec - accumsub <= std::numeric_limits<double>::epsilon())
+ break;
+ if (index == 0) {
+ size += 1;
+ *str++ = '.';
+ }
+ frac = (int32_t)( value * muldec ) - accumsub;
+ frac = PL_CLAMP(frac, 0, 9); // FIXME: why is this needed?
+ str[index] = '0' + (char)frac;
+ accumsub += frac;
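+      // accumsub *= 10 (via shift-add); it tracks the value of the decimal
+      // digits emitted so far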
+ accumsub = ( accumsub << 3 ) + ( accumsub << 1 );
+ if( muldec < 10000000 )
+ muldec *= 10.0;
+ else
+ {
+ value *= 10000000.0;
+ value -= (int32_t)value;
+ muldec = 10.0;
+ accumsub = 0;
+ }
+ }
+ // Round up the last decimal digit
+ if ( str[ index - 1 ] < '9' && (int32_t)( value * muldec ) - accumsub >= 5 )
+ str[ index - 1 ]++;
+ str[ index ] = 0;
+ size += index;
+ return size;
+
+error:
+ if( bufsize < 4 )
+ *str = 0;
+ else
+ {
+ str[0] = 'E';
+ str[1] = 'R';
+ str[2] = 'R';
+ str[3] = 0;
+ }
+ return 0;
+}
diff --git a/src/d3d11/common.h b/src/d3d11/common.h
new file mode 100644
index 0000000..e14b709
--- /dev/null
+++ b/src/d3d11/common.h
@@ -0,0 +1,66 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../common.h"
+#include "../log.h"
+
+#ifdef PL_HAVE_DXGI_DEBUG
+#include <dxgidebug.h>
+#endif
+
+#include <libplacebo/d3d11.h>
+
+// Shared struct used to hold the D3D11 device and associated interfaces
+struct d3d11_ctx {
+ pl_log log;
+ pl_d3d11 d3d11;
+
+ // Copy of the device from pl_d3d11 for convenience. Does not hold an
+ // additional reference.
+ ID3D11Device *dev;
+
+ // DXGI device. This does hold a reference.
+ IDXGIDevice1 *dxgi_dev;
+
+#ifdef PL_HAVE_DXGI_DEBUG
+ // Debug interfaces
+ IDXGIDebug *debug;
+ IDXGIInfoQueue *iqueue;
+ uint64_t last_discarded; // Last count of discarded messages
+ DXGI_INFO_QUEUE_MESSAGE *dxgi_msg;
+#endif
+
+ // pl_gpu_is_failed (We saw a device removed error!)
+ bool is_failed;
+};
+
+// DDK value. Apparently some D3D functions can return this instead of the
+// proper user-mode error code. See:
+// https://docs.microsoft.com/en-us/windows/win32/api/dxgi/nf-dxgi-idxgiswapchain-present
+#define D3DDDIERR_DEVICEREMOVED (0x88760870)
+
+#ifndef D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE
+#define D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE (0x80)
+#endif
+#ifndef D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD
+#define D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD (0x40)
+#endif
+#ifndef PL_HAVE_DXGI_DEBUG_D3D11
+DEFINE_GUID(DXGI_DEBUG_D3D11, 0x4b99317b, 0xac39, 0x4aa6, 0xbb, 0xb, 0xba, 0xa0, 0x47, 0x84, 0x79, 0x8f);
+#endif
diff --git a/src/d3d11/context.c b/src/d3d11/context.c
new file mode 100644
index 0000000..e0ba90f
--- /dev/null
+++ b/src/d3d11/context.c
@@ -0,0 +1,488 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+// Windows 8 enum value, not present in mingw-w64 v7
+#define DXGI_ADAPTER_FLAG_SOFTWARE (2)
+
+const struct pl_d3d11_params pl_d3d11_default_params = { PL_D3D11_DEFAULTS };
+
+static INIT_ONCE d3d11_once = INIT_ONCE_STATIC_INIT;
+static PFN_D3D11_CREATE_DEVICE pD3D11CreateDevice = NULL;
+static __typeof__(&CreateDXGIFactory1) pCreateDXGIFactory1 = NULL;
+#ifdef PL_HAVE_DXGI_DEBUG
+static __typeof__(&DXGIGetDebugInterface) pDXGIGetDebugInterface = NULL;
+#endif
+
+static void d3d11_load(void)
+{
+ BOOL bPending = FALSE;
+ InitOnceBeginInitialize(&d3d11_once, 0, &bPending, NULL);
+
+ if (bPending)
+ {
+ HMODULE d3d11 = LoadLibraryW(L"d3d11.dll");
+ if (d3d11) {
+ pD3D11CreateDevice = (void *)
+ GetProcAddress(d3d11, "D3D11CreateDevice");
+ }
+
+ HMODULE dxgi = LoadLibraryW(L"dxgi.dll");
+ if (dxgi) {
+ pCreateDXGIFactory1 = (void *)
+ GetProcAddress(dxgi, "CreateDXGIFactory1");
+ }
+
+#ifdef PL_HAVE_DXGI_DEBUG
+ HMODULE dxgi_debug = LoadLibraryW(L"dxgidebug.dll");
+ if (dxgi_debug) {
+ pDXGIGetDebugInterface = (void *)
+ GetProcAddress(dxgi_debug, "DXGIGetDebugInterface");
+ }
+#endif
+ }
+
+ InitOnceComplete(&d3d11_once, 0, NULL);
+}
+
+// Get a const array of D3D_FEATURE_LEVELs from max_fl to min_fl (inclusive)
+static int get_feature_levels(int max_fl, int min_fl,
+ const D3D_FEATURE_LEVEL **out)
+{
+ static const D3D_FEATURE_LEVEL levels[] = {
+ D3D_FEATURE_LEVEL_12_1,
+ D3D_FEATURE_LEVEL_12_0,
+ D3D_FEATURE_LEVEL_11_1,
+ D3D_FEATURE_LEVEL_11_0,
+ D3D_FEATURE_LEVEL_10_1,
+ D3D_FEATURE_LEVEL_10_0,
+ D3D_FEATURE_LEVEL_9_3,
+ D3D_FEATURE_LEVEL_9_2,
+ D3D_FEATURE_LEVEL_9_1,
+ };
+ static const int levels_len = PL_ARRAY_SIZE(levels);
+
+ int start = 0;
+ for (; start < levels_len; start++) {
+ if (levels[start] <= max_fl)
+ break;
+ }
+ int len = 0;
+ for (; start + len < levels_len; len++) {
+ if (levels[start + len] < min_fl)
+ break;
+ }
+ *out = &levels[start];
+ return len;
+}
+
+static bool is_null_luid(LUID luid)
+{
+ return luid.LowPart == 0 && luid.HighPart == 0;
+}
+
+static IDXGIAdapter *get_adapter(pl_d3d11 d3d11, LUID adapter_luid)
+{
+ struct d3d11_ctx *ctx = PL_PRIV(d3d11);
+ IDXGIFactory1 *factory = NULL;
+ IDXGIAdapter1 *adapter1 = NULL;
+ IDXGIAdapter *adapter = NULL;
+ HRESULT hr;
+
+ if (!pCreateDXGIFactory1) {
+ PL_FATAL(ctx, "Failed to load dxgi.dll");
+ goto error;
+ }
+ pCreateDXGIFactory1(&IID_IDXGIFactory1, (void **) &factory);
+
+ for (int i = 0;; i++) {
+ hr = IDXGIFactory1_EnumAdapters1(factory, i, &adapter1);
+ if (hr == DXGI_ERROR_NOT_FOUND)
+ break;
+ if (FAILED(hr)) {
+ PL_FATAL(ctx, "Failed to enumerate adapters");
+ goto error;
+ }
+
+ DXGI_ADAPTER_DESC1 desc;
+ D3D(IDXGIAdapter1_GetDesc1(adapter1, &desc));
+ if (desc.AdapterLuid.LowPart == adapter_luid.LowPart &&
+ desc.AdapterLuid.HighPart == adapter_luid.HighPart)
+ {
+ break;
+ }
+
+ SAFE_RELEASE(adapter1);
+ }
+ if (!adapter1) {
+ PL_FATAL(ctx, "Adapter with LUID %08lx%08lx not found",
+ adapter_luid.HighPart, adapter_luid.LowPart);
+ goto error;
+ }
+
+ D3D(IDXGIAdapter1_QueryInterface(adapter1, &IID_IDXGIAdapter,
+ (void **) &adapter));
+
+error:
+ SAFE_RELEASE(factory);
+ SAFE_RELEASE(adapter1);
+ return adapter;
+}
+
+static bool has_sdk_layers(void)
+{
+ // This will fail if the SDK layers aren't installed
+ return SUCCEEDED(pD3D11CreateDevice(NULL, D3D_DRIVER_TYPE_NULL, NULL,
+ D3D11_CREATE_DEVICE_DEBUG, NULL, 0, D3D11_SDK_VERSION, NULL, NULL,
+ NULL));
+}
+
+static ID3D11Device *create_device(struct pl_d3d11_t *d3d11,
+ const struct pl_d3d11_params *params)
+{
+ struct d3d11_ctx *ctx = PL_PRIV(d3d11);
+ bool debug = params->debug;
+ bool warp = params->force_software;
+ int max_fl = params->max_feature_level;
+ int min_fl = params->min_feature_level;
+ ID3D11Device *dev = NULL;
+ IDXGIDevice1 *dxgi_dev = NULL;
+ IDXGIAdapter *adapter = NULL;
+ bool release_adapter = false;
+ HRESULT hr;
+
+ d3d11_load();
+
+ if (!pD3D11CreateDevice) {
+ PL_FATAL(ctx, "Failed to load d3d11.dll");
+ goto error;
+ }
+
+ if (params->adapter) {
+ adapter = params->adapter;
+ } else if (!is_null_luid(params->adapter_luid)) {
+ adapter = get_adapter(d3d11, params->adapter_luid);
+ release_adapter = true;
+ }
+
+ if (debug && !has_sdk_layers()) {
+ PL_INFO(ctx, "Debug layer not available, removing debug flag");
+ debug = false;
+ }
+
+    // Loop back here to retry creating the device
+ do {
+ // Use these default feature levels if they are not set
+ max_fl = PL_DEF(max_fl, D3D_FEATURE_LEVEL_12_1);
+ min_fl = PL_DEF(min_fl, D3D_FEATURE_LEVEL_9_1);
+
+        // Get a list of feature levels from max_fl down to min_fl
+ const D3D_FEATURE_LEVEL *levels;
+ int levels_len = get_feature_levels(max_fl, min_fl, &levels);
+ if (!levels_len) {
+ PL_FATAL(ctx, "No suitable Direct3D feature level found");
+ goto error;
+ }
+
+ D3D_DRIVER_TYPE type = D3D_DRIVER_TYPE_UNKNOWN;
+ if (!adapter) {
+ if (warp) {
+ type = D3D_DRIVER_TYPE_WARP;
+ } else {
+ type = D3D_DRIVER_TYPE_HARDWARE;
+ }
+ }
+
+ UINT flags = params->flags;
+ if (debug)
+ flags |= D3D11_CREATE_DEVICE_DEBUG;
+
+ hr = pD3D11CreateDevice(adapter, type, NULL, flags, levels, levels_len,
+ D3D11_SDK_VERSION, &dev, NULL, NULL);
+ if (SUCCEEDED(hr))
+ break;
+
+ pl_d3d11_after_error(ctx, hr);
+
+ // Trying to create a D3D_FEATURE_LEVEL_12_0 device on Windows 8.1 or
+ // below will not succeed. Try an 11_1 device.
+ if (hr == E_INVALIDARG && max_fl >= D3D_FEATURE_LEVEL_12_0 &&
+ min_fl <= D3D_FEATURE_LEVEL_11_1) {
+ PL_DEBUG(ctx, "Failed to create 12_0+ device, trying 11_1");
+ max_fl = D3D_FEATURE_LEVEL_11_1;
+ continue;
+ }
+
+ // Trying to create a D3D_FEATURE_LEVEL_11_1 device on Windows 7
+ // without the platform update will not succeed. Try an 11_0 device.
+ if (hr == E_INVALIDARG && max_fl >= D3D_FEATURE_LEVEL_11_1 &&
+ min_fl <= D3D_FEATURE_LEVEL_11_0) {
+ PL_DEBUG(ctx, "Failed to create 11_1+ device, trying 11_0");
+ max_fl = D3D_FEATURE_LEVEL_11_0;
+ continue;
+ }
+
+ // Retry with WARP if allowed
+ if (!adapter && !warp && params->allow_software) {
+ PL_DEBUG(ctx, "Failed to create hardware device, trying WARP: %s",
+ pl_hresult_to_str(hr));
+ warp = true;
+ max_fl = params->max_feature_level;
+ min_fl = params->min_feature_level;
+ continue;
+ }
+
+ PL_FATAL(ctx, "Failed to create Direct3D 11 device: %s",
+ pl_hresult_to_str(hr));
+ goto error;
+ } while (true);
+
+ if (params->max_frame_latency) {
+ D3D(ID3D11Device_QueryInterface(dev, &IID_IDXGIDevice1,
+ (void **) &dxgi_dev));
+ IDXGIDevice1_SetMaximumFrameLatency(dxgi_dev, params->max_frame_latency);
+ }
+
+ d3d11->software = warp;
+
+error:
+ if (release_adapter)
+ SAFE_RELEASE(adapter);
+ SAFE_RELEASE(dxgi_dev);
+ return dev;
+}
+
+static void init_debug_layer(struct d3d11_ctx *ctx, bool leak_check)
+{
+#ifdef PL_HAVE_DXGI_DEBUG
+ if (!pDXGIGetDebugInterface)
+ d3d11_load();
+
+ if (!pDXGIGetDebugInterface)
+ goto error;
+
+ D3D(pDXGIGetDebugInterface(&IID_IDXGIInfoQueue, (void **) &ctx->iqueue));
+
+ // Push empty filter to get everything
+ IDXGIInfoQueue_PushStorageFilter(ctx->iqueue, DXGI_DEBUG_ALL,
+ &(DXGI_INFO_QUEUE_FILTER){0});
+
+ // Filter some annoying D3D11 messages
+ DXGI_INFO_QUEUE_MESSAGE_ID deny_ids[] = {
+ // This false-positive error occurs every time we Draw() with a shader
+ // that samples from a texture format that only supports point sampling.
+ // Since we already use CheckFormatSupport to know which formats can be
+ // linearly sampled from, we shouldn't ever bind a non-point sampler to
+ // a format that doesn't support it.
+ D3D11_MESSAGE_ID_DEVICE_DRAW_RESOURCE_FORMAT_SAMPLE_UNSUPPORTED,
+ };
+ DXGI_INFO_QUEUE_FILTER filter = {
+ .DenyList = {
+ .NumIDs = PL_ARRAY_SIZE(deny_ids),
+ .pIDList = deny_ids,
+ },
+ };
+ IDXGIInfoQueue_PushStorageFilter(ctx->iqueue, DXGI_DEBUG_D3D11, &filter);
+
+ IDXGIInfoQueue_SetMessageCountLimit(ctx->iqueue, DXGI_DEBUG_D3D11, -1);
+ IDXGIInfoQueue_SetMessageCountLimit(ctx->iqueue, DXGI_DEBUG_DXGI, -1);
+
+ if (leak_check)
+ D3D(pDXGIGetDebugInterface(&IID_IDXGIDebug, (void **) &ctx->debug));
+
+error:
+ return;
+#endif
+}
+
+void pl_d3d11_destroy(pl_d3d11 *ptr)
+{
+ pl_d3d11 d3d11 = *ptr;
+ if (!d3d11)
+ return;
+ struct d3d11_ctx *ctx = PL_PRIV(d3d11);
+
+ pl_gpu_destroy(d3d11->gpu);
+
+ SAFE_RELEASE(ctx->dev);
+ SAFE_RELEASE(ctx->dxgi_dev);
+
+#ifdef PL_HAVE_DXGI_DEBUG
+ if (ctx->debug) {
+ // Report any leaked objects
+ pl_d3d11_flush_message_queue(ctx, "After destroy");
+ IDXGIDebug_ReportLiveObjects(ctx->debug, DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_DETAIL);
+ pl_d3d11_flush_message_queue(ctx, "After leak check");
+ IDXGIDebug_ReportLiveObjects(ctx->debug, DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_SUMMARY);
+ pl_d3d11_flush_message_queue(ctx, "After leak summary");
+ }
+
+ SAFE_RELEASE(ctx->debug);
+ SAFE_RELEASE(ctx->iqueue);
+#endif
+
+ pl_free_ptr((void **) ptr);
+}
+
+pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params)
+{
+ params = PL_DEF(params, &pl_d3d11_default_params);
+ IDXGIAdapter1 *adapter = NULL;
+ IDXGIAdapter2 *adapter2 = NULL;
+ bool success = false;
+ HRESULT hr;
+
+ struct pl_d3d11_t *d3d11 = pl_zalloc_obj(NULL, d3d11, struct d3d11_ctx);
+ struct d3d11_ctx *ctx = PL_PRIV(d3d11);
+ ctx->log = log;
+ ctx->d3d11 = d3d11;
+
+ if (params->device) {
+ d3d11->device = params->device;
+ ID3D11Device_AddRef(d3d11->device);
+ } else {
+ d3d11->device = create_device(d3d11, params);
+ if (!d3d11->device)
+ goto error;
+ }
+ ctx->dev = d3d11->device;
+
+ if (params->debug ||
+ ID3D11Device_GetCreationFlags(d3d11->device) & D3D11_CREATE_DEVICE_DEBUG)
+ {
+        // Do not report live objects on pl_d3d11_destroy if the device was
+        // created externally; it makes no sense, as plenty of objects will
+        // still be alive.
+ init_debug_layer(ctx, !params->device);
+ }
+
+ D3D(ID3D11Device_QueryInterface(d3d11->device, &IID_IDXGIDevice1,
+ (void **) &ctx->dxgi_dev));
+ D3D(IDXGIDevice1_GetParent(ctx->dxgi_dev, &IID_IDXGIAdapter1,
+ (void **) &adapter));
+
+ hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter2,
+ (void **) &adapter2);
+ if (FAILED(hr))
+ adapter2 = NULL;
+
+ if (adapter2) {
+ PL_INFO(ctx, "Using DXGI 1.2+");
+ } else {
+ PL_INFO(ctx, "Using DXGI 1.1");
+ }
+
+ D3D_FEATURE_LEVEL fl = ID3D11Device_GetFeatureLevel(d3d11->device);
+ PL_INFO(ctx, "Using Direct3D 11 feature level %u_%u",
+ ((unsigned) fl) >> 12, (((unsigned) fl) >> 8) & 0xf);
+
+ char *dev_name = NULL;
+ UINT vendor_id, device_id, revision, subsys_id;
+ LUID adapter_luid;
+ UINT flags;
+
+ if (adapter2) {
+ // DXGI 1.2 IDXGIAdapter2::GetDesc2 is preferred over the DXGI 1.1
+ // version because it reports the real adapter information when using
+ // feature level 9 hardware
+ DXGI_ADAPTER_DESC2 desc;
+ D3D(IDXGIAdapter2_GetDesc2(adapter2, &desc));
+
+ dev_name = pl_to_utf8(NULL, desc.Description);
+ vendor_id = desc.VendorId;
+ device_id = desc.DeviceId;
+ revision = desc.Revision;
+ subsys_id = desc.SubSysId;
+ adapter_luid = desc.AdapterLuid;
+ flags = desc.Flags;
+ } else {
+ DXGI_ADAPTER_DESC1 desc;
+ D3D(IDXGIAdapter1_GetDesc1(adapter, &desc));
+
+ dev_name = pl_to_utf8(NULL, desc.Description);
+ vendor_id = desc.VendorId;
+ device_id = desc.DeviceId;
+ revision = desc.Revision;
+ subsys_id = desc.SubSysId;
+ adapter_luid = desc.AdapterLuid;
+ flags = desc.Flags;
+ }
+
+ PL_INFO(ctx, "Direct3D 11 device properties:");
+ PL_INFO(ctx, " Device Name: %s", dev_name);
+ PL_INFO(ctx, " Device ID: %04x:%04x (rev %02x)",
+ vendor_id, device_id, revision);
+ PL_INFO(ctx, " Subsystem ID: %04x:%04x",
+ LOWORD(subsys_id), HIWORD(subsys_id));
+ PL_INFO(ctx, " LUID: %08lx%08lx",
+ adapter_luid.HighPart, adapter_luid.LowPart);
+ pl_free(dev_name);
+
+ LARGE_INTEGER version;
+ hr = IDXGIAdapter1_CheckInterfaceSupport(adapter, &IID_IDXGIDevice, &version);
+ if (SUCCEEDED(hr)) {
+ PL_INFO(ctx, " Driver version: %u.%u.%u.%u",
+ HIWORD(version.HighPart), LOWORD(version.HighPart),
+ HIWORD(version.LowPart), LOWORD(version.LowPart));
+ }
+
+ // Note: DXGI_ADAPTER_FLAG_SOFTWARE doesn't exist before Windows 8, but we
+ // also set d3d11->software in create_device if we pick WARP ourselves
+ if (flags & DXGI_ADAPTER_FLAG_SOFTWARE)
+ d3d11->software = true;
+
+ // If the primary display adapter is a software adapter, the
+ // DXGI_ADAPTER_FLAG_SOFTWARE flag won't be set, but the device IDs should
+ // still match the Microsoft Basic Render Driver
+ if (vendor_id == 0x1414 && device_id == 0x8c)
+ d3d11->software = true;
+
+ if (d3d11->software) {
+ bool external_adapter = params->device || params->adapter ||
+ !is_null_luid(params->adapter_luid);
+
+ // The allow_software flag only applies if the API user didn't manually
+ // specify an adapter or a device
+ if (!params->allow_software && !external_adapter) {
+            // If we got this far without allow_software set, the primary
+            // adapter must be a software adapter
+ PL_ERR(ctx, "Primary adapter is a software adapter");
+ goto error;
+ }
+
+ // If a software adapter was manually specified, don't show a warning
+ enum pl_log_level level = PL_LOG_WARN;
+ if (external_adapter || params->force_software)
+ level = PL_LOG_INFO;
+
+ PL_MSG(ctx, level, "Using a software adapter");
+ }
+
+ d3d11->gpu = pl_gpu_create_d3d11(ctx);
+ if (!d3d11->gpu)
+ goto error;
+
+ success = true;
+error:
+ if (!success) {
+ PL_FATAL(ctx, "Failed initializing Direct3D 11 device");
+ pl_d3d11_destroy((pl_d3d11 *) &d3d11);
+ }
+ SAFE_RELEASE(adapter);
+ SAFE_RELEASE(adapter2);
+ return d3d11;
+}
diff --git a/src/d3d11/formats.c b/src/d3d11/formats.c
new file mode 100644
index 0000000..7aaec26
--- /dev/null
+++ b/src/d3d11/formats.c
@@ -0,0 +1,293 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "formats.h"
+#include "gpu.h"
+
+#define FMT(_minor, _name, _dxfmt, _type, num, size, bits, order) \
+ (struct d3d_format) { \
+ .dxfmt = DXGI_FORMAT_##_dxfmt##_##_type, \
+ .minor = _minor, \
+ .fmt = { \
+ .name = _name, \
+ .type = PL_FMT_##_type, \
+ .num_components = num, \
+ .component_depth = bits, \
+ .texel_size = size, \
+ .texel_align = 1, \
+ .internal_size = size, \
+ .host_bits = bits, \
+ .sample_order = order, \
+ }, \
+ }
+
+#define IDX(...) {__VA_ARGS__}
+#define BITS(...) {__VA_ARGS__}
+
+#define REGFMT(name, dxfmt, type, num, bits) \
+ FMT(0, name, dxfmt, type, num, (num) * (bits) / 8, \
+ BITS(bits, bits, bits, bits), \
+ IDX(0, 1, 2, 3))
+
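+// Emulated format: `in` and `ib` give the component count and bit depth of
+// the internal (GPU) representation, while `en` and `eb` give those of the
+// external (host-visible) layout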
+#define EMUFMT(_name, _dxfmt, _type, in, en, ib, eb) \
+ (struct d3d_format) { \
+ .dxfmt = DXGI_FORMAT_##_dxfmt##_##_type, \
+ .minor = 0, \
+ .fmt = { \
+ .name = _name, \
+ .type = PL_FMT_##_type, \
+ .num_components = en, \
+ .component_depth = BITS(ib, ib, ib, ib), \
+ .internal_size = (in) * (ib) / 8, \
+ .opaque = false, \
+ .emulated = true, \
+ .texel_size = (en) * (eb) / 8, \
+ .texel_align = (eb) / 8, \
+ .host_bits = BITS(eb, eb, eb, eb), \
+ .sample_order = IDX(0, 1, 2, 3), \
+ }, \
+ }
+
+const struct d3d_format pl_d3d11_formats[] = {
+ REGFMT("r8", R8, UNORM, 1, 8),
+ REGFMT("rg8", R8G8, UNORM, 2, 8),
+ EMUFMT("rgb8", R8G8B8A8, UNORM, 4, 3, 8, 8),
+ REGFMT("rgba8", R8G8B8A8, UNORM, 4, 8),
+ REGFMT("r16", R16, UNORM, 1, 16),
+ REGFMT("rg16", R16G16, UNORM, 2, 16),
+ EMUFMT("rgb16", R16G16B16A16, UNORM, 4, 3, 16, 16),
+ REGFMT("rgba16", R16G16B16A16, UNORM, 4, 16),
+
+ REGFMT("r8s", R8, SNORM, 1, 8),
+ REGFMT("rg8s", R8G8, SNORM, 2, 8),
+ REGFMT("rgba8s", R8G8B8A8, SNORM, 4, 8),
+ REGFMT("r16s", R16, SNORM, 1, 16),
+ REGFMT("rg16s", R16G16, SNORM, 2, 16),
+ REGFMT("rgba16s", R16G16B16A16, SNORM, 4, 16),
+
+ REGFMT("r16hf", R16, FLOAT, 1, 16),
+ REGFMT("rg16hf", R16G16, FLOAT, 2, 16),
+ EMUFMT("rgb16hf", R16G16B16A16, FLOAT, 4, 3, 16, 16),
+ REGFMT("rgba16hf", R16G16B16A16, FLOAT, 4, 16),
+ REGFMT("r32f", R32, FLOAT, 1, 32),
+ REGFMT("rg32f", R32G32, FLOAT, 2, 32),
+ REGFMT("rgb32f", R32G32B32, FLOAT, 3, 32),
+ REGFMT("rgba32f", R32G32B32A32, FLOAT, 4, 32),
+
+ EMUFMT("r16f", R16, FLOAT, 1, 1, 16, 32),
+ EMUFMT("rg16f", R16G16, FLOAT, 2, 2, 16, 32),
+ EMUFMT("rgb16f", R16G16B16A16, FLOAT, 4, 3, 16, 32),
+ EMUFMT("rgba16f", R16G16B16A16, FLOAT, 4, 4, 16, 32),
+
+ REGFMT("r8u", R8, UINT, 1, 8),
+ REGFMT("rg8u", R8G8, UINT, 2, 8),
+ REGFMT("rgba8u", R8G8B8A8, UINT, 4, 8),
+ REGFMT("r16u", R16, UINT, 1, 16),
+ REGFMT("rg16u", R16G16, UINT, 2, 16),
+ REGFMT("rgba16u", R16G16B16A16, UINT, 4, 16),
+ REGFMT("r32u", R32, UINT, 1, 32),
+ REGFMT("rg32u", R32G32, UINT, 2, 32),
+ REGFMT("rgb32u", R32G32B32, UINT, 3, 32),
+ REGFMT("rgba32u", R32G32B32A32, UINT, 4, 32),
+
+ REGFMT("r8i", R8, SINT, 1, 8),
+ REGFMT("rg8i", R8G8, SINT, 2, 8),
+ REGFMT("rgba8i", R8G8B8A8, SINT, 4, 8),
+ REGFMT("r16i", R16, SINT, 1, 16),
+ REGFMT("rg16i", R16G16, SINT, 2, 16),
+ REGFMT("rgba16i", R16G16B16A16, SINT, 4, 16),
+ REGFMT("r32i", R32, SINT, 1, 32),
+ REGFMT("rg32i", R32G32, SINT, 2, 32),
+ REGFMT("rgb32i", R32G32B32, SINT, 3, 32),
+ REGFMT("rgba32i", R32G32B32A32, SINT, 4, 32),
+
+ FMT(0, "rgb10a2", R10G10B10A2, UNORM, 4, 4, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3)),
+ FMT(0, "rgb10a2u", R10G10B10A2, UINT, 4, 4, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3)),
+
+ FMT(0, "bgra8", B8G8R8A8, UNORM, 4, 4, BITS( 8, 8, 8, 8), IDX(2, 1, 0, 3)),
+ FMT(0, "bgrx8", B8G8R8X8, UNORM, 3, 4, BITS( 8, 8, 8), IDX(2, 1, 0)),
+ FMT(0, "rg11b10f", R11G11B10, FLOAT, 3, 4, BITS(11, 11, 10), IDX(0, 1, 2)),
+
+ // D3D11.1 16-bit formats (resurrected D3D9 formats)
+ FMT(1, "bgr565", B5G6R5, UNORM, 3, 2, BITS( 5, 6, 5), IDX(2, 1, 0)),
+ FMT(1, "bgr5a1", B5G5R5A1, UNORM, 4, 2, BITS( 5, 5, 5, 1), IDX(2, 1, 0, 3)),
+ FMT(1, "bgra4", B4G4R4A4, UNORM, 4, 2, BITS( 4, 4, 4, 4), IDX(2, 1, 0, 3)),
+
+ {0}
+};
+#undef BITS
+#undef IDX
+#undef REGFMT
+#undef EMUFMT
+#undef FMT
+
+void pl_d3d11_setup_formats(struct pl_gpu_t *gpu)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ PL_ARRAY(pl_fmt) formats = {0};
+ HRESULT hr;
+
+ for (int i = 0; pl_d3d11_formats[i].dxfmt; i++) {
+ const struct d3d_format *d3d_fmt = &pl_d3d11_formats[i];
+
+ // The Direct3D 11.0 debug layer will segfault if CheckFormatSupport is
+ // called on a format it doesn't know about
+ if (pl_d3d11_formats[i].minor > p->minor)
+ continue;
+
+ UINT sup = 0;
+ hr = ID3D11Device_CheckFormatSupport(p->dev, d3d_fmt->dxfmt, &sup);
+ if (FAILED(hr))
+ continue;
+
+ D3D11_FEATURE_DATA_FORMAT_SUPPORT2 sup2 = { .InFormat = d3d_fmt->dxfmt };
+ ID3D11Device_CheckFeatureSupport(p->dev, D3D11_FEATURE_FORMAT_SUPPORT2,
+ &sup2, sizeof(sup2));
+
+ struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct d3d_fmt *);
+ const struct d3d_format **fmtp = PL_PRIV(fmt);
+ *fmt = d3d_fmt->fmt;
+ *fmtp = d3d_fmt;
+
+ // For sanity, clear the superfluous fields
+ for (int j = fmt->num_components; j < 4; j++) {
+ fmt->component_depth[j] = 0;
+ fmt->sample_order[j] = 0;
+ fmt->host_bits[j] = 0;
+ }
+
+ static const struct {
+ enum pl_fmt_caps caps;
+ UINT sup;
+ UINT sup2;
+ } support[] = {
+ {
+ .caps = PL_FMT_CAP_SAMPLEABLE,
+ .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D,
+ },
+ {
+ .caps = PL_FMT_CAP_STORABLE,
+                // SHADER_LOAD is for readonly images, which can use an SRV
+ .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D |
+ D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW |
+ D3D11_FORMAT_SUPPORT_SHADER_LOAD,
+ .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE,
+ },
+ {
+ .caps = PL_FMT_CAP_READWRITE,
+ .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D |
+ D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW,
+ .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD,
+ },
+ {
+ .caps = PL_FMT_CAP_LINEAR,
+ .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D |
+ D3D11_FORMAT_SUPPORT_SHADER_SAMPLE,
+ },
+ {
+ .caps = PL_FMT_CAP_RENDERABLE,
+ .sup = D3D11_FORMAT_SUPPORT_RENDER_TARGET,
+ },
+ {
+ .caps = PL_FMT_CAP_BLENDABLE,
+ .sup = D3D11_FORMAT_SUPPORT_RENDER_TARGET |
+ D3D11_FORMAT_SUPPORT_BLENDABLE,
+ },
+ {
+ .caps = PL_FMT_CAP_VERTEX,
+ .sup = D3D11_FORMAT_SUPPORT_IA_VERTEX_BUFFER,
+ },
+ {
+ .caps = PL_FMT_CAP_TEXEL_UNIFORM,
+ .sup = D3D11_FORMAT_SUPPORT_BUFFER |
+ D3D11_FORMAT_SUPPORT_SHADER_LOAD,
+ },
+ {
+ .caps = PL_FMT_CAP_TEXEL_STORAGE,
+                // SHADER_LOAD is for readonly buffers, which can use an SRV
+ .sup = D3D11_FORMAT_SUPPORT_BUFFER |
+ D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW |
+ D3D11_FORMAT_SUPPORT_SHADER_LOAD,
+ .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE,
+ },
+ {
+ .caps = PL_FMT_CAP_HOST_READABLE,
+ .sup = D3D11_FORMAT_SUPPORT_CPU_LOCKABLE,
+ },
+ };
+
+ for (int j = 0; j < PL_ARRAY_SIZE(support); j++) {
+ if ((sup & support[j].sup) == support[j].sup &&
+ (sup2.OutFormatSupport2 & support[j].sup2) == support[j].sup2)
+ {
+ fmt->caps |= support[j].caps;
+ }
+ }
+
+ // PL_FMT_CAP_STORABLE implies compute shaders, so don't set it if we
+ // don't have them
+ if (!gpu->glsl.compute)
+ fmt->caps &= ~PL_FMT_CAP_STORABLE;
+
+ // PL_FMT_CAP_READWRITE implies PL_FMT_CAP_STORABLE
+ if (!(fmt->caps & PL_FMT_CAP_STORABLE))
+ fmt->caps &= ~PL_FMT_CAP_READWRITE;
+
+        // `fmt->gatherable` requires PL_FMT_CAP_SAMPLEABLE
+ if ((fmt->caps & PL_FMT_CAP_SAMPLEABLE) &&
+ (sup & D3D11_FORMAT_SUPPORT_SHADER_GATHER))
+ {
+ fmt->gatherable = true;
+ }
+
+ // PL_FMT_CAP_BLITTABLE implies support for stretching, flipping and
+ // loose format conversion, which require a shader pass in D3D11
+ if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+ // On >=FL11_0, we use a compute pass, which supports 1D and 3D
+ // textures
+ if (fmt->caps & PL_FMT_CAP_STORABLE)
+ fmt->caps |= PL_FMT_CAP_BLITTABLE;
+ } else {
+ // On <FL11_0 we use a raster pass
+ static const enum pl_fmt_caps req = PL_FMT_CAP_RENDERABLE |
+ PL_FMT_CAP_SAMPLEABLE;
+ if ((fmt->caps & req) == req)
+ fmt->caps |= PL_FMT_CAP_BLITTABLE;
+ }
+
+ if (fmt->caps & (PL_FMT_CAP_VERTEX | PL_FMT_CAP_TEXEL_UNIFORM |
+ PL_FMT_CAP_TEXEL_STORAGE)) {
+ fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, ""));
+ pl_assert(fmt->glsl_type);
+ }
+
+ if (fmt->caps & (PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE))
+ fmt->glsl_format = pl_fmt_glsl_format(fmt, fmt->num_components);
+
+ fmt->fourcc = pl_fmt_fourcc(fmt);
+
+        // If the format ended up with no caps, D3D11 only supports it for
+        // purposes we don't care about, so skip it
+ if (!fmt->caps) {
+ pl_free(fmt);
+ continue;
+ }
+
+ PL_ARRAY_APPEND(gpu, formats, fmt);
+ }
+
+ gpu->formats = formats.elem;
+ gpu->num_formats = formats.num;
+}
diff --git a/src/d3d11/formats.h b/src/d3d11/formats.h
new file mode 100644
index 0000000..08336c0
--- /dev/null
+++ b/src/d3d11/formats.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+struct d3d_format {
+ DXGI_FORMAT dxfmt;
+ int minor; // The D3D11 minor version number which supports this format
+ struct pl_fmt_t fmt;
+};
+
+extern const struct d3d_format pl_d3d11_formats[];
+
+static inline DXGI_FORMAT fmt_to_dxgi(pl_fmt fmt)
+{
+ const struct d3d_format **fmtp = PL_PRIV(fmt);
+ return (*fmtp)->dxfmt;
+}
+
+void pl_d3d11_setup_formats(struct pl_gpu_t *gpu);
diff --git a/src/d3d11/gpu.c b/src/d3d11/gpu.c
new file mode 100644
index 0000000..05a08a3
--- /dev/null
+++ b/src/d3d11/gpu.c
@@ -0,0 +1,685 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <initguid.h>
+#include <windows.h>
+#include <versionhelpers.h>
+
+#include "common.h"
+#include "gpu.h"
+#include "formats.h"
+#include "glsl/spirv.h"
+
+#define DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES (0x8)
+
+struct timer_query {
+ ID3D11Query *ts_start;
+ ID3D11Query *ts_end;
+ ID3D11Query *disjoint;
+};
+
+struct pl_timer_t {
+ // Ring buffer of timer queries to use
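+    // `current` is the slot the next timer_start will use; `pending` counts
+    // submitted slots whose results have not been read back yet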
+ int current;
+ int pending;
+ struct timer_query queries[16];
+};
+
+void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ if (!timer)
+ return;
+ struct timer_query *query = &timer->queries[timer->current];
+
+    // Create the query objects lazily
+ if (!query->ts_start) {
+ D3D(ID3D11Device_CreateQuery(p->dev,
+ &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_start));
+ D3D(ID3D11Device_CreateQuery(p->dev,
+ &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_end));
+
+ // Measuring duration in D3D11 requires three queries: start and end
+ // timestamp queries, and a disjoint query containing a flag which says
+ // whether the timestamps are usable or if a discontinuity occurred
+ // between them, like a change in power state or clock speed. The
+ // disjoint query also contains the timer frequency, so the timestamps
+ // are useless without it.
+ D3D(ID3D11Device_CreateQuery(p->dev,
+ &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP_DISJOINT }, &query->disjoint));
+ }
+
+ // Query the start timestamp
+ ID3D11DeviceContext_Begin(p->imm, (ID3D11Asynchronous *) query->disjoint);
+ ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_start);
+ return;
+
+error:
+ SAFE_RELEASE(query->ts_start);
+ SAFE_RELEASE(query->ts_end);
+ SAFE_RELEASE(query->disjoint);
+}
+
+void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+
+ if (!timer)
+ return;
+ struct timer_query *query = &timer->queries[timer->current];
+
+ // Even if timer_start and timer_end are called in-order, timer_start might
+ // have failed to create the timer objects
+ if (!query->ts_start)
+ return;
+
+ // Query the end timestamp
+ ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_end);
+ ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->disjoint);
+
+ // Advance to the next set of queries, for the next call to timer_start
+ timer->current++;
+ if (timer->current >= PL_ARRAY_SIZE(timer->queries))
+ timer->current = 0; // Wrap around
+
+ // Increment the number of pending queries, unless the ring buffer is full,
+    // in which case timer->current now points to the oldest one, which will be
+ // dropped and reused
+ if (timer->pending < PL_ARRAY_SIZE(timer->queries))
+ timer->pending++;
+}
+
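+// Convert a tick count at the given frequency to nanoseconds. The conversion
+// is split into whole-second and remainder parts so the intermediate products
+// stay within 64 bits; a naive `timestamp * ns_per_s / freq` would overflow
+// for large tick counts.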
+static uint64_t timestamp_to_ns(uint64_t timestamp, uint64_t freq)
+{
+ static const uint64_t ns_per_s = 1000000000llu;
+ return timestamp / freq * ns_per_s + timestamp % freq * ns_per_s / freq;
+}
+
+static uint64_t d3d11_timer_query(pl_gpu gpu, pl_timer timer)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ HRESULT hr;
+
+ for (; timer->pending > 0; timer->pending--) {
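+        // Process the oldest outstanding query set first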
+ int index = timer->current - timer->pending;
+ if (index < 0)
+ index += PL_ARRAY_SIZE(timer->queries);
+ struct timer_query *query = &timer->queries[index];
+
+ UINT64 start, end;
+ D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dj;
+
+ // Fetch the results of each query, or on S_FALSE, return 0 to indicate
+ // the queries are still pending
+ D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+ (ID3D11Asynchronous *) query->disjoint, &dj, sizeof(dj),
+ D3D11_ASYNC_GETDATA_DONOTFLUSH));
+ if (hr == S_FALSE)
+ return 0;
+ D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+ (ID3D11Asynchronous *) query->ts_end, &end, sizeof(end),
+ D3D11_ASYNC_GETDATA_DONOTFLUSH));
+ if (hr == S_FALSE)
+ return 0;
+ D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+ (ID3D11Asynchronous *) query->ts_start, &start, sizeof(start),
+ D3D11_ASYNC_GETDATA_DONOTFLUSH));
+ if (hr == S_FALSE)
+ return 0;
+
+ // There was a discontinuity during the queries, so a timestamp can't be
+ // produced. Skip it and try the next one.
+ if (dj.Disjoint || !dj.Frequency)
+ continue;
+
+ // We got a result. Return it to the caller.
+ timer->pending--;
+ pl_d3d11_flush_message_queue(ctx, "After timer query");
+
+ uint64_t ns = timestamp_to_ns(end - start, dj.Frequency);
+ return PL_MAX(ns, 1);
+
+ error:
+ // There was an error fetching the timer result, so skip it and try the
+ // next one
+ continue;
+ }
+
+ // No more unprocessed results
+ return 0;
+}
+
+static void d3d11_timer_destroy(pl_gpu gpu, pl_timer timer)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ for (int i = 0; i < PL_ARRAY_SIZE(timer->queries); i++) {
+ SAFE_RELEASE(timer->queries[i].ts_start);
+ SAFE_RELEASE(timer->queries[i].ts_end);
+ SAFE_RELEASE(timer->queries[i].disjoint);
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After timer destroy");
+
+ pl_free(timer);
+}
+
+static pl_timer d3d11_timer_create(pl_gpu gpu)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ if (!p->has_timestamp_queries)
+ return NULL;
+
+ struct pl_timer_t *timer = pl_alloc_ptr(NULL, timer);
+ *timer = (struct pl_timer_t) {0};
+ return timer;
+}
+
+static int d3d11_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+ // Vulkan-style binding, where all descriptors are in the same namespace, is
+ // required to use SPIRV-Cross' HLSL resource mapping API, which targets
+ // resources by binding number
+ return 0;
+}
+
+static void d3d11_gpu_flush(pl_gpu gpu)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ ID3D11DeviceContext_Flush(p->imm);
+
+ pl_d3d11_flush_message_queue(ctx, "After gpu flush");
+}
+
+static void d3d11_gpu_finish(pl_gpu gpu)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ HRESULT hr;
+
+ if (p->finish_fence) {
+ p->finish_value++;
+ D3D(ID3D11Fence_SetEventOnCompletion(p->finish_fence, p->finish_value,
+ p->finish_event));
+ ID3D11DeviceContext4_Signal(p->imm4, p->finish_fence, p->finish_value);
+ ID3D11DeviceContext_Flush(p->imm);
+ WaitForSingleObject(p->finish_event, INFINITE);
+ } else {
+ ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) p->finish_query);
+
+ // D3D11 doesn't have blocking queries, but it does have blocking
+ // readback. As a performance hack to try to avoid polling, do a dummy
+ // copy/readback between two buffers. Hopefully this will block until
+ // all prior commands are finished. If it does, the first GetData call
+ // will return a result and we won't have to poll.
+ pl_buf_copy(gpu, p->finish_buf_dst, 0, p->finish_buf_src, 0, sizeof(uint32_t));
+ pl_buf_read(gpu, p->finish_buf_dst, 0, &(uint32_t) {0}, sizeof(uint32_t));
+
+ // Poll the event query until it completes
+ for (;;) {
+ BOOL idle;
+ D3D(hr = ID3D11DeviceContext_GetData(p->imm,
+ (ID3D11Asynchronous *) p->finish_query, &idle, sizeof(idle), 0));
+ if (hr == S_OK && idle)
+ break;
+ Sleep(1);
+ }
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After gpu finish");
+
+error:
+ return;
+}
+
+static bool d3d11_gpu_is_failed(pl_gpu gpu)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ if (ctx->is_failed)
+ return true;
+
+ // GetDeviceRemovedReason returns S_OK if the device isn't removed
+ HRESULT hr = ID3D11Device_GetDeviceRemovedReason(p->dev);
+ if (FAILED(hr)) {
+ ctx->is_failed = true;
+ pl_d3d11_after_error(ctx, hr);
+ }
+
+ return ctx->is_failed;
+}
+
+static void d3d11_gpu_destroy(pl_gpu gpu)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+
+ pl_buf_destroy(gpu, &p->finish_buf_src);
+ pl_buf_destroy(gpu, &p->finish_buf_dst);
+
+ // Release everything except the immediate context
+ SAFE_RELEASE(p->dev);
+ SAFE_RELEASE(p->dev1);
+ SAFE_RELEASE(p->dev5);
+ SAFE_RELEASE(p->imm1);
+ SAFE_RELEASE(p->imm4);
+ SAFE_RELEASE(p->vbuf.buf);
+ SAFE_RELEASE(p->ibuf.buf);
+ SAFE_RELEASE(p->rstate);
+ SAFE_RELEASE(p->dsstate);
+ for (int i = 0; i < PL_TEX_SAMPLE_MODE_COUNT; i++) {
+ for (int j = 0; j < PL_TEX_ADDRESS_MODE_COUNT; j++) {
+ SAFE_RELEASE(p->samplers[i][j]);
+ }
+ }
+ SAFE_RELEASE(p->finish_fence);
+ if (p->finish_event)
+ CloseHandle(p->finish_event);
+ SAFE_RELEASE(p->finish_query);
+
+ // Destroy the immediate context synchronously so referenced objects don't
+ // show up in the leak check
+ if (p->imm) {
+ ID3D11DeviceContext_ClearState(p->imm);
+ ID3D11DeviceContext_Flush(p->imm);
+ SAFE_RELEASE(p->imm);
+ }
+
+ pl_spirv_destroy(&p->spirv);
+ pl_free((void *) gpu);
+}
+
+pl_d3d11 pl_d3d11_get(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ if (impl->destroy == d3d11_gpu_destroy) {
+ struct pl_gpu_d3d11 *p = (struct pl_gpu_d3d11 *) impl;
+ return p->ctx->d3d11;
+ }
+
+ return NULL;
+}
+
+static bool load_d3d_compiler(pl_gpu gpu)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ HMODULE d3dcompiler = NULL;
+
+ static const struct {
+ const wchar_t *name;
+ bool inbox;
+ } compiler_dlls[] = {
+ // Try the inbox D3DCompiler first (Windows 8.1 and up)
+ { .name = L"d3dcompiler_47.dll", .inbox = true },
+ // Check for a packaged version of d3dcompiler_47.dll
+ { .name = L"d3dcompiler_47.dll" },
+ // Try d3dcompiler_46.dll from the Windows 8 SDK
+ { .name = L"d3dcompiler_46.dll" },
+ // Try d3dcompiler_43.dll from the June 2010 DirectX SDK
+ { .name = L"d3dcompiler_43.dll" },
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(compiler_dlls); i++) {
+ if (compiler_dlls[i].inbox) {
+ if (!IsWindows8Point1OrGreater())
+ continue;
+ d3dcompiler = LoadLibraryExW(compiler_dlls[i].name, NULL,
+ LOAD_LIBRARY_SEARCH_SYSTEM32);
+ } else {
+ d3dcompiler = LoadLibraryW(compiler_dlls[i].name);
+ }
+ if (!d3dcompiler)
+ continue;
+
+ p->D3DCompile = (void *) GetProcAddress(d3dcompiler, "D3DCompile");
+ if (!p->D3DCompile)
+ return false;
+ p->d3d_compiler_ver = pl_get_dll_version(compiler_dlls[i].name);
+
+ return true;
+ }
+
+ return false;
+}
+
+static struct pl_gpu_fns pl_fns_d3d11 = {
+ .tex_create = pl_d3d11_tex_create,
+ .tex_destroy = pl_d3d11_tex_destroy,
+ .tex_invalidate = pl_d3d11_tex_invalidate,
+ .tex_clear_ex = pl_d3d11_tex_clear_ex,
+ .tex_blit = pl_d3d11_tex_blit,
+ .tex_upload = pl_d3d11_tex_upload,
+ .tex_download = pl_d3d11_tex_download,
+ .buf_create = pl_d3d11_buf_create,
+ .buf_destroy = pl_d3d11_buf_destroy,
+ .buf_write = pl_d3d11_buf_write,
+ .buf_read = pl_d3d11_buf_read,
+ .buf_copy = pl_d3d11_buf_copy,
+ .desc_namespace = d3d11_desc_namespace,
+ .pass_create = pl_d3d11_pass_create,
+ .pass_destroy = pl_d3d11_pass_destroy,
+ .pass_run = pl_d3d11_pass_run,
+ .timer_create = d3d11_timer_create,
+ .timer_destroy = d3d11_timer_destroy,
+ .timer_query = d3d11_timer_query,
+ .gpu_flush = d3d11_gpu_flush,
+ .gpu_finish = d3d11_gpu_finish,
+ .gpu_is_failed = d3d11_gpu_is_failed,
+ .destroy = d3d11_gpu_destroy,
+};
+
+pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx)
+{
+ pl_assert(ctx->dev);
+ IDXGIDevice1 *dxgi_dev = NULL;
+ IDXGIAdapter1 *adapter = NULL;
+ IDXGIAdapter4 *adapter4 = NULL;
+ bool success = false;
+ HRESULT hr;
+
+ struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gpu_d3d11);
+ gpu->log = ctx->log;
+
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ uint32_t spirv_ver = PL_MIN(SPV_VERSION, PL_MAX_SPIRV_VER);
+ *p = (struct pl_gpu_d3d11) {
+ .ctx = ctx,
+ .impl = pl_fns_d3d11,
+ .dev = ctx->dev,
+ .spirv = pl_spirv_create(ctx->log, (struct pl_spirv_version) {
+ .env_version = pl_spirv_version_to_vulkan(spirv_ver),
+ .spv_version = spirv_ver,
+ }),
+ .vbuf.bind_flags = D3D11_BIND_VERTEX_BUFFER,
+ .ibuf.bind_flags = D3D11_BIND_INDEX_BUFFER,
+ };
+ if (!p->spirv)
+ goto error;
+
+ ID3D11Device_AddRef(p->dev);
+ ID3D11Device_GetImmediateContext(p->dev, &p->imm);
+
+ // Check D3D11.1 interfaces
+ hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device1,
+ (void **) &p->dev1);
+ if (SUCCEEDED(hr)) {
+ p->minor = 1;
+ ID3D11Device1_GetImmediateContext1(p->dev1, &p->imm1);
+ }
+
+ // Check D3D11.4 interfaces
+ hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device5,
+ (void **) &p->dev5);
+ if (SUCCEEDED(hr)) {
+ // There is no GetImmediateContext4 method
+ hr = ID3D11DeviceContext_QueryInterface(p->imm, &IID_ID3D11DeviceContext4,
+ (void **) &p->imm4);
+ if (SUCCEEDED(hr))
+ p->minor = 4;
+ }
+
+ PL_INFO(gpu, "Using Direct3D 11.%d runtime", p->minor);
+
+ D3D(ID3D11Device_QueryInterface(p->dev, &IID_IDXGIDevice1, (void **) &dxgi_dev));
+ D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter));
+
+ DXGI_ADAPTER_DESC1 adapter_desc = {0};
+ IDXGIAdapter1_GetDesc1(adapter, &adapter_desc);
+
+ // No resource can be larger than max_res_size in bytes
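+    // (per the D3D11 resource limits, this works out to a quarter of the
+    // dedicated VRAM, clamped between 128 MiB and 2 GiB)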
+ unsigned int max_res_size = PL_CLAMP(
+ D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_B_TERM * adapter_desc.DedicatedVideoMemory,
+ D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_A_TERM * 1024u * 1024u,
+ D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_C_TERM * 1024u * 1024u);
+
+ gpu->glsl = (struct pl_glsl_version) {
+ .version = 450,
+ .vulkan = true,
+ };
+
+ gpu->limits = (struct pl_gpu_limits) {
+ .max_buf_size = max_res_size,
+ .max_ssbo_size = max_res_size,
+ .max_vbo_size = max_res_size,
+ .align_vertex_stride = 1,
+
+ // Make up some values
+ .align_tex_xfer_offset = 32,
+ .align_tex_xfer_pitch = 1,
+ .fragment_queues = 1,
+ };
+
+ p->fl = ID3D11Device_GetFeatureLevel(p->dev);
+
+ // If we're not using FL9_x, we can use the same suballocated buffer as a
+ // vertex buffer and index buffer
+ if (p->fl >= D3D_FEATURE_LEVEL_10_0)
+ p->vbuf.bind_flags |= D3D11_BIND_INDEX_BUFFER;
+
+ if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+ gpu->limits.max_ubo_size = D3D11_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * CBUF_ELEM;
+ } else {
+ // 10level9 restriction:
+ // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
+ gpu->limits.max_ubo_size = 255 * CBUF_ELEM;
+ }
+
+ if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+ gpu->limits.max_tex_1d_dim = D3D11_REQ_TEXTURE1D_U_DIMENSION;
+ gpu->limits.max_tex_2d_dim = D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+ gpu->limits.max_tex_3d_dim = D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+ } else if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+ gpu->limits.max_tex_1d_dim = D3D10_REQ_TEXTURE1D_U_DIMENSION;
+ gpu->limits.max_tex_2d_dim = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+ gpu->limits.max_tex_3d_dim = D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+ } else if (p->fl >= D3D_FEATURE_LEVEL_9_3) {
+ gpu->limits.max_tex_2d_dim = D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+ // Same limit as FL9_1
+ gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+ } else {
+ gpu->limits.max_tex_2d_dim = D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+ gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+ }
+
+ if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+ gpu->limits.max_buffer_texels =
+ 1 << D3D11_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
+ }
+
+ if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+ gpu->glsl.compute = true;
+ gpu->limits.compute_queues = 1;
+ // Set `gpu->limits.blittable_1d_3d`, since `pl_tex_blit_compute`, which
+ // is used to emulate blits on 11_0 and up, supports 1D and 3D textures
+ gpu->limits.blittable_1d_3d = true;
+
+ gpu->glsl.max_shmem_size = D3D11_CS_TGSM_REGISTER_COUNT * sizeof(float);
+ gpu->glsl.max_group_threads = D3D11_CS_THREAD_GROUP_MAX_THREADS_PER_GROUP;
+ gpu->glsl.max_group_size[0] = D3D11_CS_THREAD_GROUP_MAX_X;
+ gpu->glsl.max_group_size[1] = D3D11_CS_THREAD_GROUP_MAX_Y;
+ gpu->glsl.max_group_size[2] = D3D11_CS_THREAD_GROUP_MAX_Z;
+ gpu->limits.max_dispatch[0] = gpu->limits.max_dispatch[1] =
+ gpu->limits.max_dispatch[2] =
+ D3D11_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION;
+ }
+
+ if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+ // The offset limits are defined by HLSL:
+ // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4-po--sm5---asm-
+ gpu->glsl.min_gather_offset = -32;
+ gpu->glsl.max_gather_offset = 31;
+ } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) {
+ // SM4.1 has no gather4_po, so the offset must be specified by an
+ // immediate with a range of [-8, 7]
+ // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4--sm4-1---asm-
+ // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sample--sm4---asm-#address-offset
+ gpu->glsl.min_gather_offset = -8;
+ gpu->glsl.max_gather_offset = 7;
+ }
+
+ if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
+ p->max_srvs = D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT;
+ } else {
+ // 10level9 restriction:
+ // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
+ p->max_srvs = 8;
+ }
+
+ if (p->fl >= D3D_FEATURE_LEVEL_11_1) {
+ p->max_uavs = D3D11_1_UAV_SLOT_COUNT;
+ } else {
+ p->max_uavs = D3D11_PS_CS_UAV_REGISTER_COUNT;
+ }
+
+ if (!load_d3d_compiler(gpu)) {
+ PL_FATAL(gpu, "Could not find D3DCompiler DLL");
+ goto error;
+ }
+ PL_INFO(gpu, "D3DCompiler version: %u.%u.%u.%u",
+ p->d3d_compiler_ver.major, p->d3d_compiler_ver.minor,
+ p->d3d_compiler_ver.build, p->d3d_compiler_ver.revision);
+
+ // Detect support for timestamp queries. Some FL9_x devices don't support them.
+ hr = ID3D11Device_CreateQuery(p->dev,
+ &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, NULL);
+ p->has_timestamp_queries = SUCCEEDED(hr);
+
+ pl_d3d11_setup_formats(gpu);
+
+ // The rasterizer state never changes, so create it here
+ D3D11_RASTERIZER_DESC rdesc = {
+ .FillMode = D3D11_FILL_SOLID,
+ .CullMode = D3D11_CULL_NONE,
+ .FrontCounterClockwise = FALSE,
+ .DepthClipEnable = TRUE, // Required for 10level9
+ .ScissorEnable = TRUE,
+ };
+ D3D(ID3D11Device_CreateRasterizerState(p->dev, &rdesc, &p->rstate));
+
+ // The depth stencil state never changes either, and we only set it to turn
+ // depth testing off so the debug layer doesn't complain about an unbound
+ // depth buffer
+ D3D11_DEPTH_STENCIL_DESC dsdesc = {
+ .DepthEnable = FALSE,
+ .DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL,
+ .DepthFunc = D3D11_COMPARISON_LESS,
+ .StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK,
+ .StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK,
+ .FrontFace = {
+ .StencilFailOp = D3D11_STENCIL_OP_KEEP,
+ .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
+ .StencilPassOp = D3D11_STENCIL_OP_KEEP,
+ .StencilFunc = D3D11_COMPARISON_ALWAYS,
+ },
+ .BackFace = {
+ .StencilFailOp = D3D11_STENCIL_OP_KEEP,
+ .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
+ .StencilPassOp = D3D11_STENCIL_OP_KEEP,
+ .StencilFunc = D3D11_COMPARISON_ALWAYS,
+ },
+ };
+ D3D(ID3D11Device_CreateDepthStencilState(p->dev, &dsdesc, &p->dsstate));
+
+ // Initialize the samplers
+ for (int sample_mode = 0; sample_mode < PL_TEX_SAMPLE_MODE_COUNT; sample_mode++) {
+ for (int address_mode = 0; address_mode < PL_TEX_ADDRESS_MODE_COUNT; address_mode++) {
+ static const D3D11_TEXTURE_ADDRESS_MODE d3d_address_mode[] = {
+ [PL_TEX_ADDRESS_CLAMP] = D3D11_TEXTURE_ADDRESS_CLAMP,
+ [PL_TEX_ADDRESS_REPEAT] = D3D11_TEXTURE_ADDRESS_WRAP,
+ [PL_TEX_ADDRESS_MIRROR] = D3D11_TEXTURE_ADDRESS_MIRROR,
+ };
+ static const D3D11_FILTER d3d_filter[] = {
+ [PL_TEX_SAMPLE_NEAREST] = D3D11_FILTER_MIN_MAG_MIP_POINT,
+ [PL_TEX_SAMPLE_LINEAR] = D3D11_FILTER_MIN_MAG_MIP_LINEAR,
+ };
+
+ D3D11_SAMPLER_DESC sdesc = {
+ .AddressU = d3d_address_mode[address_mode],
+ .AddressV = d3d_address_mode[address_mode],
+ .AddressW = d3d_address_mode[address_mode],
+ .ComparisonFunc = D3D11_COMPARISON_NEVER,
+ .MinLOD = 0,
+ .MaxLOD = D3D11_FLOAT32_MAX,
+ .MaxAnisotropy = 1,
+ .Filter = d3d_filter[sample_mode],
+ };
+ D3D(ID3D11Device_CreateSamplerState(p->dev, &sdesc,
+ &p->samplers[sample_mode][address_mode]));
+ }
+ }
+
+ hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter4,
+ (void **) &adapter4);
+ if (SUCCEEDED(hr)) {
+ DXGI_ADAPTER_DESC3 adapter_desc3 = {0};
+ IDXGIAdapter4_GetDesc3(adapter4, &adapter_desc3);
+
+ p->has_monitored_fences =
+ adapter_desc3.Flags & DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES;
+ }
+
+ // Try to create a D3D11.4 fence object to wait on in pl_gpu_finish()
+ if (p->dev5 && p->has_monitored_fences) {
+ hr = ID3D11Device5_CreateFence(p->dev5, 0, D3D11_FENCE_FLAG_NONE,
+ &IID_ID3D11Fence,
+ (void **) &p->finish_fence);
+ if (SUCCEEDED(hr)) {
+ p->finish_event = CreateEventW(NULL, FALSE, FALSE, NULL);
+ if (!p->finish_event) {
+ PL_ERR(gpu, "Failed to create finish() event");
+ goto error;
+ }
+ }
+ }
+
+    // If fences are not available, we will have to poll an event query instead
+ if (!p->finish_fence) {
+ // Buffers for dummy copy/readback (see d3d11_gpu_finish())
+ p->finish_buf_src = pl_buf_create(gpu, pl_buf_params(
+ .size = sizeof(uint32_t),
+ .drawable = true, // Make these vertex buffers for 10level9
+ .initial_data = &(uint32_t) {0x11223344},
+ ));
+ p->finish_buf_dst = pl_buf_create(gpu, pl_buf_params(
+ .size = sizeof(uint32_t),
+ .host_readable = true,
+ .drawable = true,
+ ));
+
+ D3D(ID3D11Device_CreateQuery(p->dev,
+ &(D3D11_QUERY_DESC) { D3D11_QUERY_EVENT }, &p->finish_query));
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After gpu create");
+
+ success = true;
+error:
+ SAFE_RELEASE(dxgi_dev);
+ SAFE_RELEASE(adapter);
+ SAFE_RELEASE(adapter4);
+ if (success) {
+ return pl_gpu_finalize(gpu);
+ } else {
+ d3d11_gpu_destroy(gpu);
+ return NULL;
+ }
+}
diff --git a/src/d3d11/gpu.h b/src/d3d11/gpu.h
new file mode 100644
index 0000000..cbc706a
--- /dev/null
+++ b/src/d3d11/gpu.h
@@ -0,0 +1,212 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdalign.h>
+#include <d3d11_4.h>
+#include <dxgi1_6.h>
+#include <d3dcompiler.h>
+#include <spirv_cross_c.h>
+
+#include "../gpu.h"
+#include "../glsl/spirv.h"
+
+#include "common.h"
+#include "utils.h"
+
+pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx);
+
+// --- pl_gpu internal structs and helpers
+
+// Size of one constant in a constant buffer
+#define CBUF_ELEM (sizeof(float[4]))
+
+struct d3d_stream_buf {
+ UINT bind_flags;
+ ID3D11Buffer *buf;
+ size_t size;
+ size_t used;
+ unsigned int align;
+};
+
+struct pl_gpu_d3d11 {
+ struct pl_gpu_fns impl;
+ struct d3d11_ctx *ctx;
+ ID3D11Device *dev;
+ ID3D11Device1 *dev1;
+ ID3D11Device5 *dev5;
+ ID3D11DeviceContext *imm;
+ ID3D11DeviceContext1 *imm1;
+ ID3D11DeviceContext4 *imm4;
+
+ // The Direct3D 11 minor version number
+ int minor;
+
+ pl_spirv spirv;
+
+ pD3DCompile D3DCompile;
+ struct dll_version d3d_compiler_ver;
+
+ // Device capabilities
+ D3D_FEATURE_LEVEL fl;
+ bool has_timestamp_queries;
+ bool has_monitored_fences;
+
+ int max_srvs;
+ int max_uavs;
+
+ // Streaming vertex and index buffers
+ struct d3d_stream_buf vbuf;
+ struct d3d_stream_buf ibuf;
+
+ // Shared rasterizer state
+ ID3D11RasterizerState *rstate;
+
+ // Shared depth-stencil state
+ ID3D11DepthStencilState *dsstate;
+
+ // Array of ID3D11SamplerStates for every combination of sample/address modes
+ ID3D11SamplerState *samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT];
+
+ // Resources for finish()
+ ID3D11Fence *finish_fence;
+ uint64_t finish_value;
+ HANDLE finish_event;
+ ID3D11Query *finish_query;
+ pl_buf finish_buf_src;
+ pl_buf finish_buf_dst;
+};
+
+void pl_d3d11_setup_formats(struct pl_gpu_t *gpu);
+
+void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer);
+void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer);
+
+struct pl_buf_d3d11 {
+ ID3D11Buffer *buf;
+ ID3D11Buffer *staging;
+ ID3D11ShaderResourceView *raw_srv;
+ ID3D11UnorderedAccessView *raw_uav;
+ ID3D11ShaderResourceView *texel_srv;
+ ID3D11UnorderedAccessView *texel_uav;
+
+ char *data;
+ bool dirty;
+};
+
+void pl_d3d11_buf_destroy(pl_gpu gpu, pl_buf buf);
+pl_buf pl_d3d11_buf_create(pl_gpu gpu, const struct pl_buf_params *params);
+void pl_d3d11_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, const void *data,
+ size_t size);
+bool pl_d3d11_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest,
+ size_t size);
+void pl_d3d11_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, pl_buf src,
+ size_t src_offset, size_t size);
+
+// Ensure a buffer is up-to-date with its system memory mirror before it is used
+void pl_d3d11_buf_resolve(pl_gpu gpu, pl_buf buf);
+
+struct pl_tex_d3d11 {
+ // res mirrors one of tex1d, tex2d or tex3d for convenience. It does not
+ // hold an additional reference to the texture object.
+ ID3D11Resource *res;
+
+ ID3D11Texture1D *tex1d;
+ ID3D11Texture2D *tex2d;
+ ID3D11Texture3D *tex3d;
+ int array_slice;
+
+ // Mirrors one of staging1d, staging2d, or staging3d, and doesn't hold a ref
+ ID3D11Resource *staging;
+
+ // Staging textures for pl_tex_download
+ ID3D11Texture1D *staging1d;
+ ID3D11Texture2D *staging2d;
+ ID3D11Texture3D *staging3d;
+
+ ID3D11ShaderResourceView *srv;
+ ID3D11RenderTargetView *rtv;
+ ID3D11UnorderedAccessView *uav;
+
+ // for tex_upload/download fallback code
+ pl_fmt texel_fmt;
+};
+
+void pl_d3d11_tex_destroy(pl_gpu gpu, pl_tex tex);
+pl_tex pl_d3d11_tex_create(pl_gpu gpu, const struct pl_tex_params *params);
+void pl_d3d11_tex_invalidate(pl_gpu gpu, pl_tex tex);
+void pl_d3d11_tex_clear_ex(pl_gpu gpu, pl_tex tex,
+ const union pl_clear_color color);
+void pl_d3d11_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params);
+bool pl_d3d11_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+bool pl_d3d11_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+
+// Constant buffer layout used for gl_NumWorkGroups emulation
+struct d3d_num_workgroups_buf {
+ alignas(CBUF_ELEM) uint32_t num_wgs[3];
+};
+
+enum {
+ HLSL_BINDING_NOT_USED = -1, // Slot should always be bound as NULL
+ HLSL_BINDING_NUM_WORKGROUPS = -2, // Slot used for gl_NumWorkGroups emulation
+};
+
+// Represents a specific shader stage in a pl_pass (VS, PS, CS)
+struct d3d_pass_stage {
+ // Lists for each resource type, to simplify binding in pl_pass_run. Indexes
+ // match the index of the arrays passed to the ID3D11DeviceContext methods.
+ // Entries are the index of pass->params.descriptors which should be bound
+ // in that position, or a HLSL_BINDING_* special value.
+ PL_ARRAY(int) cbvs;
+ PL_ARRAY(int) srvs;
+ PL_ARRAY(int) samplers;
+};
+
+struct pl_pass_d3d11 {
+ ID3D11PixelShader *ps;
+ ID3D11VertexShader *vs;
+ ID3D11ComputeShader *cs;
+ ID3D11InputLayout *layout;
+ ID3D11BlendState *bstate;
+
+ // gl_NumWorkGroups emulation
+ struct d3d_num_workgroups_buf last_num_wgs;
+ ID3D11Buffer *num_workgroups_buf;
+ bool num_workgroups_used;
+
+ // Maximum binding number
+ int max_binding;
+
+ struct d3d_pass_stage main; // PS and CS
+ struct d3d_pass_stage vertex;
+
+    // List of resources, as in `struct d3d_pass_stage`, except UAVs are shared
+ // between all shader stages
+ PL_ARRAY(int) uavs;
+
+ // Pre-allocated resource arrays to use in pl_pass_run
+ ID3D11Buffer **cbv_arr;
+ ID3D11ShaderResourceView **srv_arr;
+ ID3D11SamplerState **sampler_arr;
+ ID3D11UnorderedAccessView **uav_arr;
+};
+
+void pl_d3d11_pass_destroy(pl_gpu gpu, pl_pass pass);
+const struct pl_pass_t *pl_d3d11_pass_create(pl_gpu gpu,
+ const struct pl_pass_params *params);
+void pl_d3d11_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params);
diff --git a/src/d3d11/gpu_buf.c b/src/d3d11/gpu_buf.c
new file mode 100644
index 0000000..955e6e1
--- /dev/null
+++ b/src/d3d11/gpu_buf.c
@@ -0,0 +1,310 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "formats.h"
+
+void pl_d3d11_buf_destroy(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_buf_d3d11 *buf_p = PL_PRIV(buf);
+
+ SAFE_RELEASE(buf_p->buf);
+ SAFE_RELEASE(buf_p->staging);
+ SAFE_RELEASE(buf_p->raw_srv);
+ SAFE_RELEASE(buf_p->raw_uav);
+ SAFE_RELEASE(buf_p->texel_srv);
+ SAFE_RELEASE(buf_p->texel_uav);
+
+ pl_d3d11_flush_message_queue(ctx, "After buffer destroy");
+
+ pl_free((void *) buf);
+}
+
+pl_buf pl_d3d11_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_d3d11);
+ buf->params = *params;
+ buf->params.initial_data = NULL;
+
+ struct pl_buf_d3d11 *buf_p = PL_PRIV(buf);
+
+ D3D11_BUFFER_DESC desc = { .ByteWidth = params->size };
+
+ if (params->uniform && !params->format &&
+ (params->storable || params->drawable))
+ {
+ // TODO: Figure out what to do with these
+        PL_ERR(gpu, "Uniform buffers cannot be combined with any other buffer type");
+ goto error;
+ }
+
+ // TODO: Distinguish between uniform buffers and texel uniform buffers.
+ // Currently we assume that if uniform and format are set, it's a texel
+ // buffer and NOT a uniform buffer.
+ if (params->uniform && !params->format) {
+ desc.BindFlags |= D3D11_BIND_CONSTANT_BUFFER;
+ desc.ByteWidth = PL_ALIGN2(desc.ByteWidth, CBUF_ELEM);
+ }
+ if (params->uniform && params->format) {
+ desc.BindFlags |= D3D11_BIND_SHADER_RESOURCE;
+ }
+ if (params->storable) {
+ desc.BindFlags |= D3D11_BIND_UNORDERED_ACCESS
+ | D3D11_BIND_SHADER_RESOURCE;
+ desc.ByteWidth = PL_ALIGN2(desc.ByteWidth, sizeof(float));
+ desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
+ }
+ if (params->drawable) {
+ desc.BindFlags |= D3D11_BIND_VERTEX_BUFFER;
+
+ // In FL9_x, a vertex buffer can't also be an index buffer, so index
+ // buffers are unsupported in FL9_x for now
+ if (p->fl > D3D_FEATURE_LEVEL_9_3)
+ desc.BindFlags |= D3D11_BIND_INDEX_BUFFER;
+ }
+
+ char *data = NULL;
+
+ // D3D11 doesn't allow partial constant buffer updates without special
+ // conditions. To support partial buffer updates, keep a mirror of the
+ // buffer data in system memory and upload the whole thing before the buffer
+ // is used.
+ //
+ // Note: We don't use a staging buffer for this because of Intel.
+ // https://github.com/mpv-player/mpv/issues/5293
+ // https://crbug.com/593024
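+    // The mirror is uploaded with UpdateSubresource by pl_d3d11_buf_resolve()
+    // before the buffer is used.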
+ if (params->uniform && !params->format && params->host_writable) {
+ data = pl_zalloc(buf, desc.ByteWidth);
+ buf_p->data = data;
+ }
+
+ D3D11_SUBRESOURCE_DATA srdata = { 0 };
+ if (params->initial_data) {
+ if (desc.ByteWidth != params->size) {
+            // If the size had to be rounded up, uploading from
+ // params->initial_data is technically undefined behavior, so copy
+ // the initial data to an allocation first
+ if (!data)
+ data = pl_zalloc(buf, desc.ByteWidth);
+ srdata.pSysMem = data;
+ } else {
+ srdata.pSysMem = params->initial_data;
+ }
+
+ if (data)
+ memcpy(data, params->initial_data, params->size);
+ }
+
+ D3D(ID3D11Device_CreateBuffer(p->dev, &desc,
+ params->initial_data ? &srdata : NULL,
+ &buf_p->buf));
+
+ if (!buf_p->data)
+ pl_free(data);
+
+ // Create raw views for PL_DESC_BUF_STORAGE
+ if (params->storable) {
+        // An SRV is used for PL_DESC_ACCESS_READONLY
+ D3D11_SHADER_RESOURCE_VIEW_DESC sdesc = {
+ .Format = DXGI_FORMAT_R32_TYPELESS,
+ .ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX,
+ .BufferEx = {
+ .NumElements =
+ PL_ALIGN2(buf->params.size, sizeof(float)) / sizeof(float),
+ .Flags = D3D11_BUFFEREX_SRV_FLAG_RAW,
+ },
+ };
+ D3D(ID3D11Device_CreateShaderResourceView(p->dev,
+ (ID3D11Resource *) buf_p->buf, &sdesc, &buf_p->raw_srv));
+
+ // A UAV is used for all other access modes
+ D3D11_UNORDERED_ACCESS_VIEW_DESC udesc = {
+ .Format = DXGI_FORMAT_R32_TYPELESS,
+ .ViewDimension = D3D11_UAV_DIMENSION_BUFFER,
+ .Buffer = {
+ .NumElements =
+ PL_ALIGN2(buf->params.size, sizeof(float)) / sizeof(float),
+ .Flags = D3D11_BUFFER_UAV_FLAG_RAW,
+ },
+ };
+ D3D(ID3D11Device_CreateUnorderedAccessView(p->dev,
+ (ID3D11Resource *) buf_p->buf, &udesc, &buf_p->raw_uav));
+ }
+
+ // Create a typed SRV for PL_BUF_TEXEL_UNIFORM and PL_BUF_TEXEL_STORAGE
+ if (params->format) {
+ if (params->uniform) {
+ D3D11_SHADER_RESOURCE_VIEW_DESC sdesc = {
+ .Format = fmt_to_dxgi(params->format),
+ .ViewDimension = D3D11_SRV_DIMENSION_BUFFER,
+ .Buffer = {
+ .NumElements =
+ PL_ALIGN(buf->params.size, buf->params.format->texel_size)
+ / buf->params.format->texel_size,
+ },
+ };
+ D3D(ID3D11Device_CreateShaderResourceView(p->dev,
+ (ID3D11Resource *) buf_p->buf, &sdesc, &buf_p->texel_srv));
+ }
+
+ // Create a typed UAV for PL_BUF_TEXEL_STORAGE
+ if (params->storable) {
+ D3D11_UNORDERED_ACCESS_VIEW_DESC udesc = {
+ .Format = fmt_to_dxgi(buf->params.format),
+ .ViewDimension = D3D11_UAV_DIMENSION_BUFFER,
+ .Buffer = {
+ .NumElements =
+ PL_ALIGN(buf->params.size, buf->params.format->texel_size)
+ / buf->params.format->texel_size,
+ },
+ };
+ D3D(ID3D11Device_CreateUnorderedAccessView(p->dev,
+ (ID3D11Resource *) buf_p->buf, &udesc, &buf_p->texel_uav));
+ }
+ }
+
+ if (!buf_p->data) {
+ // Create the staging buffer regardless of whether params->host_readable
+ // is set or not, so that buf_copy can copy to system-memory-backed
+ // buffers
+ // TODO: Consider sharing a big staging buffer for this, rather than
+ // having one staging buffer per buffer
+ desc.BindFlags = 0;
+ desc.MiscFlags = 0;
+ desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ desc.Usage = D3D11_USAGE_STAGING;
+ D3D(ID3D11Device_CreateBuffer(p->dev, &desc, NULL, &buf_p->staging));
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After buffer create");
+
+ return buf;
+
+error:
+ pl_d3d11_buf_destroy(gpu, buf);
+ return NULL;
+}
+
+void pl_d3d11_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, const void *data,
+ size_t size)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct pl_buf_d3d11 *buf_p = PL_PRIV(buf);
+
+ if (buf_p->data) {
+ memcpy(buf_p->data + offset, data, size);
+ buf_p->dirty = true;
+ } else {
+ ID3D11DeviceContext_UpdateSubresource(p->imm,
+ (ID3D11Resource *) buf_p->buf, 0, (&(D3D11_BOX) {
+ .left = offset,
+ .top = 0,
+ .front = 0,
+ .right = offset + size,
+ .bottom = 1,
+ .back = 1,
+ }), data, 0, 0);
+ }
+}
+
+void pl_d3d11_buf_resolve(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct pl_buf_d3d11 *buf_p = PL_PRIV(buf);
+
+ if (!buf_p->data || !buf_p->dirty)
+ return;
+
+ ID3D11DeviceContext_UpdateSubresource(p->imm, (ID3D11Resource *) buf_p->buf,
+ 0, NULL, buf_p->data, 0, 0);
+}
+
+bool pl_d3d11_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest,
+ size_t size)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_buf_d3d11 *buf_p = PL_PRIV(buf);
+
+ // If there is a system-memory mirror of the buffer contents, use it
+ if (buf_p->data) {
+ memcpy(dest, buf_p->data + offset, size);
+ return true;
+ }
+
+ ID3D11DeviceContext_CopyResource(p->imm, (ID3D11Resource *) buf_p->staging,
+ (ID3D11Resource *) buf_p->buf);
+
+ D3D11_MAPPED_SUBRESOURCE lock;
+ D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) buf_p->staging, 0,
+ D3D11_MAP_READ, 0, &lock));
+
+ char *csrc = lock.pData;
+ memcpy(dest, csrc + offset, size);
+
+ ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource *) buf_p->staging, 0);
+
+ pl_d3d11_flush_message_queue(ctx, "After buffer read");
+
+ return true;
+
+error:
+ return false;
+}
+
+void pl_d3d11_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, pl_buf src,
+ size_t src_offset, size_t size)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_buf_d3d11 *src_p = PL_PRIV(src);
+ struct pl_buf_d3d11 *dst_p = PL_PRIV(dst);
+
+ // Handle system memory copies in case one or both of the buffers has a
+ // system memory mirror
+ if (src_p->data && dst_p->data) {
+ memcpy(dst_p->data + dst_offset, src_p->data + src_offset, size);
+ dst_p->dirty = true;
+ } else if (src_p->data) {
+ pl_d3d11_buf_write(gpu, dst, dst_offset, src_p->data + src_offset, size);
+ } else if (dst_p->data) {
+ if (pl_d3d11_buf_read(gpu, src, src_offset, dst_p->data + dst_offset, size)) {
+ dst_p->dirty = true;
+ } else {
+ PL_ERR(gpu, "Failed to read from GPU during buffer copy");
+ }
+ } else {
+ ID3D11DeviceContext_CopySubresourceRegion(p->imm,
+ (ID3D11Resource *) dst_p->buf, 0, dst_offset, 0, 0,
+ (ID3D11Resource *) src_p->buf, 0, (&(D3D11_BOX) {
+ .left = src_offset,
+ .top = 0,
+ .front = 0,
+ .right = src_offset + size,
+ .bottom = 1,
+ .back = 1,
+ }));
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After buffer copy");
+}
diff --git a/src/d3d11/gpu_pass.c b/src/d3d11/gpu_pass.c
new file mode 100644
index 0000000..0e46ccd
--- /dev/null
+++ b/src/d3d11/gpu_pass.c
@@ -0,0 +1,1293 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "formats.h"
+#include "glsl/spirv.h"
+#include "../cache.h"
+
+struct stream_buf_slice {
+ const void *data;
+ unsigned int size;
+ unsigned int offset;
+};
+
+// Upload one or more slices of single-use data to a suballocated dynamic
+// buffer. Only call this once per buffer per pass, since it will discard or
+// reallocate the buffer when full.
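+// The buffer is mapped with WRITE_NO_OVERWRITE and only discarded when the
+// write position wraps around, so slices written earlier in the frame remain
+// valid for in-flight draws.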
+static bool stream_buf_upload(pl_gpu gpu, struct d3d_stream_buf *stream,
+ struct stream_buf_slice *slices, int num_slices)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ unsigned int align = PL_DEF(stream->align, sizeof(float));
+
+ // Get total size, rounded up to the buffer's alignment
+ size_t size = 0;
+ for (int i = 0; i < num_slices; i++)
+ size += PL_ALIGN2(slices[i].size, align);
+
+ if (size > gpu->limits.max_buf_size) {
+ PL_ERR(gpu, "Streaming buffer is too large");
+        return false;
+ }
+
+ // If the data doesn't fit, realloc the buffer
+ if (size > stream->size) {
+ size_t new_size = stream->size;
+ // Arbitrary base size
+ if (!new_size)
+ new_size = 16 * 1024;
+ while (new_size < size)
+ new_size *= 2;
+ new_size = PL_MIN(new_size, gpu->limits.max_buf_size);
+
+ ID3D11Buffer *new_buf;
+ D3D11_BUFFER_DESC vbuf_desc = {
+ .ByteWidth = new_size,
+ .Usage = D3D11_USAGE_DYNAMIC,
+ .BindFlags = stream->bind_flags,
+ .CPUAccessFlags = D3D11_CPU_ACCESS_WRITE,
+ };
+ D3D(ID3D11Device_CreateBuffer(p->dev, &vbuf_desc, NULL, &new_buf));
+
+ SAFE_RELEASE(stream->buf);
+ stream->buf = new_buf;
+ stream->size = new_size;
+ stream->used = 0;
+ }
+
+ bool discard = false;
+ size_t offset = stream->used;
+ if (offset + size > stream->size) {
+ // We reached the end of the buffer, so discard and wrap around
+ discard = true;
+ offset = 0;
+ }
+
+ D3D11_MAPPED_SUBRESOURCE map = {0};
+ UINT type = discard ? D3D11_MAP_WRITE_DISCARD : D3D11_MAP_WRITE_NO_OVERWRITE;
+ D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) stream->buf, 0, type,
+ 0, &map));
+
+ // Upload each slice
+ char *cdata = map.pData;
+ stream->used = offset;
+ for (int i = 0; i < num_slices; i++) {
+ slices[i].offset = stream->used;
+ memcpy(cdata + slices[i].offset, slices[i].data, slices[i].size);
+ stream->used += PL_ALIGN2(slices[i].size, align);
+ }
+
+ ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource *) stream->buf, 0);
+
+ return true;
+
+error:
+ return false;
+}
+
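+// Map the device's feature level and the shader stage to the HLSL target
+// profile passed to D3DCompile; compute targets don't exist on FL9_x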
+static const char *get_shader_target(pl_gpu gpu, enum glsl_shader_stage stage)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ switch (p->fl) {
+ default:
+ switch (stage) {
+ case GLSL_SHADER_VERTEX: return "vs_5_0";
+ case GLSL_SHADER_FRAGMENT: return "ps_5_0";
+ case GLSL_SHADER_COMPUTE: return "cs_5_0";
+ }
+ break;
+ case D3D_FEATURE_LEVEL_10_1:
+ switch (stage) {
+ case GLSL_SHADER_VERTEX: return "vs_4_1";
+ case GLSL_SHADER_FRAGMENT: return "ps_4_1";
+ case GLSL_SHADER_COMPUTE: return "cs_4_1";
+ }
+ break;
+ case D3D_FEATURE_LEVEL_10_0:
+ switch (stage) {
+ case GLSL_SHADER_VERTEX: return "vs_4_0";
+ case GLSL_SHADER_FRAGMENT: return "ps_4_0";
+ case GLSL_SHADER_COMPUTE: return "cs_4_0";
+ }
+ break;
+ case D3D_FEATURE_LEVEL_9_3:
+ switch (stage) {
+ case GLSL_SHADER_VERTEX: return "vs_4_0_level_9_3";
+ case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_3";
+ case GLSL_SHADER_COMPUTE: return NULL;
+ }
+ break;
+ case D3D_FEATURE_LEVEL_9_2:
+ case D3D_FEATURE_LEVEL_9_1:
+ switch (stage) {
+ case GLSL_SHADER_VERTEX: return "vs_4_0_level_9_1";
+ case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_1";
+ case GLSL_SHADER_COMPUTE: return NULL;
+ }
+ break;
+ }
+ return NULL;
+}
+
+static SpvExecutionModel stage_to_spv(enum glsl_shader_stage stage)
+{
+ static const SpvExecutionModel spv_execution_model[] = {
+ [GLSL_SHADER_VERTEX] = SpvExecutionModelVertex,
+ [GLSL_SHADER_FRAGMENT] = SpvExecutionModelFragment,
+ [GLSL_SHADER_COMPUTE] = SpvExecutionModelGLCompute,
+ };
+ return spv_execution_model[stage];
+}
+
+#define SC(cmd) \
+ do { \
+ spvc_result res = (cmd); \
+ if (res != SPVC_SUCCESS) { \
+ PL_ERR(gpu, "%s: %s (%d) (%s:%d)", \
+ #cmd, sc ? spvc_context_get_last_error_string(sc) : "", \
+ res, __FILE__, __LINE__); \
+ goto error; \
+ } \
+ } while (0)
+
+// Some decorations, like SpvDecorationNonWritable, are actually found on the
+// members of a buffer block, rather than the buffer block itself. If all
+// members have a certain decoration, SPIRV-Cross considers it to apply to the
+// buffer block too, which determines things like whether a SRV or UAV is used
+// for an SSBO. This function checks if SPIRV-Cross considers a decoration to
+// apply to a buffer block.
+static spvc_result buffer_block_has_decoration(spvc_compiler sc_comp,
+ spvc_variable_id id,
+ SpvDecoration decoration,
+ bool *out)
+{
+ const SpvDecoration *decorations;
+ size_t num_decorations = 0;
+
+ spvc_result res = spvc_compiler_get_buffer_block_decorations(sc_comp, id,
+ &decorations, &num_decorations);
+ if (res != SPVC_SUCCESS)
+ return res;
+
+ for (size_t j = 0; j < num_decorations; j++) {
+ if (decorations[j] == decoration) {
+ *out = true;
+ return res;
+ }
+ }
+
+ *out = false;
+ return res;
+}
+
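+// Assign HLSL registers (cbuffer/texture/sampler/UAV slots) to all resources
+// of one SPIR-V resource type, and record which descriptor binding ends up in
+// which slot so pl_pass_run can bind the right resources later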
+static bool alloc_hlsl_reg_bindings(pl_gpu gpu, pl_pass pass,
+ struct d3d_pass_stage *pass_s,
+ spvc_context sc,
+ spvc_compiler sc_comp,
+ spvc_resources resources,
+ spvc_resource_type res_type,
+ enum glsl_shader_stage stage)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+ const spvc_reflected_resource *res_list;
+ size_t res_count;
+
+ SC(spvc_resources_get_resource_list_for_type(resources, res_type,
+ &res_list, &res_count));
+
+ // In a raster pass, one of the UAV slots is used by the runtime for the RTV
+ int uav_offset = stage == GLSL_SHADER_COMPUTE ? 0 : 1;
+ int max_uavs = p->max_uavs - uav_offset;
+
+ for (int i = 0; i < res_count; i++) {
+ unsigned int binding = spvc_compiler_get_decoration(sc_comp,
+ res_list[i].id, SpvDecorationBinding);
+ unsigned int descriptor_set = spvc_compiler_get_decoration(sc_comp,
+ res_list[i].id, SpvDecorationDescriptorSet);
+ if (descriptor_set != 0)
+ continue;
+
+ pass_p->max_binding = PL_MAX(pass_p->max_binding, binding);
+
+ spvc_hlsl_resource_binding hlslbind;
+ spvc_hlsl_resource_binding_init(&hlslbind);
+ hlslbind.stage = stage_to_spv(stage);
+ hlslbind.binding = binding;
+ hlslbind.desc_set = descriptor_set;
+
+ bool has_cbv = false, has_sampler = false, has_srv = false, has_uav = false;
+ switch (res_type) {
+ case SPVC_RESOURCE_TYPE_UNIFORM_BUFFER:
+ has_cbv = true;
+ break;
+ case SPVC_RESOURCE_TYPE_STORAGE_BUFFER:;
+ bool non_writable_bb = false;
+ SC(buffer_block_has_decoration(sc_comp, res_list[i].id,
+ SpvDecorationNonWritable, &non_writable_bb));
+ if (non_writable_bb) {
+ has_srv = true;
+ } else {
+ has_uav = true;
+ }
+ break;
+ case SPVC_RESOURCE_TYPE_STORAGE_IMAGE:;
+ bool non_writable = spvc_compiler_has_decoration(sc_comp,
+ res_list[i].id, SpvDecorationNonWritable);
+ if (non_writable) {
+ has_srv = true;
+ } else {
+ has_uav = true;
+ }
+ break;
+ case SPVC_RESOURCE_TYPE_SEPARATE_IMAGE:
+ has_srv = true;
+ break;
+ case SPVC_RESOURCE_TYPE_SAMPLED_IMAGE:;
+ spvc_type type = spvc_compiler_get_type_handle(sc_comp,
+ res_list[i].type_id);
+ SpvDim dimension = spvc_type_get_image_dimension(type);
+ // Uniform texel buffers are technically sampled images, but they
+ // aren't sampled from, so don't allocate a sampler
+ if (dimension != SpvDimBuffer)
+ has_sampler = true;
+ has_srv = true;
+ break;
+ default:
+ break;
+ }
+
+ if (has_cbv) {
+ hlslbind.cbv.register_binding = pass_s->cbvs.num;
+ PL_ARRAY_APPEND(pass, pass_s->cbvs, binding);
+ if (pass_s->cbvs.num > D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT) {
+ PL_ERR(gpu, "Too many constant buffers in shader");
+ goto error;
+ }
+ }
+
+ if (has_sampler) {
+ hlslbind.sampler.register_binding = pass_s->samplers.num;
+ PL_ARRAY_APPEND(pass, pass_s->samplers, binding);
+ if (pass_s->samplers.num > D3D11_COMMONSHADER_SAMPLER_SLOT_COUNT) {
+ PL_ERR(gpu, "Too many samplers in shader");
+ goto error;
+ }
+ }
+
+ if (has_srv) {
+ hlslbind.srv.register_binding = pass_s->srvs.num;
+ PL_ARRAY_APPEND(pass, pass_s->srvs, binding);
+ if (pass_s->srvs.num > p->max_srvs) {
+ PL_ERR(gpu, "Too many SRVs in shader");
+ goto error;
+ }
+ }
+
+ if (has_uav) {
+ // UAV registers are shared between the vertex and fragment shaders
+ // in a raster pass, so check if the UAV for this resource has
+ // already been allocated
+ bool uav_bound = false;
+ for (int j = 0; j < pass_p->uavs.num; j++) {
+ if (pass_p->uavs.elem[j] == binding) {
+ uav_bound = true;
+ break;
+ }
+ }
+
+ if (!uav_bound) {
+ hlslbind.uav.register_binding = pass_p->uavs.num + uav_offset;
+ PL_ARRAY_APPEND(pass, pass_p->uavs, binding);
+ if (pass_p->uavs.num > max_uavs) {
+ PL_ERR(gpu, "Too many UAVs in shader");
+ goto error;
+ }
+ }
+ }
+
+ SC(spvc_compiler_hlsl_add_resource_binding(sc_comp, &hlslbind));
+ }
+
+ return true;
+error:
+ return false;
+}
+
+static const char *shader_names[] = {
+ [GLSL_SHADER_VERTEX] = "vertex",
+ [GLSL_SHADER_FRAGMENT] = "fragment",
+ [GLSL_SHADER_COMPUTE] = "compute",
+};
+
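+// Compile a single GLSL shader stage to DXBC: the GLSL is compiled to SPIR-V,
+// translated to HLSL with SPIRV-Cross (allocating HLSL registers along the
+// way), and finally compiled to DXBC with D3DCompile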
+static ID3DBlob *shader_compile_glsl(pl_gpu gpu, pl_pass pass,
+ struct d3d_pass_stage *pass_s,
+ enum glsl_shader_stage stage,
+ const char *glsl)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+ void *tmp = pl_tmp(NULL);
+ spvc_context sc = NULL;
+ spvc_compiler sc_comp = NULL;
+ const char *hlsl = NULL;
+ ID3DBlob *out = NULL;
+ ID3DBlob *errors = NULL;
+ HRESULT hr;
+
+ pl_clock_t start = pl_clock_now();
+ pl_str spirv = pl_spirv_compile_glsl(p->spirv, tmp, gpu->glsl, stage, glsl);
+ if (!spirv.len)
+ goto error;
+
+ pl_clock_t after_glsl = pl_clock_now();
+ pl_log_cpu_time(gpu->log, start, after_glsl, "translating GLSL to SPIR-V");
+
+ SC(spvc_context_create(&sc));
+
+ spvc_parsed_ir sc_ir;
+ SC(spvc_context_parse_spirv(sc, (SpvId *) spirv.buf,
+ spirv.len / sizeof(SpvId), &sc_ir));
+
+ SC(spvc_context_create_compiler(sc, SPVC_BACKEND_HLSL, sc_ir,
+ SPVC_CAPTURE_MODE_TAKE_OWNERSHIP,
+ &sc_comp));
+
+ spvc_compiler_options sc_opts;
+ SC(spvc_compiler_create_compiler_options(sc_comp, &sc_opts));
+
+ int sc_shader_model;
+ if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+ sc_shader_model = 50;
+ } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) {
+ sc_shader_model = 41;
+ } else {
+ sc_shader_model = 40;
+ }
+
+ SC(spvc_compiler_options_set_uint(sc_opts,
+ SPVC_COMPILER_OPTION_HLSL_SHADER_MODEL, sc_shader_model));
+
+ // Unlike Vulkan and OpenGL, in D3D11, the clip-space is "flipped" with
+ // respect to framebuffer-space. In other words, if you render to a pixel at
+ // (0, -1), you have to sample from (0, 1) to get the value back. We unflip
+ // it by setting the following option, which inserts the equivalent of
+ // `gl_Position.y = -gl_Position.y` into the vertex shader
+ if (stage == GLSL_SHADER_VERTEX) {
+ SC(spvc_compiler_options_set_bool(sc_opts,
+ SPVC_COMPILER_OPTION_FLIP_VERTEX_Y, SPVC_TRUE));
+ }
+
+ // Bind readonly images and imageBuffers as SRVs. This is done because a lot
+ // of hardware (especially FL11_x hardware) has very poor format support for
+ // reading values from UAVs. It allows the common case of readonly and
+ // writeonly images to support more formats, though the less common case of
+ // readwrite images still requires format support for UAV loads (represented
+ // by the PL_FMT_CAP_READWRITE cap in libplacebo.)
+ //
+ // Note that setting this option comes at the cost of GLSL support. Readonly
+ // and readwrite images are the same type in GLSL, but SRV and UAV bound
+ // textures are different types in HLSL, so for example, a GLSL function
+ // with an image parameter may fail to compile as HLSL if it's called with a
+ // readonly image and a readwrite image at different call sites.
+ SC(spvc_compiler_options_set_bool(sc_opts,
+ SPVC_COMPILER_OPTION_HLSL_NONWRITABLE_UAV_TEXTURE_AS_SRV, SPVC_TRUE));
+
+ SC(spvc_compiler_install_compiler_options(sc_comp, sc_opts));
+
+ spvc_set active = NULL;
+ SC(spvc_compiler_get_active_interface_variables(sc_comp, &active));
+ spvc_resources resources = NULL;
+ SC(spvc_compiler_create_shader_resources_for_active_variables(
+ sc_comp, &resources, active));
+
+ // Allocate HLSL registers for each resource type
+ alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources,
+ SPVC_RESOURCE_TYPE_SAMPLED_IMAGE, stage);
+ alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources,
+ SPVC_RESOURCE_TYPE_SEPARATE_IMAGE, stage);
+ alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources,
+ SPVC_RESOURCE_TYPE_UNIFORM_BUFFER, stage);
+ alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources,
+ SPVC_RESOURCE_TYPE_STORAGE_BUFFER, stage);
+ alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources,
+ SPVC_RESOURCE_TYPE_STORAGE_IMAGE, stage);
+
+ if (stage == GLSL_SHADER_COMPUTE) {
+ // Check if the gl_NumWorkGroups builtin is used. If it is, we have to
+ // emulate it with a constant buffer, so allocate it a CBV register.
+ spvc_variable_id num_workgroups_id =
+ spvc_compiler_hlsl_remap_num_workgroups_builtin(sc_comp);
+ if (num_workgroups_id) {
+ pass_p->num_workgroups_used = true;
+
+ spvc_hlsl_resource_binding binding;
+ spvc_hlsl_resource_binding_init(&binding);
+ binding.stage = stage_to_spv(stage);
+ binding.binding = pass_p->max_binding + 1;
+
+ // Allocate a CBV register for the buffer
+ binding.cbv.register_binding = pass_s->cbvs.num;
+ PL_ARRAY_APPEND(pass, pass_s->cbvs, HLSL_BINDING_NUM_WORKGROUPS);
+ if (pass_s->cbvs.num >
+ D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT) {
+ PL_ERR(gpu, "Not enough constant buffer slots for gl_NumWorkGroups");
+ goto error;
+ }
+
+ spvc_compiler_set_decoration(sc_comp, num_workgroups_id,
+ SpvDecorationDescriptorSet, 0);
+ spvc_compiler_set_decoration(sc_comp, num_workgroups_id,
+ SpvDecorationBinding, binding.binding);
+
+ SC(spvc_compiler_hlsl_add_resource_binding(sc_comp, &binding));
+ }
+ }
+
+ SC(spvc_compiler_compile(sc_comp, &hlsl));
+
+ pl_clock_t after_spvc = pl_clock_now();
+ pl_log_cpu_time(gpu->log, after_glsl, after_spvc, "translating SPIR-V to HLSL");
+
+ hr = p->D3DCompile(hlsl, strlen(hlsl), NULL, NULL, NULL, "main",
+ get_shader_target(gpu, stage),
+ D3DCOMPILE_SKIP_VALIDATION | D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, &out,
+ &errors);
+ if (FAILED(hr)) {
+ SAFE_RELEASE(out);
+ PL_ERR(gpu, "D3DCompile failed: %s\n%.*s", pl_hresult_to_str(hr),
+ (int) ID3D10Blob_GetBufferSize(errors),
+ (char *) ID3D10Blob_GetBufferPointer(errors));
+ goto error;
+ }
+
+ pl_log_cpu_time(gpu->log, after_spvc, pl_clock_now(), "translating HLSL to DXBC");
+
+error:;
+ if (hlsl) {
+ int level = out ? PL_LOG_DEBUG : PL_LOG_ERR;
+ PL_MSG(gpu, level, "%s shader HLSL source:", shader_names[stage]);
+ pl_msg_source(gpu->log, level, hlsl);
+ }
+
+ if (sc)
+ spvc_context_destroy(sc);
+ SAFE_RELEASE(errors);
+ pl_free(tmp);
+ return out;
+}
+
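+// Header for a cached, pre-compiled pass. The cache object consists of this
+// header, followed by the resource binding arrays (main cbvs/srvs/samplers,
+// vertex cbvs/srvs/samplers, then uavs, stored as raw ints), and finally the
+// DXBC blobs for the vertex, fragment and compute shaders, in that order. The
+// header records the size of each part so the payload can be validated before
+// it is read back in d3d11_use_cached_program()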
+struct d3d11_cache_header {
+ uint64_t hash;
+ bool num_workgroups_used;
+ int num_main_cbvs;
+ int num_main_srvs;
+ int num_main_samplers;
+ int num_vertex_cbvs;
+ int num_vertex_srvs;
+ int num_vertex_samplers;
+ int num_uavs;
+ size_t vert_bc_len;
+ size_t frag_bc_len;
+ size_t comp_bc_len;
+};
+
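+// Hash the inputs that determine the compiled DXBC. Only the shader source is
+// hashed into the lookup key; the returned signature additionally covers the
+// compiler versions and feature level, so stale bytecode produced by a
+// different toolchain is rejected after lookup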
+static inline uint64_t pass_cache_signature(pl_gpu gpu, uint64_t *key,
+ const struct pl_pass_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+
+ uint64_t hash = CACHE_KEY_D3D_DXBC; // seed to uniquely identify d3d11 shaders
+
+ pl_hash_merge(&hash, pl_str0_hash(params->glsl_shader));
+ if (params->type == PL_PASS_RASTER)
+ pl_hash_merge(&hash, pl_str0_hash(params->vertex_shader));
+
+ // store a hash based on the shader bodies as the lookup key
+ if (key)
+ *key = hash;
+
+ // and add the compiler version information into the verification signature
+ pl_hash_merge(&hash, p->spirv->signature);
+
+ unsigned spvc_major, spvc_minor, spvc_patch;
+ spvc_get_version(&spvc_major, &spvc_minor, &spvc_patch);
+
+ pl_hash_merge(&hash, spvc_major);
+ pl_hash_merge(&hash, spvc_minor);
+ pl_hash_merge(&hash, spvc_patch);
+
+ pl_hash_merge(&hash, ((uint64_t)p->d3d_compiler_ver.major << 48)
+ | ((uint64_t)p->d3d_compiler_ver.minor << 32)
+ | ((uint64_t)p->d3d_compiler_ver.build << 16)
+ | (uint64_t)p->d3d_compiler_ver.revision);
+ pl_hash_merge(&hash, p->fl);
+
+ return hash;
+}
+
+static inline size_t cache_payload_size(struct d3d11_cache_header *header)
+{
+ size_t required = (header->num_main_cbvs + header->num_main_srvs +
+ header->num_main_samplers + header->num_vertex_cbvs +
+ header->num_vertex_srvs + header->num_vertex_samplers +
+ header->num_uavs) * sizeof(int) + header->vert_bc_len +
+ header->frag_bc_len + header->comp_bc_len;
+
+ return required;
+}
+
+static bool d3d11_use_cached_program(pl_gpu gpu, struct pl_pass_t *pass,
+ const struct pl_pass_params *params,
+ pl_cache_obj *obj, uint64_t *out_sig,
+ pl_str *vert_bc, pl_str *frag_bc, pl_str *comp_bc)
+{
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+ const pl_cache gpu_cache = pl_gpu_cache(gpu);
+ if (!gpu_cache)
+ return false;
+
+ *out_sig = pass_cache_signature(gpu, &obj->key, params);
+ if (!pl_cache_get(gpu_cache, obj))
+ return false;
+
+ pl_str cache = (pl_str) { obj->data, obj->size };
+ if (cache.len < sizeof(struct d3d11_cache_header))
+ return false;
+
+ struct d3d11_cache_header *header = (struct d3d11_cache_header *) cache.buf;
+ cache = pl_str_drop(cache, sizeof(*header));
+
+ if (header->hash != *out_sig)
+ return false;
+
+ // determine required cache size before reading anything
+ size_t required = cache_payload_size(header);
+
+ if (cache.len < required)
+ return false;
+
+ pass_p->num_workgroups_used = header->num_workgroups_used;
+
+#define GET_ARRAY(object, name, num_elems) \
+ do { \
+ PL_ARRAY_MEMDUP(pass, (object)->name, cache.buf, num_elems); \
+ cache = pl_str_drop(cache, num_elems * sizeof(*(object)->name.elem)); \
+ } while (0)
+
+#define GET_STAGE_ARRAY(stage, name) \
+ GET_ARRAY(&pass_p->stage, name, header->num_##stage##_##name)
+
+ GET_STAGE_ARRAY(main, cbvs);
+ GET_STAGE_ARRAY(main, srvs);
+ GET_STAGE_ARRAY(main, samplers);
+ GET_STAGE_ARRAY(vertex, cbvs);
+ GET_STAGE_ARRAY(vertex, srvs);
+ GET_STAGE_ARRAY(vertex, samplers);
+ GET_ARRAY(pass_p, uavs, header->num_uavs);
+
+#define GET_SHADER(ptr) \
+ do { \
+ if (ptr) \
+ *ptr = pl_str_take(cache, header->ptr##_len); \
+ cache = pl_str_drop(cache, header->ptr##_len); \
+ } while (0)
+
+ GET_SHADER(vert_bc);
+ GET_SHADER(frag_bc);
+ GET_SHADER(comp_bc);
+
+ return true;
+}
+
+static void d3d11_update_program_cache(pl_gpu gpu, struct pl_pass_t *pass,
+ uint64_t key, uint64_t sig,
+ const pl_str *vs_str, const pl_str *ps_str,
+ const pl_str *cs_str)
+{
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+ const pl_cache gpu_cache = pl_gpu_cache(gpu);
+ if (!gpu_cache)
+ return;
+
+ struct d3d11_cache_header header = {
+ .hash = sig,
+ .num_workgroups_used = pass_p->num_workgroups_used,
+ .num_main_cbvs = pass_p->main.cbvs.num,
+ .num_main_srvs = pass_p->main.srvs.num,
+ .num_main_samplers = pass_p->main.samplers.num,
+ .num_vertex_cbvs = pass_p->vertex.cbvs.num,
+ .num_vertex_srvs = pass_p->vertex.srvs.num,
+ .num_vertex_samplers = pass_p->vertex.samplers.num,
+ .num_uavs = pass_p->uavs.num,
+ .vert_bc_len = vs_str ? vs_str->len : 0,
+ .frag_bc_len = ps_str ? ps_str->len : 0,
+ .comp_bc_len = cs_str ? cs_str->len : 0,
+ };
+
+ size_t cache_size = sizeof(header) + cache_payload_size(&header);
+ pl_str cache = {0};
+ pl_str_append(NULL, &cache, (pl_str){ (uint8_t *) &header, sizeof(header) });
+
+#define WRITE_ARRAY(name) pl_str_append(NULL, &cache, \
+ (pl_str){ (uint8_t *) pass_p->name.elem, \
+ sizeof(*pass_p->name.elem) * pass_p->name.num })
+ WRITE_ARRAY(main.cbvs);
+ WRITE_ARRAY(main.srvs);
+ WRITE_ARRAY(main.samplers);
+ WRITE_ARRAY(vertex.cbvs);
+ WRITE_ARRAY(vertex.srvs);
+ WRITE_ARRAY(vertex.samplers);
+ WRITE_ARRAY(uavs);
+
+ if (vs_str)
+ pl_str_append(NULL, &cache, *vs_str);
+
+ if (ps_str)
+ pl_str_append(NULL, &cache, *ps_str);
+
+ if (cs_str)
+ pl_str_append(NULL, &cache, *cs_str);
+
+ pl_assert(cache_size == cache.len);
+ pl_cache_str(gpu_cache, key, &cache);
+}
+
+void pl_d3d11_pass_destroy(pl_gpu gpu, pl_pass pass)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+
+ SAFE_RELEASE(pass_p->vs);
+ SAFE_RELEASE(pass_p->ps);
+ SAFE_RELEASE(pass_p->cs);
+ SAFE_RELEASE(pass_p->layout);
+ SAFE_RELEASE(pass_p->bstate);
+ SAFE_RELEASE(pass_p->num_workgroups_buf);
+
+ pl_d3d11_flush_message_queue(ctx, "After pass destroy");
+
+ pl_free((void *) pass);
+}
+
+static bool pass_create_raster(pl_gpu gpu, struct pl_pass_t *pass,
+ const struct pl_pass_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+ ID3DBlob *vs_blob = NULL;
+ pl_str vs_str = {0};
+ ID3DBlob *ps_blob = NULL;
+ pl_str ps_str = {0};
+ D3D11_INPUT_ELEMENT_DESC *in_descs = NULL;
+ pl_cache_obj obj = {0};
+ uint64_t sig = 0;
+ bool success = false;
+
+ if (d3d11_use_cached_program(gpu, pass, params, &obj, &sig, &vs_str, &ps_str, NULL))
+ PL_DEBUG(gpu, "Using cached DXBC shaders");
+
+ pl_assert((vs_str.len == 0) == (ps_str.len == 0));
+ if (vs_str.len == 0) {
+ vs_blob = shader_compile_glsl(gpu, pass, &pass_p->vertex,
+ GLSL_SHADER_VERTEX, params->vertex_shader);
+ if (!vs_blob)
+ goto error;
+
+ vs_str = (pl_str) {
+ .buf = ID3D10Blob_GetBufferPointer(vs_blob),
+ .len = ID3D10Blob_GetBufferSize(vs_blob),
+ };
+
+ ps_blob = shader_compile_glsl(gpu, pass, &pass_p->main,
+ GLSL_SHADER_FRAGMENT, params->glsl_shader);
+ if (!ps_blob)
+ goto error;
+
+ ps_str = (pl_str) {
+ .buf = ID3D10Blob_GetBufferPointer(ps_blob),
+ .len = ID3D10Blob_GetBufferSize(ps_blob),
+ };
+ }
+
+ D3D(ID3D11Device_CreateVertexShader(p->dev, vs_str.buf, vs_str.len, NULL,
+ &pass_p->vs));
+
+ D3D(ID3D11Device_CreatePixelShader(p->dev, ps_str.buf, ps_str.len, NULL,
+ &pass_p->ps));
+
+ in_descs = pl_calloc_ptr(pass, params->num_vertex_attribs, in_descs);
+ for (int i = 0; i < params->num_vertex_attribs; i++) {
+ struct pl_vertex_attrib *va = &params->vertex_attribs[i];
+
+ in_descs[i] = (D3D11_INPUT_ELEMENT_DESC) {
+ // The semantic name doesn't mean much and is just used to verify
+ // the input description matches the shader. SPIRV-Cross always
+ // uses TEXCOORD, so we should too.
+ .SemanticName = "TEXCOORD",
+ .SemanticIndex = va->location,
+ .AlignedByteOffset = va->offset,
+ .Format = fmt_to_dxgi(va->fmt),
+ };
+ }
+ D3D(ID3D11Device_CreateInputLayout(p->dev, in_descs,
+ params->num_vertex_attribs, vs_str.buf, vs_str.len, &pass_p->layout));
+
+ static const D3D11_BLEND blend_options[] = {
+ [PL_BLEND_ZERO] = D3D11_BLEND_ZERO,
+ [PL_BLEND_ONE] = D3D11_BLEND_ONE,
+ [PL_BLEND_SRC_ALPHA] = D3D11_BLEND_SRC_ALPHA,
+ [PL_BLEND_ONE_MINUS_SRC_ALPHA] = D3D11_BLEND_INV_SRC_ALPHA,
+ };
+
+ D3D11_BLEND_DESC bdesc = {
+ .RenderTarget[0] = {
+ .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL,
+ },
+ };
+ if (params->blend_params) {
+ bdesc.RenderTarget[0] = (D3D11_RENDER_TARGET_BLEND_DESC) {
+ .BlendEnable = TRUE,
+ .SrcBlend = blend_options[params->blend_params->src_rgb],
+ .DestBlend = blend_options[params->blend_params->dst_rgb],
+ .BlendOp = D3D11_BLEND_OP_ADD,
+ .SrcBlendAlpha = blend_options[params->blend_params->src_alpha],
+ .DestBlendAlpha = blend_options[params->blend_params->dst_alpha],
+ .BlendOpAlpha = D3D11_BLEND_OP_ADD,
+ .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL,
+ };
+ }
+ D3D(ID3D11Device_CreateBlendState(p->dev, &bdesc, &pass_p->bstate));
+
+ d3d11_update_program_cache(gpu, pass, obj.key, sig, &vs_str, &ps_str, NULL);
+
+ success = true;
+error:
+ SAFE_RELEASE(vs_blob);
+ SAFE_RELEASE(ps_blob);
+ pl_cache_obj_free(&obj);
+ pl_free(in_descs);
+ return success;
+}
+
+static bool pass_create_compute(pl_gpu gpu, struct pl_pass_t *pass,
+ const struct pl_pass_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+ ID3DBlob *cs_blob = NULL;
+ pl_str cs_str = {0};
+ pl_cache_obj obj = {0};
+ uint64_t sig = 0;
+ bool success = false;
+
+ if (d3d11_use_cached_program(gpu, pass, params, &obj, &sig, NULL, NULL, &cs_str))
+ PL_DEBUG(gpu, "Using cached DXBC shader");
+
+ if (cs_str.len == 0) {
+ cs_blob = shader_compile_glsl(gpu, pass, &pass_p->main,
+ GLSL_SHADER_COMPUTE, params->glsl_shader);
+ if (!cs_blob)
+ goto error;
+
+ cs_str = (pl_str) {
+ .buf = ID3D10Blob_GetBufferPointer(cs_blob),
+ .len = ID3D10Blob_GetBufferSize(cs_blob),
+ };
+ }
+
+ D3D(ID3D11Device_CreateComputeShader(p->dev, cs_str.buf, cs_str.len, NULL,
+ &pass_p->cs));
+
+ if (pass_p->num_workgroups_used) {
+ D3D11_BUFFER_DESC bdesc = {
+ .BindFlags = D3D11_BIND_CONSTANT_BUFFER,
+ .ByteWidth = sizeof(pass_p->last_num_wgs),
+ };
+ D3D(ID3D11Device_CreateBuffer(p->dev, &bdesc, NULL,
+ &pass_p->num_workgroups_buf));
+ }
+
+ d3d11_update_program_cache(gpu, pass, obj.key, sig, NULL, NULL, &cs_str);
+
+ success = true;
+error:
+ pl_cache_obj_free(&obj);
+ SAFE_RELEASE(cs_blob);
+ return success;
+}
+
+const struct pl_pass_t *pl_d3d11_pass_create(pl_gpu gpu,
+ const struct pl_pass_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_d3d11);
+ pass->params = pl_pass_params_copy(pass, params);
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+ *pass_p = (struct pl_pass_d3d11) {
+ .max_binding = -1,
+ };
+
+ if (params->type == PL_PASS_COMPUTE) {
+ if (!pass_create_compute(gpu, pass, params))
+ goto error;
+ } else {
+ if (!pass_create_raster(gpu, pass, params))
+ goto error;
+ }
+
+ // Pre-allocate resource arrays to use in pl_pass_run
+ pass_p->cbv_arr = pl_calloc(pass,
+ PL_MAX(pass_p->main.cbvs.num, pass_p->vertex.cbvs.num),
+ sizeof(*pass_p->cbv_arr));
+ pass_p->srv_arr = pl_calloc(pass,
+ PL_MAX(pass_p->main.srvs.num, pass_p->vertex.srvs.num),
+ sizeof(*pass_p->srv_arr));
+ pass_p->sampler_arr = pl_calloc(pass,
+ PL_MAX(pass_p->main.samplers.num, pass_p->vertex.samplers.num),
+ sizeof(*pass_p->sampler_arr));
+ pass_p->uav_arr = pl_calloc(pass, pass_p->uavs.num, sizeof(*pass_p->uav_arr));
+
+ // Find the highest binding number used in `params->descriptors` if we
+ // haven't found it already. (If the shader was compiled fresh rather than
+ // loaded from cache, `pass_p->max_binding` should already be set.)
+ if (pass_p->max_binding == -1) {
+ for (int i = 0; i < params->num_descriptors; i++) {
+ pass_p->max_binding = PL_MAX(pass_p->max_binding,
+ params->descriptors[i].binding);
+ }
+ }
+
+ // Build a mapping from binding numbers to descriptor array indexes
+ int *binding_map = pl_calloc_ptr(pass, pass_p->max_binding + 1, binding_map);
+ for (int i = 0; i <= pass_p->max_binding; i++)
+ binding_map[i] = HLSL_BINDING_NOT_USED;
+ for (int i = 0; i < params->num_descriptors; i++)
+ binding_map[params->descriptors[i].binding] = i;
+
+#define MAP_RESOURCES(array) \
+ do { \
+ for (int i = 0; i < array.num; i++) { \
+ if (array.elem[i] > pass_p->max_binding) { \
+ array.elem[i] = HLSL_BINDING_NOT_USED; \
+ } else if (array.elem[i] >= 0) { \
+ array.elem[i] = binding_map[array.elem[i]]; \
+ } \
+ } \
+ } while (0)
+
+ // During shader compilation (or after loading a compiled shader from cache)
+ // the entries of the following resource lists are shader binding numbers,
+ // however, it's more efficient for `pl_pass_run` if they refer to indexes
+ // of the `params->descriptors` array instead, so remap them here
+ MAP_RESOURCES(pass_p->main.cbvs);
+ MAP_RESOURCES(pass_p->main.samplers);
+ MAP_RESOURCES(pass_p->main.srvs);
+ MAP_RESOURCES(pass_p->vertex.cbvs);
+ MAP_RESOURCES(pass_p->vertex.samplers);
+ MAP_RESOURCES(pass_p->vertex.srvs);
+ MAP_RESOURCES(pass_p->uavs);
+ pl_free(binding_map);
+
+ pl_d3d11_flush_message_queue(ctx, "After pass create");
+
+ return pass;
+
+error:
+ pl_d3d11_pass_destroy(gpu, pass);
+ return NULL;
+}
+
+// Shared logic between VS, PS and CS for filling the resource arrays that are
+// passed to ID3D11DeviceContext methods
+static void fill_resources(pl_gpu gpu, pl_pass pass,
+ struct d3d_pass_stage *pass_s,
+ const struct pl_pass_run_params *params,
+ ID3D11Buffer **cbvs, ID3D11ShaderResourceView **srvs,
+ ID3D11SamplerState **samplers)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+
+ for (int i = 0; i < pass_s->cbvs.num; i++) {
+ int binding = pass_s->cbvs.elem[i];
+ if (binding == HLSL_BINDING_NUM_WORKGROUPS) {
+ cbvs[i] = pass_p->num_workgroups_buf;
+ continue;
+ } else if (binding < 0) {
+ cbvs[i] = NULL;
+ continue;
+ }
+
+ pl_buf buf = params->desc_bindings[binding].object;
+ pl_d3d11_buf_resolve(gpu, buf);
+ struct pl_buf_d3d11 *buf_p = PL_PRIV(buf);
+ cbvs[i] = buf_p->buf;
+ }
+
+ for (int i = 0; i < pass_s->srvs.num; i++) {
+ int binding = pass_s->srvs.elem[i];
+ if (binding < 0) {
+ srvs[i] = NULL;
+ continue;
+ }
+
+ pl_tex tex;
+ struct pl_tex_d3d11 *tex_p;
+ pl_buf buf;
+ struct pl_buf_d3d11 *buf_p;
+ switch (pass->params.descriptors[binding].type) {
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG:
+ tex = params->desc_bindings[binding].object;
+ tex_p = PL_PRIV(tex);
+ srvs[i] = tex_p->srv;
+ break;
+ case PL_DESC_BUF_STORAGE:
+ buf = params->desc_bindings[binding].object;
+ buf_p = PL_PRIV(buf);
+ srvs[i] = buf_p->raw_srv;
+ break;
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ buf = params->desc_bindings[binding].object;
+ buf_p = PL_PRIV(buf);
+ srvs[i] = buf_p->texel_srv;
+ break;
+ default:
+ break;
+ }
+ }
+
+ for (int i = 0; i < pass_s->samplers.num; i++) {
+ int binding = pass_s->samplers.elem[i];
+ if (binding < 0) {
+ samplers[i] = NULL;
+ continue;
+ }
+
+ struct pl_desc_binding *db = &params->desc_bindings[binding];
+ samplers[i] = p->samplers[db->sample_mode][db->address_mode];
+ }
+}
+
+static void fill_uavs(pl_pass pass, const struct pl_pass_run_params *params,
+ ID3D11UnorderedAccessView **uavs)
+{
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+
+ for (int i = 0; i < pass_p->uavs.num; i++) {
+ int binding = pass_p->uavs.elem[i];
+ if (binding < 0) {
+ uavs[i] = NULL;
+ continue;
+ }
+
+ pl_tex tex;
+ struct pl_tex_d3d11 *tex_p;
+ pl_buf buf;
+ struct pl_buf_d3d11 *buf_p;
+ switch (pass->params.descriptors[binding].type) {
+ case PL_DESC_BUF_STORAGE:
+ buf = params->desc_bindings[binding].object;
+ buf_p = PL_PRIV(buf);
+ uavs[i] = buf_p->raw_uav;
+ break;
+ case PL_DESC_STORAGE_IMG:
+ tex = params->desc_bindings[binding].object;
+ tex_p = PL_PRIV(tex);
+ uavs[i] = tex_p->uav;
+ break;
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ buf = params->desc_bindings[binding].object;
+ buf_p = PL_PRIV(buf);
+ uavs[i] = buf_p->texel_uav;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void pass_run_raster(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ pl_pass pass = params->pass;
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+
+ if (p->fl <= D3D_FEATURE_LEVEL_9_3 && params->index_buf) {
+ // Index buffers are unsupported because we can't tell if they are an
+ // index buffer or a vertex buffer on creation, and FL9_x allows only
+ // one binding type per buffer
+ PL_ERR(gpu, "Index buffers are unsupported in FL9_x");
+ return;
+ }
+
+ if (p->fl <= D3D_FEATURE_LEVEL_9_1 && params->index_data &&
+ params->index_fmt != PL_INDEX_UINT16)
+ {
+ PL_ERR(gpu, "32-bit index format is unsupported in FL9_1");
+ return;
+ }
+
+ // Figure out how much vertex/index data to upload, if any
+ size_t vertex_alloc = params->vertex_data ? pl_vertex_buf_size(params) : 0;
+ size_t index_alloc = params->index_data ? pl_index_buf_size(params) : 0;
+
+ static const DXGI_FORMAT index_fmts[PL_INDEX_FORMAT_COUNT] = {
+ [PL_INDEX_UINT16] = DXGI_FORMAT_R16_UINT,
+ [PL_INDEX_UINT32] = DXGI_FORMAT_R32_UINT,
+ };
+
+ // Upload vertex data. On >=FL10_0 we use the same buffer for index data, so
+ // upload that too.
+ bool share_vertex_index_buf = p->fl > D3D_FEATURE_LEVEL_9_3;
+ if (vertex_alloc || (share_vertex_index_buf && index_alloc)) {
+ struct stream_buf_slice slices[] = {
+ { .data = params->vertex_data, .size = vertex_alloc },
+ { .data = params->index_data, .size = index_alloc },
+ };
+
+ if (!stream_buf_upload(gpu, &p->vbuf, slices,
+ share_vertex_index_buf ? 2 : 1)) {
+ PL_ERR(gpu, "Failed to upload vertex data");
+ return;
+ }
+
+ if (vertex_alloc) {
+ ID3D11DeviceContext_IASetVertexBuffers(p->imm, 0, 1, &p->vbuf.buf,
+ &(UINT) { pass->params.vertex_stride }, &slices[0].offset);
+ }
+ if (share_vertex_index_buf && index_alloc) {
+ ID3D11DeviceContext_IASetIndexBuffer(p->imm, p->vbuf.buf,
+ index_fmts[params->index_fmt], slices[1].offset);
+ }
+ }
+
+ // Upload index data for <=FL9_3, which must be in its own buffer
+ if (!share_vertex_index_buf && index_alloc) {
+ struct stream_buf_slice slices[] = {
+ { .data = params->index_data, .size = index_alloc },
+ };
+
+ if (!stream_buf_upload(gpu, &p->ibuf, slices, PL_ARRAY_SIZE(slices))) {
+ PL_ERR(gpu, "Failed to upload index data");
+ return;
+ }
+
+ ID3D11DeviceContext_IASetIndexBuffer(p->imm, p->ibuf.buf,
+ index_fmts[params->index_fmt], slices[0].offset);
+ }
+
+ if (params->vertex_buf) {
+ struct pl_buf_d3d11 *buf_p = PL_PRIV(params->vertex_buf);
+ ID3D11DeviceContext_IASetVertexBuffers(p->imm, 0, 1, &buf_p->buf,
+ &(UINT) { pass->params.vertex_stride },
+ &(UINT) { params->buf_offset });
+ }
+
+ if (params->index_buf) {
+ struct pl_buf_d3d11 *buf_p = PL_PRIV(params->index_buf);
+ ID3D11DeviceContext_IASetIndexBuffer(p->imm, buf_p->buf,
+ index_fmts[params->index_fmt], params->index_offset);
+ }
+
+ ID3D11DeviceContext_IASetInputLayout(p->imm, pass_p->layout);
+
+ static const D3D_PRIMITIVE_TOPOLOGY prim_topology[] = {
+ [PL_PRIM_TRIANGLE_LIST] = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST,
+ [PL_PRIM_TRIANGLE_STRIP] = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP,
+ };
+ ID3D11DeviceContext_IASetPrimitiveTopology(p->imm,
+ prim_topology[pass->params.vertex_type]);
+
+ ID3D11DeviceContext_VSSetShader(p->imm, pass_p->vs, NULL, 0);
+
+ ID3D11Buffer **cbvs = pass_p->cbv_arr;
+ ID3D11ShaderResourceView **srvs = pass_p->srv_arr;
+ ID3D11SamplerState **samplers = pass_p->sampler_arr;
+ ID3D11UnorderedAccessView **uavs = pass_p->uav_arr;
+
+ // Set vertex shader resources. The device context is called conditionally
+ // because the debug layer complains if these are called with 0 resources.
+ fill_resources(gpu, pass, &pass_p->vertex, params, cbvs, srvs, samplers);
+ if (pass_p->vertex.cbvs.num)
+ ID3D11DeviceContext_VSSetConstantBuffers(p->imm, 0, pass_p->vertex.cbvs.num, cbvs);
+ if (pass_p->vertex.srvs.num)
+ ID3D11DeviceContext_VSSetShaderResources(p->imm, 0, pass_p->vertex.srvs.num, srvs);
+ if (pass_p->vertex.samplers.num)
+ ID3D11DeviceContext_VSSetSamplers(p->imm, 0, pass_p->vertex.samplers.num, samplers);
+
+ ID3D11DeviceContext_RSSetState(p->imm, p->rstate);
+ ID3D11DeviceContext_RSSetViewports(p->imm, 1, (&(D3D11_VIEWPORT) {
+ .TopLeftX = params->viewport.x0,
+ .TopLeftY = params->viewport.y0,
+ .Width = pl_rect_w(params->viewport),
+ .Height = pl_rect_h(params->viewport),
+ .MinDepth = 0,
+ .MaxDepth = 1,
+ }));
+ ID3D11DeviceContext_RSSetScissorRects(p->imm, 1, (&(D3D11_RECT) {
+ .left = params->scissors.x0,
+ .top = params->scissors.y0,
+ .right = params->scissors.x1,
+ .bottom = params->scissors.y1,
+ }));
+
+ ID3D11DeviceContext_PSSetShader(p->imm, pass_p->ps, NULL, 0);
+
+ // Set pixel shader resources
+ fill_resources(gpu, pass, &pass_p->main, params, cbvs, srvs, samplers);
+ if (pass_p->main.cbvs.num)
+ ID3D11DeviceContext_PSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs);
+ if (pass_p->main.srvs.num)
+ ID3D11DeviceContext_PSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs);
+ if (pass_p->main.samplers.num)
+ ID3D11DeviceContext_PSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers);
+
+ ID3D11DeviceContext_OMSetBlendState(p->imm, pass_p->bstate, NULL,
+ D3D11_DEFAULT_SAMPLE_MASK);
+ ID3D11DeviceContext_OMSetDepthStencilState(p->imm, p->dsstate, 0);
+
+ fill_uavs(pass, params, uavs);
+
+ struct pl_tex_d3d11 *target_p = PL_PRIV(params->target);
+ ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews(
+ p->imm, 1, &target_p->rtv, NULL, 1, pass_p->uavs.num, uavs, NULL);
+
+ if (params->index_data || params->index_buf) {
+ ID3D11DeviceContext_DrawIndexed(p->imm, params->vertex_count, 0, 0);
+ } else {
+ ID3D11DeviceContext_Draw(p->imm, params->vertex_count, 0);
+ }
+
+ // Unbind everything. It's easier to do this than to actually track state,
+ // and if we leave the RTV bound, it could trip up D3D's conflict checker.
+ // Also, apparently unbinding SRVs can prevent a 10level9 bug?
+ // https://docs.microsoft.com/en-us/windows/win32/direct3d11/overviews-direct3d-11-devices-downlevel-prevent-null-srvs
+ for (int i = 0; i < PL_MAX(pass_p->main.cbvs.num, pass_p->vertex.cbvs.num); i++)
+ cbvs[i] = NULL;
+ for (int i = 0; i < PL_MAX(pass_p->main.srvs.num, pass_p->vertex.srvs.num); i++)
+ srvs[i] = NULL;
+ for (int i = 0; i < PL_MAX(pass_p->main.samplers.num, pass_p->vertex.samplers.num); i++)
+ samplers[i] = NULL;
+ for (int i = 0; i < pass_p->uavs.num; i++)
+ uavs[i] = NULL;
+ if (pass_p->vertex.cbvs.num)
+ ID3D11DeviceContext_VSSetConstantBuffers(p->imm, 0, pass_p->vertex.cbvs.num, cbvs);
+ if (pass_p->vertex.srvs.num)
+ ID3D11DeviceContext_VSSetShaderResources(p->imm, 0, pass_p->vertex.srvs.num, srvs);
+ if (pass_p->vertex.samplers.num)
+ ID3D11DeviceContext_VSSetSamplers(p->imm, 0, pass_p->vertex.samplers.num, samplers);
+ if (pass_p->main.cbvs.num)
+ ID3D11DeviceContext_PSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs);
+ if (pass_p->main.srvs.num)
+ ID3D11DeviceContext_PSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs);
+ if (pass_p->main.samplers.num)
+ ID3D11DeviceContext_PSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers);
+ ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews(
+ p->imm, 0, NULL, NULL, 1, pass_p->uavs.num, uavs, NULL);
+}
+
+static void pass_run_compute(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ pl_pass pass = params->pass;
+ struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+
+ // Update gl_NumWorkGroups emulation buffer if necessary
+ if (pass_p->num_workgroups_used) {
+ bool needs_update = false;
+ for (int i = 0; i < 3; i++) {
+ if (pass_p->last_num_wgs.num_wgs[i] != params->compute_groups[i])
+ needs_update = true;
+ pass_p->last_num_wgs.num_wgs[i] = params->compute_groups[i];
+ }
+
+ if (needs_update) {
+ ID3D11DeviceContext_UpdateSubresource(p->imm,
+ (ID3D11Resource *) pass_p->num_workgroups_buf, 0, NULL,
+ &pass_p->last_num_wgs, 0, 0);
+ }
+ }
+
+ ID3D11DeviceContext_CSSetShader(p->imm, pass_p->cs, NULL, 0);
+
+ ID3D11Buffer **cbvs = pass_p->cbv_arr;
+ ID3D11ShaderResourceView **srvs = pass_p->srv_arr;
+ ID3D11UnorderedAccessView **uavs = pass_p->uav_arr;
+ ID3D11SamplerState **samplers = pass_p->sampler_arr;
+
+ fill_resources(gpu, pass, &pass_p->main, params, cbvs, srvs, samplers);
+ fill_uavs(pass, params, uavs);
+
+ if (pass_p->main.cbvs.num)
+ ID3D11DeviceContext_CSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs);
+ if (pass_p->main.srvs.num)
+ ID3D11DeviceContext_CSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs);
+ if (pass_p->main.samplers.num)
+ ID3D11DeviceContext_CSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers);
+ if (pass_p->uavs.num)
+ ID3D11DeviceContext_CSSetUnorderedAccessViews(p->imm, 0, pass_p->uavs.num, uavs, NULL);
+
+ ID3D11DeviceContext_Dispatch(p->imm, params->compute_groups[0],
+ params->compute_groups[1],
+ params->compute_groups[2]);
+
+ // Unbind everything
+ for (int i = 0; i < pass_p->main.cbvs.num; i++)
+ cbvs[i] = NULL;
+ for (int i = 0; i < pass_p->main.srvs.num; i++)
+ srvs[i] = NULL;
+ for (int i = 0; i < pass_p->main.samplers.num; i++)
+ samplers[i] = NULL;
+ for (int i = 0; i < pass_p->uavs.num; i++)
+ uavs[i] = NULL;
+ if (pass_p->main.cbvs.num)
+ ID3D11DeviceContext_CSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs);
+ if (pass_p->main.srvs.num)
+ ID3D11DeviceContext_CSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs);
+ if (pass_p->main.samplers.num)
+ ID3D11DeviceContext_CSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers);
+ if (pass_p->uavs.num)
+ ID3D11DeviceContext_CSSetUnorderedAccessViews(p->imm, 0, pass_p->uavs.num, uavs, NULL);
+}
+
+void pl_d3d11_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ pl_pass pass = params->pass;
+
+ pl_d3d11_timer_start(gpu, params->timer);
+
+ if (pass->params.type == PL_PASS_COMPUTE) {
+ pass_run_compute(gpu, params);
+ } else {
+ pass_run_raster(gpu, params);
+ }
+
+ pl_d3d11_timer_end(gpu, params->timer);
+ pl_d3d11_flush_message_queue(ctx, "After pass run");
+}
diff --git a/src/d3d11/gpu_tex.c b/src/d3d11/gpu_tex.c
new file mode 100644
index 0000000..d63fc17
--- /dev/null
+++ b/src/d3d11/gpu_tex.c
@@ -0,0 +1,745 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "formats.h"
+
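+// Subresource index of a texture: the wrapped array slice if there is one,
+// otherwise 0. All textures we create or wrap have a single mip level, so the
+// subresource index is just the array slice.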
+static inline UINT tex_subresource(pl_tex tex)
+{
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+ return tex_p->array_slice >= 0 ? tex_p->array_slice : 0;
+}
+
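+// Create the SRV, RTV and UAV for a texture, as required by its capabilities.
+// For wrapped texture arrays, the views are restricted to the selected array
+// slice.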
+static bool tex_init(pl_gpu gpu, pl_tex tex)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+
+ // View formats may be omitted when they match the texture format, but for
+ // simplicity's sake we always set it. It will match the texture format for
+ // textures created with tex_create, but it can be different for video
+ // textures wrapped with pl_d3d11_wrap.
+ DXGI_FORMAT fmt = fmt_to_dxgi(tex->params.format);
+
+ if (tex->params.sampleable || tex->params.storable) {
+ D3D11_SHADER_RESOURCE_VIEW_DESC srvdesc = {
+ .Format = fmt,
+ };
+ switch (pl_tex_params_dimension(tex->params)) {
+ case 1:
+ if (tex_p->array_slice >= 0) {
+ srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1DARRAY;
+ srvdesc.Texture1DArray.MipLevels = 1;
+ srvdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice;
+ srvdesc.Texture1DArray.ArraySize = 1;
+ } else {
+ srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D;
+ srvdesc.Texture1D.MipLevels = 1;
+ }
+ break;
+ case 2:
+ if (tex_p->array_slice >= 0) {
+ srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2DARRAY;
+ srvdesc.Texture2DArray.MipLevels = 1;
+ srvdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice;
+ srvdesc.Texture2DArray.ArraySize = 1;
+ } else {
+ srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
+ srvdesc.Texture2D.MipLevels = 1;
+ }
+ break;
+ case 3:
+ // D3D11 does not have Texture3D arrays
+ srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D;
+ srvdesc.Texture3D.MipLevels = 1;
+ break;
+ }
+ D3D(ID3D11Device_CreateShaderResourceView(p->dev, tex_p->res, &srvdesc,
+ &tex_p->srv));
+ }
+
+ if (tex->params.renderable) {
+ D3D11_RENDER_TARGET_VIEW_DESC rtvdesc = {
+ .Format = fmt,
+ };
+ switch (pl_tex_params_dimension(tex->params)) {
+ case 1:
+ if (tex_p->array_slice >= 0) {
+ rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE1DARRAY;
+ rtvdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice;
+ rtvdesc.Texture1DArray.ArraySize = 1;
+ } else {
+ rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE1D;
+ }
+ break;
+ case 2:
+ if (tex_p->array_slice >= 0) {
+ rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2DARRAY;
+ rtvdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice;
+ rtvdesc.Texture2DArray.ArraySize = 1;
+ } else {
+ rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D;
+ }
+ break;
+ case 3:
+ // D3D11 does not have Texture3D arrays
+ rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE3D;
+ rtvdesc.Texture3D.WSize = -1;
+ break;
+ }
+ D3D(ID3D11Device_CreateRenderTargetView(p->dev, tex_p->res, &rtvdesc,
+ &tex_p->rtv));
+ }
+
+ if (p->fl >= D3D_FEATURE_LEVEL_11_0 && tex->params.storable) {
+ D3D11_UNORDERED_ACCESS_VIEW_DESC uavdesc = {
+ .Format = fmt,
+ };
+ switch (pl_tex_params_dimension(tex->params)) {
+ case 1:
+ if (tex_p->array_slice >= 0) {
+ uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE1DARRAY;
+ uavdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice;
+ uavdesc.Texture1DArray.ArraySize = 1;
+ } else {
+ uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE1D;
+ }
+ break;
+ case 2:
+ if (tex_p->array_slice >= 0) {
+ uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2DARRAY;
+ uavdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice;
+ uavdesc.Texture2DArray.ArraySize = 1;
+ } else {
+ uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D;
+ }
+ break;
+ case 3:
+ // D3D11 does not have Texture3D arrays
+ uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE3D;
+ uavdesc.Texture3D.WSize = -1;
+ break;
+ }
+ D3D(ID3D11Device_CreateUnorderedAccessView(p->dev, tex_p->res, &uavdesc,
+ &tex_p->uav));
+ }
+
+ return true;
+error:
+ return false;
+}
+
+void pl_d3d11_tex_destroy(pl_gpu gpu, pl_tex tex)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+
+ SAFE_RELEASE(tex_p->srv);
+ SAFE_RELEASE(tex_p->rtv);
+ SAFE_RELEASE(tex_p->uav);
+ SAFE_RELEASE(tex_p->res);
+ SAFE_RELEASE(tex_p->staging);
+
+ pl_d3d11_flush_message_queue(ctx, "After texture destroy");
+
+ pl_free((void *) tex);
+}
+
+pl_tex pl_d3d11_tex_create(pl_gpu gpu, const struct pl_tex_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_d3d11);
+ tex->params = *params;
+ tex->params.initial_data = NULL;
+ tex->sampler_type = PL_SAMPLER_NORMAL;
+
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+
+ DXGI_FORMAT dxfmt = fmt_to_dxgi(params->format);
+
+ D3D11_USAGE usage = D3D11_USAGE_DEFAULT;
+ D3D11_BIND_FLAG bind_flags = 0;
+
+ if (params->format->emulated) {
+ tex_p->texel_fmt = pl_find_fmt(gpu, params->format->type, 1, 0,
+ params->format->host_bits[0],
+ PL_FMT_CAP_TEXEL_UNIFORM);
+
+ if (!tex_p->texel_fmt) {
+ PL_ERR(gpu, "Failed picking texel format for emulated texture!");
+ goto error;
+ }
+
+ tex->params.storable = true;
+ }
+
+ if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+ // On >=FL11_0, blit emulation needs image storage
+ tex->params.storable |= params->blit_src || params->blit_dst;
+
+ // Blit emulation can use a sampler for linear filtering during stretch
+ if ((tex->params.format->caps & PL_FMT_CAP_LINEAR) && params->blit_src)
+ tex->params.sampleable = true;
+ } else {
+ // On <FL11_0, blit emulation uses a render pass
+ tex->params.sampleable |= params->blit_src;
+ tex->params.renderable |= params->blit_dst;
+ }
+
+ if (tex->params.sampleable)
+ bind_flags |= D3D11_BIND_SHADER_RESOURCE;
+ if (tex->params.renderable)
+ bind_flags |= D3D11_BIND_RENDER_TARGET;
+ if (p->fl >= D3D_FEATURE_LEVEL_11_0 && tex->params.storable)
+ bind_flags |= D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
+
+ // Apparently IMMUTABLE textures are efficient, so try to infer whether we
+ // can use one
+ if (params->initial_data && !params->format->emulated &&
+ !tex->params.renderable && !tex->params.storable && !params->host_writable)
+ {
+ usage = D3D11_USAGE_IMMUTABLE;
+ }
+
+ // In FL9_x, resources with only D3D11_BIND_SHADER_RESOURCE can't be copied
+ // from GPU-accessible memory to CPU-accessible memory. The only other bind
+ // flag we set on this FL is D3D11_BIND_RENDER_TARGET, so set it.
+ if (p->fl <= D3D_FEATURE_LEVEL_9_3 && tex->params.host_readable)
+ bind_flags |= D3D11_BIND_RENDER_TARGET;
+
+ // In FL9_x, when using DEFAULT or IMMUTABLE, BindFlags cannot be zero
+ if (p->fl <= D3D_FEATURE_LEVEL_9_3 && !bind_flags)
+ bind_flags |= D3D11_BIND_SHADER_RESOURCE;
+
+ D3D11_SUBRESOURCE_DATA data;
+ D3D11_SUBRESOURCE_DATA *pdata = NULL;
+ if (params->initial_data && !params->format->emulated) {
+ data = (D3D11_SUBRESOURCE_DATA) {
+ .pSysMem = params->initial_data,
+ .SysMemPitch = params->w * params->format->texel_size,
+ };
+ if (params->d)
+ data.SysMemSlicePitch = data.SysMemPitch * params->h;
+ pdata = &data;
+ }
+
+ switch (pl_tex_params_dimension(*params)) {
+ case 1:;
+ D3D11_TEXTURE1D_DESC desc1d = {
+ .Width = params->w,
+ .MipLevels = 1,
+ .ArraySize = 1,
+ .Format = dxfmt,
+ .Usage = usage,
+ .BindFlags = bind_flags,
+ };
+ D3D(ID3D11Device_CreateTexture1D(p->dev, &desc1d, pdata, &tex_p->tex1d));
+ tex_p->res = (ID3D11Resource *)tex_p->tex1d;
+
+ // Create a staging texture with CPU access for pl_tex_download()
+ if (params->host_readable) {
+ desc1d.BindFlags = 0;
+ desc1d.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ desc1d.Usage = D3D11_USAGE_STAGING;
+
+ D3D(ID3D11Device_CreateTexture1D(p->dev, &desc1d, NULL,
+ &tex_p->staging1d));
+ tex_p->staging = (ID3D11Resource *) tex_p->staging1d;
+ }
+ break;
+ case 2:;
+ D3D11_TEXTURE2D_DESC desc2d = {
+ .Width = params->w,
+ .Height = params->h,
+ .MipLevels = 1,
+ .ArraySize = 1,
+ .SampleDesc.Count = 1,
+ .Format = dxfmt,
+ .Usage = usage,
+ .BindFlags = bind_flags,
+ };
+ D3D(ID3D11Device_CreateTexture2D(p->dev, &desc2d, pdata, &tex_p->tex2d));
+ tex_p->res = (ID3D11Resource *)tex_p->tex2d;
+
+ // Create a staging texture with CPU access for pl_tex_download()
+ if (params->host_readable) {
+ desc2d.BindFlags = 0;
+ desc2d.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ desc2d.Usage = D3D11_USAGE_STAGING;
+
+ D3D(ID3D11Device_CreateTexture2D(p->dev, &desc2d, NULL,
+ &tex_p->staging2d));
+ tex_p->staging = (ID3D11Resource *) tex_p->staging2d;
+ }
+ break;
+ case 3:;
+ D3D11_TEXTURE3D_DESC desc3d = {
+ .Width = params->w,
+ .Height = params->h,
+ .Depth = params->d,
+ .MipLevels = 1,
+ .Format = dxfmt,
+ .Usage = usage,
+ .BindFlags = bind_flags,
+ };
+ D3D(ID3D11Device_CreateTexture3D(p->dev, &desc3d, pdata, &tex_p->tex3d));
+ tex_p->res = (ID3D11Resource *)tex_p->tex3d;
+
+ // Create a staging texture with CPU access for pl_tex_download()
+ if (params->host_readable) {
+ desc3d.BindFlags = 0;
+ desc3d.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ desc3d.Usage = D3D11_USAGE_STAGING;
+
+ D3D(ID3D11Device_CreateTexture3D(p->dev, &desc3d, NULL,
+ &tex_p->staging3d));
+ tex_p->staging = (ID3D11Resource *) tex_p->staging3d;
+ }
+ break;
+ default:
+ pl_unreachable();
+ }
+
+ tex_p->array_slice = -1;
+
+ if (!tex_init(gpu, tex))
+ goto error;
+
+ if (params->initial_data && params->format->emulated) {
+ struct pl_tex_transfer_params ul_params = {
+ .tex = tex,
+ .ptr = (void *) params->initial_data,
+ .rc = { 0, 0, 0, params->w, params->h, params->d },
+ };
+
+ // Since we re-use GPU helpers which require writable images, just fake it
+ bool writable = tex->params.host_writable;
+ tex->params.host_writable = true;
+ if (!pl_tex_upload(gpu, &ul_params))
+ goto error;
+ tex->params.host_writable = writable;
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After texture create");
+
+ return tex;
+
+error:
+ pl_d3d11_tex_destroy(gpu, tex);
+ return NULL;
+}
+
+pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_d3d11);
+ tex->sampler_type = PL_SAMPLER_NORMAL;
+
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+
+ DXGI_FORMAT fmt = DXGI_FORMAT_UNKNOWN;
+ D3D11_USAGE usage = D3D11_USAGE_DEFAULT;
+ D3D11_BIND_FLAG bind_flags = 0;
+ UINT mip_levels = 1;
+ UINT array_size = 1;
+ UINT sample_count = 1;
+
+ D3D11_RESOURCE_DIMENSION type;
+ ID3D11Resource_GetType(params->tex, &type);
+
+ switch (type) {
+ case D3D11_RESOURCE_DIMENSION_TEXTURE1D:
+ D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture1D,
+ (void **) &tex_p->tex1d));
+ tex_p->res = (ID3D11Resource *) tex_p->tex1d;
+
+ D3D11_TEXTURE1D_DESC desc1d;
+ ID3D11Texture1D_GetDesc(tex_p->tex1d, &desc1d);
+
+ tex->params.w = desc1d.Width;
+ mip_levels = desc1d.MipLevels;
+ array_size = desc1d.ArraySize;
+ fmt = desc1d.Format;
+ usage = desc1d.Usage;
+ bind_flags = desc1d.BindFlags;
+ break;
+
+ case D3D11_RESOURCE_DIMENSION_TEXTURE2D:
+ D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture2D,
+ (void **) &tex_p->tex2d));
+ tex_p->res = (ID3D11Resource *) tex_p->tex2d;
+
+ D3D11_TEXTURE2D_DESC desc2d;
+ ID3D11Texture2D_GetDesc(tex_p->tex2d, &desc2d);
+
+ tex->params.w = desc2d.Width;
+ tex->params.h = desc2d.Height;
+ mip_levels = desc2d.MipLevels;
+ array_size = desc2d.ArraySize;
+ fmt = desc2d.Format;
+ sample_count = desc2d.SampleDesc.Count;
+ usage = desc2d.Usage;
+ bind_flags = desc2d.BindFlags;
+
+ // Allow the format and size of 2D textures to be overridden to support
+ // shader views of video resources
+ if (params->fmt) {
+ fmt = params->fmt;
+ tex->params.w = params->w;
+ tex->params.h = params->h;
+ }
+
+ break;
+
+ case D3D11_RESOURCE_DIMENSION_TEXTURE3D:
+ D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture3D,
+ (void **) &tex_p->tex3d));
+ tex_p->res = (ID3D11Resource *) tex_p->tex3d;
+
+ D3D11_TEXTURE3D_DESC desc3d;
+ ID3D11Texture3D_GetDesc(tex_p->tex3d, &desc3d);
+
+ tex->params.w = desc3d.Width;
+ tex->params.h = desc3d.Height;
+ tex->params.d = desc3d.Depth;
+ mip_levels = desc3d.MipLevels;
+ fmt = desc3d.Format;
+ usage = desc3d.Usage;
+ bind_flags = desc3d.BindFlags;
+ break;
+
+ case D3D11_RESOURCE_DIMENSION_UNKNOWN:
+ case D3D11_RESOURCE_DIMENSION_BUFFER:
+ PL_ERR(gpu, "Resource is not suitable to wrap");
+ goto error;
+ }
+
+ if (mip_levels != 1) {
+ PL_ERR(gpu, "Mipmapped textures not supported for wrapping");
+ goto error;
+ }
+ if (sample_count != 1) {
+ PL_ERR(gpu, "Multisampled textures not supported for wrapping");
+ goto error;
+ }
+ if (usage != D3D11_USAGE_DEFAULT) {
+ PL_ERR(gpu, "Resource is not D3D11_USAGE_DEFAULT");
+ goto error;
+ }
+
+ if (array_size > 1) {
+ if (params->array_slice < 0 || params->array_slice >= array_size) {
+ PL_ERR(gpu, "array_slice out of range");
+ goto error;
+ }
+ tex_p->array_slice = params->array_slice;
+ } else {
+ tex_p->array_slice = -1;
+ }
+
+ if (bind_flags & D3D11_BIND_SHADER_RESOURCE) {
+ tex->params.sampleable = true;
+
+ // Blit emulation uses a render pass on <FL11_0
+ if (p->fl < D3D_FEATURE_LEVEL_11_0)
+ tex->params.blit_src = true;
+ }
+ if (bind_flags & D3D11_BIND_RENDER_TARGET) {
+ tex->params.renderable = true;
+
+ // Blit emulation uses a render pass on <FL11_0
+ if (p->fl < D3D_FEATURE_LEVEL_11_0)
+ tex->params.blit_dst = true;
+ }
+ static const D3D11_BIND_FLAG storable_flags =
+ D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
+ if ((bind_flags & storable_flags) == storable_flags) {
+ tex->params.storable = true;
+
+ // Blit emulation uses image storage on >=FL11_0. A feature level check
+ // isn't required because <FL11_0 doesn't have storable images.
+ tex->params.blit_src = tex->params.blit_dst = true;
+ }
+
+ for (int i = 0; i < gpu->num_formats; i++) {
+ DXGI_FORMAT target_fmt = fmt_to_dxgi(gpu->formats[i]);
+ if (fmt == target_fmt) {
+ tex->params.format = gpu->formats[i];
+ break;
+ }
+ }
+ if (!tex->params.format) {
+ PL_ERR(gpu, "Could not find a suitable pl_fmt for wrapped resource");
+ goto error;
+ }
+
+ if (!tex_init(gpu, tex))
+ goto error;
+
+ pl_d3d11_flush_message_queue(ctx, "After texture wrap");
+
+ return tex;
+
+error:
+ pl_d3d11_tex_destroy(gpu, tex);
+ return NULL;
+}
+
+void pl_d3d11_tex_invalidate(pl_gpu gpu, pl_tex tex)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+
+ // Resource discarding requires D3D11.1
+ if (!p->imm1)
+ return;
+
+ // Prefer discarding a view to discarding the whole resource. The reason
+ // for this is that a pl_tex can refer to a single member of a texture
+ // array. Discarding the SRV, RTV or UAV should only discard that member.
+ if (tex_p->rtv) {
+ ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->rtv);
+ } else if (tex_p->uav) {
+ ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->uav);
+ } else if (tex_p->srv) {
+ ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->srv);
+ } else if (tex_p->array_slice < 0) {
+ // If there are no views, only discard if the ID3D11Resource is not a
+ // texture array
+ ID3D11DeviceContext1_DiscardResource(p->imm1, tex_p->res);
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After texture invalidate");
+}
+
+void pl_d3d11_tex_clear_ex(pl_gpu gpu, pl_tex tex,
+ const union pl_clear_color color)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+
+ if (tex->params.format->type == PL_FMT_UINT) {
+ if (tex_p->uav) {
+ ID3D11DeviceContext_ClearUnorderedAccessViewUint(p->imm, tex_p->uav,
+ color.u);
+ } else {
+ float c[4] = { color.u[0], color.u[1], color.u[2], color.u[3] };
+ ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, c);
+ }
+
+ } else if (tex->params.format->type == PL_FMT_SINT) {
+ if (tex_p->uav) {
+ ID3D11DeviceContext_ClearUnorderedAccessViewUint(p->imm, tex_p->uav,
+ (const uint32_t *)color.i);
+ } else {
+ float c[4] = { color.i[0], color.i[1], color.i[2], color.i[3] };
+ ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, c);
+ }
+
+ } else if (tex_p->rtv) {
+ ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, color.f);
+ } else {
+ ID3D11DeviceContext_ClearUnorderedAccessViewFloat(p->imm, tex_p->uav, color.f);
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After texture clear");
+}
+
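+// Convert a pl_rect3d to a D3D11_BOX. Both use exclusive upper bounds
+// (x1/y1/z1 and right/bottom/back respectively), so the mapping is direct.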
+#define pl_rect3d_to_box(rc) \
+ ((D3D11_BOX) { \
+ .left = rc.x0, .top = rc.y0, .front = rc.z0, \
+ .right = rc.x1, .bottom = rc.y1, .back = rc.z1, \
+ })
+
+void pl_d3d11_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ struct pl_tex_d3d11 *src_p = PL_PRIV(params->src);
+ DXGI_FORMAT src_fmt = fmt_to_dxgi(params->src->params.format);
+ struct pl_tex_d3d11 *dst_p = PL_PRIV(params->dst);
+ DXGI_FORMAT dst_fmt = fmt_to_dxgi(params->dst->params.format);
+
+ // If the blit operation doesn't require flipping, scaling or format
+ // conversion, we can use CopySubresourceRegion
+ pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc;
+ if (pl_rect3d_eq(src_rc, dst_rc) && src_fmt == dst_fmt) {
+ pl_rect3d rc = params->src_rc;
+ pl_rect3d_normalize(&rc);
+
+ ID3D11DeviceContext_CopySubresourceRegion(p->imm, dst_p->res,
+ tex_subresource(params->dst), rc.x0, rc.y0, rc.z0, src_p->res,
+ tex_subresource(params->src), &pl_rect3d_to_box(rc));
+ } else if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
+ if (!pl_tex_blit_compute(gpu, params))
+ PL_ERR(gpu, "Failed compute shader fallback blit");
+ } else {
+ pl_tex_blit_raster(gpu, params);
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After texture blit");
+}
+
+bool pl_d3d11_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+ struct pl_tex_transfer_params *slices = NULL;
+ bool ret = false;
+
+ pl_d3d11_timer_start(gpu, params->timer);
+
+ if (fmt->emulated) {
+
+ int num_slices = pl_tex_transfer_slices(gpu, tex_p->texel_fmt, params, &slices);
+ for (int i = 0; i < num_slices; i++) {
+ // Copy the source data buffer into an intermediate buffer
+ pl_buf tbuf = pl_buf_create(gpu, pl_buf_params(
+ .memory_type = PL_BUF_MEM_DEVICE,
+ .format = tex_p->texel_fmt,
+ .size = pl_tex_transfer_size(&slices[i]),
+ .initial_data = slices[i].ptr,
+ .storable = true,
+ ));
+
+ if (!tbuf) {
+ PL_ERR(gpu, "Failed creating buffer for tex upload fallback!");
+ goto error;
+ }
+
+ slices[i].ptr = NULL;
+ slices[i].buf = tbuf;
+ slices[i].buf_offset = 0;
+ bool ok = pl_tex_upload_texel(gpu, &slices[i]);
+ pl_buf_destroy(gpu, &tbuf);
+ if (!ok)
+ goto error;
+ }
+
+ } else {
+
+ ID3D11DeviceContext_UpdateSubresource(p->imm, tex_p->res,
+ tex_subresource(tex), &pl_rect3d_to_box(params->rc), params->ptr,
+ params->row_pitch, params->depth_pitch);
+
+ }
+
+ ret = true;
+
+error:
+ pl_d3d11_timer_end(gpu, params->timer);
+ pl_d3d11_flush_message_queue(ctx, "After texture upload");
+
+ pl_free(slices);
+ return ret;
+}
+
+bool pl_d3d11_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+ struct d3d11_ctx *ctx = p->ctx;
+ const struct pl_tex_t *tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ struct pl_tex_d3d11 *tex_p = PL_PRIV(tex);
+ struct pl_tex_transfer_params *slices = NULL;
+ bool ret = false;
+
+ if (!tex_p->staging)
+ return false;
+
+ pl_d3d11_timer_start(gpu, params->timer);
+
+ if (fmt->emulated) {
+
+ pl_buf tbuf = NULL;
+ int num_slices = pl_tex_transfer_slices(gpu, tex_p->texel_fmt, params, &slices);
+ for (int i = 0; i < num_slices; i++) {
+ const size_t slice_size = pl_tex_transfer_size(&slices[i]);
+ bool ok = pl_buf_recreate(gpu, &tbuf, pl_buf_params(
+ .storable = true,
+ .size = slice_size,
+ .memory_type = PL_BUF_MEM_DEVICE,
+ .format = tex_p->texel_fmt,
+ .host_readable = true,
+ ));
+
+ if (!ok) {
+ PL_ERR(gpu, "Failed creating buffer for tex download fallback!");
+ goto error;
+ }
+
+ void *ptr = slices[i].ptr;
+ slices[i].ptr = NULL;
+ slices[i].buf = tbuf;
+ slices[i].buf_offset = 0;
+
+ // Download into an intermediate buffer first
+ ok = pl_tex_download_texel(gpu, &slices[i]);
+ ok = ok && pl_buf_read(gpu, tbuf, 0, ptr, slice_size);
+ if (!ok) {
+ pl_buf_destroy(gpu, &tbuf);
+ goto error;
+ }
+ }
+ pl_buf_destroy(gpu, &tbuf);
+
+ } else {
+
+ ID3D11DeviceContext_CopySubresourceRegion(p->imm,
+ (ID3D11Resource *) tex_p->staging, 0, params->rc.x0, params->rc.y0,
+ params->rc.z0, tex_p->res, tex_subresource(tex),
+ &pl_rect3d_to_box(params->rc));
+
+ D3D11_MAPPED_SUBRESOURCE lock;
+ D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) tex_p->staging, 0,
+ D3D11_MAP_READ, 0, &lock));
+
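+ // Copy row by row, since the staging texture's RowPitch/DepthPitch
+ // generally differ from the caller's row_pitch/depth_pitch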
+ char *cdst = params->ptr;
+ char *csrc = lock.pData;
+ size_t line_size = pl_rect_w(params->rc) * tex->params.format->texel_size;
+ for (int z = 0; z < pl_rect_d(params->rc); z++) {
+ for (int y = 0; y < pl_rect_h(params->rc); y++) {
+ memcpy(cdst + z * params->depth_pitch + y * params->row_pitch,
+ csrc + (params->rc.z0 + z) * lock.DepthPitch +
+ (params->rc.y0 + y) * lock.RowPitch + params->rc.x0,
+ line_size);
+ }
+ }
+
+ ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource*)tex_p->staging, 0);
+ }
+
+ ret = true;
+
+error:
+ pl_d3d11_timer_end(gpu, params->timer);
+ pl_d3d11_flush_message_queue(ctx, "After texture download");
+
+ pl_free(slices);
+ return ret;
+}
diff --git a/src/d3d11/meson.build b/src/d3d11/meson.build
new file mode 100644
index 0000000..d4c4b44
--- /dev/null
+++ b/src/d3d11/meson.build
@@ -0,0 +1,41 @@
+d3d11 = get_option('d3d11')
+d3d11_header = cc.check_header('d3d11.h', required: false) # needed publicly
+d3d11_headers_extra = [ # needed internally
+ cc.check_header('d3d11_4.h', required: d3d11),
+ cc.check_header('dxgi1_6.h', required: d3d11),
+]
+d3d11_deps = [
+ dependency('spirv-cross-c-shared', version: '>=0.29.0', required: d3d11),
+ cc.find_library('version', required: d3d11),
+]
+
+d3d11 = d3d11.require(d3d11_header)
+foreach h : d3d11_headers_extra
+ d3d11 = d3d11.require(h)
+endforeach
+foreach d : d3d11_deps
+ d3d11 = d3d11.require(d.found())
+endforeach
+
+components.set('d3d11', d3d11.allowed())
+if d3d11.allowed()
+ conf_internal.set('PL_HAVE_DXGI_DEBUG',
+ cc.has_header_symbol('dxgidebug.h', 'IID_IDXGIInfoQueue'))
+ conf_internal.set('PL_HAVE_DXGI_DEBUG_D3D11',
+ cc.has_header_symbol('d3d11sdklayers.h', 'DXGI_DEBUG_D3D11'))
+ add_project_arguments(['-DCOBJMACROS'], language: ['c', 'cpp'])
+ build_deps += declare_dependency(dependencies: d3d11_deps)
+ tests += 'd3d11.c'
+ sources += [
+ 'd3d11/context.c',
+ 'd3d11/formats.c',
+ 'd3d11/gpu.c',
+ 'd3d11/gpu_buf.c',
+ 'd3d11/gpu_tex.c',
+ 'd3d11/gpu_pass.c',
+ 'd3d11/swapchain.c',
+ 'd3d11/utils.c',
+ ]
+elif d3d11_header
+ sources += 'd3d11/stubs.c'
+endif
diff --git a/src/d3d11/stubs.c b/src/d3d11/stubs.c
new file mode 100644
index 0000000..b3f259c
--- /dev/null
+++ b/src/d3d11/stubs.c
@@ -0,0 +1,56 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../common.h"
+#include "log.h"
+
+#include <libplacebo/d3d11.h>
+
+const struct pl_d3d11_params pl_d3d11_default_params = { PL_D3D11_DEFAULTS };
+
+pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params)
+{
+ pl_fatal(log, "libplacebo compiled without D3D11 support!");
+ return NULL;
+}
+
+void pl_d3d11_destroy(pl_d3d11 *pd3d11)
+{
+ pl_d3d11 d3d11 = *pd3d11;
+ pl_assert(!d3d11);
+}
+
+pl_d3d11 pl_d3d11_get(pl_gpu gpu)
+{
+ return NULL;
+}
+
+pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11,
+ const struct pl_d3d11_swapchain_params *params)
+{
+ pl_unreachable();
+}
+
+IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw)
+{
+ pl_unreachable();
+}
+
+pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params)
+{
+ pl_unreachable();
+}
diff --git a/src/d3d11/swapchain.c b/src/d3d11/swapchain.c
new file mode 100644
index 0000000..8a53632
--- /dev/null
+++ b/src/d3d11/swapchain.c
@@ -0,0 +1,667 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <windows.h>
+#include <versionhelpers.h>
+#include <math.h>
+
+#include "gpu.h"
+#include "swapchain.h"
+#include "utils.h"
+
+struct d3d11_csp_mapping {
+ DXGI_COLOR_SPACE_TYPE d3d11_csp;
+ DXGI_FORMAT d3d11_fmt;
+ struct pl_color_space out_csp;
+};
+
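+// Map a color space hint to a DXGI color space and swap chain format: HDR
+// hints map to HDR10 (PQ, BT.2020, 10-bit), wide-gamut or linear hints map to
+// scRGB (linear FP16), and everything else falls back to the SDR monitor
+// color space with an 8- or 10-bit backbuffer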
+static struct d3d11_csp_mapping map_pl_csp_to_d3d11(const struct pl_color_space *hint,
+ bool use_8bit_sdr)
+{
+ if (pl_color_space_is_hdr(hint) &&
+ hint->transfer != PL_COLOR_TRC_LINEAR)
+ {
+ struct pl_color_space pl_csp = pl_color_space_hdr10;
+ pl_csp.hdr = (struct pl_hdr_metadata) {
+ // Whitelist only values that we support signalling metadata for
+ .prim = hint->hdr.prim,
+ .min_luma = hint->hdr.min_luma,
+ .max_luma = hint->hdr.max_luma,
+ .max_cll = hint->hdr.max_cll,
+ .max_fall = hint->hdr.max_fall,
+ };
+
+ return (struct d3d11_csp_mapping){
+ .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020,
+ .d3d11_fmt = DXGI_FORMAT_R10G10B10A2_UNORM,
+ .out_csp = pl_csp,
+ };
+ } else if (pl_color_primaries_is_wide_gamut(hint->primaries) ||
+ hint->transfer == PL_COLOR_TRC_LINEAR)
+ {
+ // scRGB, as in VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT, so it can
+ // also be used for HDR/wide gamut content with values that go
+ // beyond the 0.0-1.0 range.
+ return (struct d3d11_csp_mapping){
+ .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709,
+ .d3d11_fmt = DXGI_FORMAT_R16G16B16A16_FLOAT,
+ .out_csp = {
+ .primaries = PL_COLOR_PRIM_BT_709,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ }
+ };
+ }
+
+ return (struct d3d11_csp_mapping){
+ .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709,
+ .d3d11_fmt = use_8bit_sdr ? DXGI_FORMAT_R8G8B8A8_UNORM :
+ DXGI_FORMAT_R10G10B10A2_UNORM,
+ .out_csp = pl_color_space_monitor,
+ };
+}
+
+struct priv {
+ struct pl_sw_fns impl;
+
+ struct d3d11_ctx *ctx;
+ IDXGISwapChain *swapchain;
+ pl_tex backbuffer;
+
+ // Currently requested or applied swap chain configuration.
+ // Affected by received colorspace hints.
+ struct d3d11_csp_mapping csp_map;
+
+ // Whether a swapchain backbuffer format reconfiguration has been
+ // requested by means of an additional resize action.
+ bool update_swapchain_format;
+
+ // Whether 10-bit backbuffer format is disabled for SDR content.
+ bool disable_10bit_sdr;
+
+    // Fallback to 8-bit RGB was triggered due to lack of compatibility
+ bool fallback_8bit_rgb;
+};
+
+static void d3d11_sw_destroy(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+
+ pl_tex_destroy(sw->gpu, &p->backbuffer);
+ SAFE_RELEASE(p->swapchain);
+ pl_free((void *) sw);
+}
+
+static int d3d11_sw_latency(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ UINT max_latency;
+ IDXGIDevice1_GetMaximumFrameLatency(ctx->dxgi_dev, &max_latency);
+ return max_latency;
+}
+
+static pl_tex get_backbuffer(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct d3d11_ctx *ctx = p->ctx;
+ ID3D11Texture2D *backbuffer = NULL;
+ pl_tex tex = NULL;
+
+ D3D(IDXGISwapChain_GetBuffer(p->swapchain, 0, &IID_ID3D11Texture2D,
+ (void **) &backbuffer));
+
+ tex = pl_d3d11_wrap(sw->gpu, pl_d3d11_wrap_params(
+ .tex = (ID3D11Resource *) backbuffer,
+ ));
+
+error:
+ SAFE_RELEASE(backbuffer);
+ return tex;
+}
+
+static bool d3d11_sw_resize(pl_swapchain sw, int *width, int *height)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ DXGI_SWAP_CHAIN_DESC desc = {0};
+ IDXGISwapChain_GetDesc(p->swapchain, &desc);
+ int w = PL_DEF(*width, desc.BufferDesc.Width);
+ int h = PL_DEF(*height, desc.BufferDesc.Height);
+ bool format_changed = p->csp_map.d3d11_fmt != desc.BufferDesc.Format;
+ if (format_changed) {
+ PL_INFO(ctx, "Attempting to reconfigure swap chain format: %s -> %s",
+ pl_get_dxgi_format_name(desc.BufferDesc.Format),
+ pl_get_dxgi_format_name(p->csp_map.d3d11_fmt));
+ }
+
+ if (w != desc.BufferDesc.Width || h != desc.BufferDesc.Height ||
+ format_changed)
+ {
+ if (p->backbuffer) {
+ PL_ERR(sw, "Tried resizing the swapchain while a frame was in "
+ "progress! Please submit the current frame first.");
+ return false;
+ }
+
+ HRESULT hr = IDXGISwapChain_ResizeBuffers(p->swapchain, 0, w, h,
+ p->csp_map.d3d11_fmt, desc.Flags);
+
+ if (hr == E_INVALIDARG && p->csp_map.d3d11_fmt != DXGI_FORMAT_R8G8B8A8_UNORM)
+ {
+ PL_WARN(sw, "Reconfiguring the swapchain failed, re-trying with R8G8B8A8_UNORM fallback.");
+ D3D(IDXGISwapChain_ResizeBuffers(p->swapchain, 0, w, h,
+ DXGI_FORMAT_R8G8B8A8_UNORM, desc.Flags));
+
+ // re-configure the colorspace to 8-bit RGB SDR fallback
+ p->csp_map = map_pl_csp_to_d3d11(&pl_color_space_unknown, true);
+ p->fallback_8bit_rgb = true;
+ }
+ else if (FAILED(hr))
+ {
+ PL_ERR(sw, "Reconfiguring the swapchain failed with error: %s", pl_hresult_to_str(hr));
+ return false;
+ }
+ }
+
+ *width = w;
+ *height = h;
+ p->update_swapchain_format = false;
+ return true;
+
+error:
+ return false;
+}
+
+static bool d3d11_sw_start_frame(pl_swapchain sw,
+ struct pl_swapchain_frame *out_frame)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ if (ctx->is_failed)
+ return false;
+ if (p->backbuffer) {
+ PL_ERR(sw, "Attempted calling `pl_swapchain_start_frame` while a frame "
+ "was already in progress! Call `pl_swapchain_submit_frame` first.");
+ return false;
+ }
+
+ if (p->update_swapchain_format) {
+ int w = 0, h = 0;
+ if (!d3d11_sw_resize(sw, &w, &h))
+ return false;
+ }
+
+ p->backbuffer = get_backbuffer(sw);
+ if (!p->backbuffer)
+ return false;
+
+ int bits = 0;
+ pl_fmt fmt = p->backbuffer->params.format;
+ for (int i = 0; i < fmt->num_components; i++)
+ bits = PL_MAX(bits, fmt->component_depth[i]);
+
+ *out_frame = (struct pl_swapchain_frame) {
+ .fbo = p->backbuffer,
+ .flipped = false,
+ .color_repr = {
+ .sys = PL_COLOR_SYSTEM_RGB,
+ .levels = PL_COLOR_LEVELS_FULL,
+ .alpha = PL_ALPHA_UNKNOWN,
+ .bits = {
+ .sample_depth = bits,
+ .color_depth = bits,
+ },
+ },
+ .color_space = p->csp_map.out_csp,
+ };
+
+ return true;
+}
+
+static bool d3d11_sw_submit_frame(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ // Release the backbuffer. We shouldn't hold onto it unnecessarily, because
+ // it prevents external code from resizing the swapchain, which we'd
+ // otherwise support just fine.
+ pl_tex_destroy(sw->gpu, &p->backbuffer);
+
+ return !ctx->is_failed;
+}
+
+static void d3d11_sw_swap_buffers(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct d3d11_ctx *ctx = p->ctx;
+
+ // Present can fail with a device removed error
+ D3D(IDXGISwapChain_Present(p->swapchain, 1, 0));
+
+error:
+ return;
+}
+
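+// Translate pl_hdr_metadata into DXGI_HDR_METADATA_HDR10. Per the DXGI HDR10
+// metadata convention, chromaticity coordinates are scaled to units of
+// 0.00002 (hence the factor of 50000) and the minimum mastering luminance to
+// units of 0.0001 nits; the remaining luminance values are passed through as
+// whole nits.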
+static DXGI_HDR_METADATA_HDR10 set_hdr10_metadata(const struct pl_hdr_metadata *hdr)
+{
+ return (DXGI_HDR_METADATA_HDR10) {
+ .RedPrimary = { roundf(hdr->prim.red.x * 50000),
+ roundf(hdr->prim.red.y * 50000) },
+ .GreenPrimary = { roundf(hdr->prim.green.x * 50000),
+ roundf(hdr->prim.green.y * 50000) },
+ .BluePrimary = { roundf(hdr->prim.blue.x * 50000),
+ roundf(hdr->prim.blue.y * 50000) },
+ .WhitePoint = { roundf(hdr->prim.white.x * 50000),
+ roundf(hdr->prim.white.y * 50000) },
+ .MaxMasteringLuminance = roundf(hdr->max_luma),
+ .MinMasteringLuminance = roundf(hdr->min_luma * 10000),
+ .MaxContentLightLevel = roundf(hdr->max_cll),
+ .MaxFrameAverageLightLevel = roundf(hdr->max_fall),
+ };
+}
+
+static bool set_swapchain_metadata(struct d3d11_ctx *ctx,
+ IDXGISwapChain3 *swapchain3,
+ struct d3d11_csp_mapping *csp_map)
+{
+ IDXGISwapChain4 *swapchain4 = NULL;
+ bool ret = false;
+ bool is_hdr = pl_color_space_is_hdr(&csp_map->out_csp);
+ DXGI_HDR_METADATA_HDR10 hdr10 = is_hdr ?
+ set_hdr10_metadata(&csp_map->out_csp.hdr) : (DXGI_HDR_METADATA_HDR10){ 0 };
+
+ D3D(IDXGISwapChain3_SetColorSpace1(swapchain3, csp_map->d3d11_csp));
+
+    // If we succeeded in setting the color space, that's good enough, since
+    // older versions of Windows 10 will not have swapchain v4 available.
+ ret = true;
+
+ if (FAILED(IDXGISwapChain3_QueryInterface(swapchain3, &IID_IDXGISwapChain4,
+ (void **)&swapchain4)))
+ {
+ PL_TRACE(ctx, "v4 swap chain interface is not available, skipping HDR10 "
+ "metadata configuration.");
+ goto error;
+ }
+
+ D3D(IDXGISwapChain4_SetHDRMetaData(swapchain4,
+ is_hdr ?
+ DXGI_HDR_METADATA_TYPE_HDR10 :
+ DXGI_HDR_METADATA_TYPE_NONE,
+ is_hdr ? sizeof(hdr10) : 0,
+ is_hdr ? &hdr10 : NULL));
+
+ goto success;
+
+error:
+ csp_map->out_csp.hdr = (struct pl_hdr_metadata) { 0 };
+success:
+ SAFE_RELEASE(swapchain4);
+ return ret;
+}
+
+static bool d3d11_format_supported(struct d3d11_ctx *ctx, DXGI_FORMAT fmt)
+{
+ UINT sup = 0;
+ UINT wanted_sup =
+ D3D11_FORMAT_SUPPORT_TEXTURE2D | D3D11_FORMAT_SUPPORT_DISPLAY |
+ D3D11_FORMAT_SUPPORT_SHADER_SAMPLE | D3D11_FORMAT_SUPPORT_RENDER_TARGET |
+ D3D11_FORMAT_SUPPORT_BLENDABLE;
+
+ D3D(ID3D11Device_CheckFormatSupport(ctx->dev, fmt, &sup));
+
+ return (sup & wanted_sup) == wanted_sup;
+
+error:
+ return false;
+}
+
+static bool d3d11_csp_supported(struct d3d11_ctx *ctx,
+ IDXGISwapChain3 *swapchain3,
+ DXGI_COLOR_SPACE_TYPE color_space)
+{
+ UINT csp_support_flags = 0;
+
+ D3D(IDXGISwapChain3_CheckColorSpaceSupport(swapchain3,
+ color_space,
+ &csp_support_flags));
+
+ return (csp_support_flags & DXGI_SWAP_CHAIN_COLOR_SPACE_SUPPORT_FLAG_PRESENT);
+
+error:
+ return false;
+}
+
+static void update_swapchain_color_config(pl_swapchain sw,
+ const struct pl_color_space *csp,
+ bool is_internal)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct d3d11_ctx *ctx = p->ctx;
+ IDXGISwapChain3 *swapchain3 = NULL;
+ struct d3d11_csp_mapping old_map = p->csp_map;
+
+ // ignore config changes in fallback mode
+ if (p->fallback_8bit_rgb)
+ goto cleanup;
+
+ HRESULT hr = IDXGISwapChain_QueryInterface(p->swapchain, &IID_IDXGISwapChain3,
+ (void **)&swapchain3);
+ if (FAILED(hr)) {
+ PL_TRACE(ctx, "v3 swap chain interface is not available, skipping "
+ "color space configuration.");
+ swapchain3 = NULL;
+ }
+
+    // Without swap chain v3 we cannot control the swap chain color space;
+    // the only effective formats are the 8 and 10 bit RGB ones.
+ struct d3d11_csp_mapping csp_map =
+ map_pl_csp_to_d3d11(swapchain3 ? csp : &pl_color_space_unknown,
+ p->disable_10bit_sdr);
+
+ if (p->csp_map.d3d11_fmt == csp_map.d3d11_fmt &&
+ p->csp_map.d3d11_csp == csp_map.d3d11_csp &&
+ pl_color_space_equal(&p->csp_map.out_csp, &csp_map.out_csp))
+ goto cleanup;
+
+ PL_INFO(ctx, "%s swap chain configuration%s: format: %s, color space: %s.",
+ is_internal ? "Initial" : "New",
+ is_internal ? "" : " received from hint",
+ pl_get_dxgi_format_name(csp_map.d3d11_fmt),
+ pl_get_dxgi_csp_name(csp_map.d3d11_csp));
+
+ bool fmt_supported = d3d11_format_supported(ctx, csp_map.d3d11_fmt);
+ bool csp_supported = swapchain3 ?
+ d3d11_csp_supported(ctx, swapchain3, csp_map.d3d11_csp) : true;
+ if (!fmt_supported || !csp_supported) {
+ PL_ERR(ctx, "New swap chain configuration was deemed not supported: "
+ "format: %s, color space: %s. Failling back to 8bit RGB.",
+ fmt_supported ? "supported" : "unsupported",
+ csp_supported ? "supported" : "unsupported");
+        // fall back to 8-bit sRGB if the requested configuration is not supported
+ csp_map = map_pl_csp_to_d3d11(&pl_color_space_unknown, true);
+ }
+
+ p->csp_map = csp_map;
+ p->update_swapchain_format = true;
+
+ if (!swapchain3)
+ goto cleanup;
+
+ if (!set_swapchain_metadata(ctx, swapchain3, &p->csp_map)) {
+ // format succeeded, but color space configuration failed
+ p->csp_map = old_map;
+ p->csp_map.d3d11_fmt = csp_map.d3d11_fmt;
+ }
+
+ pl_d3d11_flush_message_queue(ctx, "After colorspace hint");
+
+cleanup:
+ SAFE_RELEASE(swapchain3);
+}
+
+static void d3d11_sw_colorspace_hint(pl_swapchain sw,
+ const struct pl_color_space *csp)
+{
+ update_swapchain_color_config(sw, csp, false);
+}
+
+IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ IDXGISwapChain_AddRef(p->swapchain);
+ return p->swapchain;
+}
+
+static const struct pl_sw_fns d3d11_swapchain = {
+ .destroy = d3d11_sw_destroy,
+ .latency = d3d11_sw_latency,
+ .resize = d3d11_sw_resize,
+ .colorspace_hint = d3d11_sw_colorspace_hint,
+ .start_frame = d3d11_sw_start_frame,
+ .submit_frame = d3d11_sw_submit_frame,
+ .swap_buffers = d3d11_sw_swap_buffers,
+};
+
+static HRESULT create_swapchain_1_2(struct d3d11_ctx *ctx,
+ IDXGIFactory2 *factory, const struct pl_d3d11_swapchain_params *params,
+ bool flip, UINT width, UINT height, DXGI_FORMAT format,
+ IDXGISwapChain **swapchain_out)
+{
+ IDXGISwapChain *swapchain = NULL;
+ IDXGISwapChain1 *swapchain1 = NULL;
+ HRESULT hr;
+
+ DXGI_SWAP_CHAIN_DESC1 desc = {
+ .Width = width,
+ .Height = height,
+ .Format = format,
+ .SampleDesc.Count = 1,
+ .BufferUsage = DXGI_USAGE_SHADER_INPUT | DXGI_USAGE_RENDER_TARGET_OUTPUT,
+ .Flags = params->flags,
+ };
+
+ if (ID3D11Device_GetFeatureLevel(ctx->dev) >= D3D_FEATURE_LEVEL_11_0)
+ desc.BufferUsage |= DXGI_USAGE_UNORDERED_ACCESS;
+
+ if (flip) {
+ UINT max_latency;
+ IDXGIDevice1_GetMaximumFrameLatency(ctx->dxgi_dev, &max_latency);
+
+ // Make sure we have at least enough buffers to allow `max_latency`
+ // frames in-flight at once, plus one frame for the frontbuffer
+ desc.BufferCount = max_latency + 1;
+
+ if (IsWindows10OrGreater()) {
+ desc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD;
+ } else {
+ desc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL;
+ }
+
+ desc.BufferCount = PL_MIN(desc.BufferCount, DXGI_MAX_SWAP_CHAIN_BUFFERS);
+ } else {
+ desc.SwapEffect = DXGI_SWAP_EFFECT_DISCARD;
+ desc.BufferCount = 1;
+ }
+
+ if (params->window) {
+ hr = IDXGIFactory2_CreateSwapChainForHwnd(factory, (IUnknown *) ctx->dev,
+ params->window, &desc, NULL, NULL, &swapchain1);
+ } else if (params->core_window) {
+ hr = IDXGIFactory2_CreateSwapChainForCoreWindow(factory,
+ (IUnknown *) ctx->dev, params->core_window, &desc, NULL, &swapchain1);
+ } else {
+ hr = IDXGIFactory2_CreateSwapChainForComposition(factory,
+ (IUnknown *) ctx->dev, &desc, NULL, &swapchain1);
+ }
+ if (FAILED(hr))
+ goto done;
+ hr = IDXGISwapChain1_QueryInterface(swapchain1, &IID_IDXGISwapChain,
+ (void **) &swapchain);
+ if (FAILED(hr))
+ goto done;
+
+ *swapchain_out = swapchain;
+ swapchain = NULL;
+
+done:
+ SAFE_RELEASE(swapchain1);
+ SAFE_RELEASE(swapchain);
+ return hr;
+}
+
+static HRESULT create_swapchain_1_1(struct d3d11_ctx *ctx,
+ IDXGIFactory1 *factory, const struct pl_d3d11_swapchain_params *params,
+ UINT width, UINT height, DXGI_FORMAT format, IDXGISwapChain **swapchain_out)
+{
+ DXGI_SWAP_CHAIN_DESC desc = {
+ .BufferDesc = {
+ .Width = width,
+ .Height = height,
+ .Format = format,
+ },
+ .SampleDesc.Count = 1,
+ .BufferUsage = DXGI_USAGE_SHADER_INPUT | DXGI_USAGE_RENDER_TARGET_OUTPUT,
+ .BufferCount = 1,
+ .OutputWindow = params->window,
+ .Windowed = TRUE,
+ .SwapEffect = DXGI_SWAP_EFFECT_DISCARD,
+ .Flags = params->flags,
+ };
+
+ return IDXGIFactory1_CreateSwapChain(factory, (IUnknown *) ctx->dev, &desc,
+ swapchain_out);
+}
+
+static IDXGISwapChain *create_swapchain(struct d3d11_ctx *ctx,
+ const struct pl_d3d11_swapchain_params *params, DXGI_FORMAT format)
+{
+ IDXGIDevice1 *dxgi_dev = NULL;
+ IDXGIAdapter1 *adapter = NULL;
+ IDXGIFactory1 *factory = NULL;
+ IDXGIFactory2 *factory2 = NULL;
+ IDXGISwapChain *swapchain = NULL;
+ bool success = false;
+ HRESULT hr;
+
+ D3D(ID3D11Device_QueryInterface(ctx->dev, &IID_IDXGIDevice1,
+ (void **) &dxgi_dev));
+ D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter));
+ D3D(IDXGIAdapter1_GetParent(adapter, &IID_IDXGIFactory1, (void **) &factory));
+
+ hr = IDXGIFactory1_QueryInterface(factory, &IID_IDXGIFactory2,
+ (void **) &factory2);
+ if (FAILED(hr))
+ factory2 = NULL;
+
+ bool flip = factory2 && !params->blit;
+ UINT width = PL_DEF(params->width, 1);
+ UINT height = PL_DEF(params->height, 1);
+
+ // If both width and height are unset, the default size is the window size
+ if (params->window && params->width == 0 && params->height == 0) {
+ RECT rc;
+ if (GetClientRect(params->window, &rc)) {
+ width = PL_DEF(rc.right - rc.left, 1);
+ height = PL_DEF(rc.bottom - rc.top, 1);
+ }
+ }
+
+ // Return here to retry creating the swapchain
+ do {
+ if (factory2) {
+ // Create a DXGI 1.2+ (Windows 8+) swap chain if possible
+ hr = create_swapchain_1_2(ctx, factory2, params, flip, width,
+ height, format, &swapchain);
+ } else {
+ // Fall back to DXGI 1.1 (Windows 7)
+ hr = create_swapchain_1_1(ctx, factory, params, width, height,
+ format, &swapchain);
+ }
+ if (SUCCEEDED(hr))
+ break;
+
+ pl_d3d11_after_error(ctx, hr);
+ if (flip) {
+ PL_DEBUG(ctx, "Failed to create flip-model swapchain, trying bitblt");
+ flip = false;
+ continue;
+ }
+
+ PL_FATAL(ctx, "Failed to create swapchain: %s", pl_hresult_to_str(hr));
+ goto error;
+ } while (true);
+
+ // Prevent DXGI from making changes to the window, otherwise it will hook
+ // the Alt+Enter keystroke and make it trigger an ugly transition to
+ // legacy exclusive fullscreen mode.
+ IDXGIFactory_MakeWindowAssociation(factory, params->window,
+ DXGI_MWA_NO_WINDOW_CHANGES | DXGI_MWA_NO_ALT_ENTER |
+ DXGI_MWA_NO_PRINT_SCREEN);
+
+ success = true;
+error:
+ if (!success)
+ SAFE_RELEASE(swapchain);
+ SAFE_RELEASE(factory2);
+ SAFE_RELEASE(factory);
+ SAFE_RELEASE(adapter);
+ SAFE_RELEASE(dxgi_dev);
+ return swapchain;
+}
+
+pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11,
+ const struct pl_d3d11_swapchain_params *params)
+{
+ struct d3d11_ctx *ctx = PL_PRIV(d3d11);
+ pl_gpu gpu = d3d11->gpu;
+ bool success = false;
+
+ struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv);
+ struct priv *p = PL_PRIV(sw);
+ *sw = (struct pl_swapchain_t) {
+ .log = gpu->log,
+ .gpu = gpu,
+ };
+ *p = (struct priv) {
+ .impl = d3d11_swapchain,
+ .ctx = ctx,
+ // default to standard 8 or 10 bit RGB, unset pl_color_space
+ .csp_map = {
+ .d3d11_fmt = params->disable_10bit_sdr ?
+ DXGI_FORMAT_R8G8B8A8_UNORM :
+ (d3d11_format_supported(ctx, DXGI_FORMAT_R10G10B10A2_UNORM) ?
+ DXGI_FORMAT_R10G10B10A2_UNORM : DXGI_FORMAT_R8G8B8A8_UNORM),
+ },
+ .disable_10bit_sdr = params->disable_10bit_sdr,
+ };
+
+ if (params->swapchain) {
+ p->swapchain = params->swapchain;
+ IDXGISwapChain_AddRef(params->swapchain);
+ } else {
+ p->swapchain = create_swapchain(ctx, params, p->csp_map.d3d11_fmt);
+ if (!p->swapchain)
+ goto error;
+ }
+
+ DXGI_SWAP_CHAIN_DESC scd = {0};
+ IDXGISwapChain_GetDesc(p->swapchain, &scd);
+ if (scd.SwapEffect == DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL ||
+ scd.SwapEffect == DXGI_SWAP_EFFECT_FLIP_DISCARD) {
+ PL_INFO(gpu, "Using flip-model presentation");
+ } else {
+ PL_INFO(gpu, "Using bitblt-model presentation");
+ }
+
+ p->csp_map.d3d11_fmt = scd.BufferDesc.Format;
+
+ update_swapchain_color_config(sw, &pl_color_space_unknown, true);
+
+ success = true;
+error:
+ if (!success) {
+ PL_FATAL(gpu, "Failed to create Direct3D 11 swapchain");
+ d3d11_sw_destroy(sw);
+ sw = NULL;
+ }
+ return sw;
+}
diff --git a/src/d3d11/utils.c b/src/d3d11/utils.c
new file mode 100644
index 0000000..47154b5
--- /dev/null
+++ b/src/d3d11/utils.c
@@ -0,0 +1,500 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string.h>
+
+#include "utils.h"
+
+// D3D11.3 message IDs, not present in mingw-w64 v9
+#define D3D11_MESSAGE_ID_CREATE_FENCE (0x30020c)
+#define D3D11_MESSAGE_ID_DESTROY_FENCE (0x30020a)
+
+#ifdef PL_HAVE_DXGI_DEBUG
+static enum pl_log_level log_level_override(unsigned int id)
+{
+ switch (id) {
+ // These warnings can happen when a pl_timer is used too often before a
+ // blocking pl_swapchain_swap_buffers() or pl_gpu_finish(), overflowing
+ // its internal ring buffer and causing older query objects to be reused
+ // before their results are read. This is expected behavior, so reduce
+ // the log level to PL_LOG_TRACE to prevent log spam.
+ case D3D11_MESSAGE_ID_QUERY_BEGIN_ABANDONING_PREVIOUS_RESULTS:
+ case D3D11_MESSAGE_ID_QUERY_END_ABANDONING_PREVIOUS_RESULTS:
+ return PL_LOG_TRACE;
+
+ // D3D11 writes log messages every time an object is created or
+ // destroyed. That results in a lot of log spam, so force PL_LOG_TRACE.
+#define OBJ_LIFETIME_MESSAGES(obj) \
+ case D3D11_MESSAGE_ID_CREATE_ ## obj: \
+ case D3D11_MESSAGE_ID_DESTROY_ ## obj
+
+ OBJ_LIFETIME_MESSAGES(CONTEXT):
+ OBJ_LIFETIME_MESSAGES(BUFFER):
+ OBJ_LIFETIME_MESSAGES(TEXTURE1D):
+ OBJ_LIFETIME_MESSAGES(TEXTURE2D):
+ OBJ_LIFETIME_MESSAGES(TEXTURE3D):
+ OBJ_LIFETIME_MESSAGES(SHADERRESOURCEVIEW):
+ OBJ_LIFETIME_MESSAGES(RENDERTARGETVIEW):
+ OBJ_LIFETIME_MESSAGES(DEPTHSTENCILVIEW):
+ OBJ_LIFETIME_MESSAGES(VERTEXSHADER):
+ OBJ_LIFETIME_MESSAGES(HULLSHADER):
+ OBJ_LIFETIME_MESSAGES(DOMAINSHADER):
+ OBJ_LIFETIME_MESSAGES(GEOMETRYSHADER):
+ OBJ_LIFETIME_MESSAGES(PIXELSHADER):
+ OBJ_LIFETIME_MESSAGES(INPUTLAYOUT):
+ OBJ_LIFETIME_MESSAGES(SAMPLER):
+ OBJ_LIFETIME_MESSAGES(BLENDSTATE):
+ OBJ_LIFETIME_MESSAGES(DEPTHSTENCILSTATE):
+ OBJ_LIFETIME_MESSAGES(RASTERIZERSTATE):
+ OBJ_LIFETIME_MESSAGES(QUERY):
+ OBJ_LIFETIME_MESSAGES(PREDICATE):
+ OBJ_LIFETIME_MESSAGES(COUNTER):
+ OBJ_LIFETIME_MESSAGES(COMMANDLIST):
+ OBJ_LIFETIME_MESSAGES(CLASSINSTANCE):
+ OBJ_LIFETIME_MESSAGES(CLASSLINKAGE):
+ OBJ_LIFETIME_MESSAGES(COMPUTESHADER):
+ OBJ_LIFETIME_MESSAGES(UNORDEREDACCESSVIEW):
+ OBJ_LIFETIME_MESSAGES(VIDEODECODER):
+ OBJ_LIFETIME_MESSAGES(VIDEOPROCESSORENUM):
+ OBJ_LIFETIME_MESSAGES(VIDEOPROCESSOR):
+ OBJ_LIFETIME_MESSAGES(DECODEROUTPUTVIEW):
+ OBJ_LIFETIME_MESSAGES(PROCESSORINPUTVIEW):
+ OBJ_LIFETIME_MESSAGES(PROCESSOROUTPUTVIEW):
+ OBJ_LIFETIME_MESSAGES(DEVICECONTEXTSTATE):
+ OBJ_LIFETIME_MESSAGES(FENCE):
+ return PL_LOG_TRACE;
+
+#undef OBJ_LIFETIME_MESSAGES
+
+ // Don't force the log level of any other messages. It will be mapped
+ // from the D3D severity code instead.
+ default:
+ return PL_LOG_NONE;
+ }
+}
+#endif
+
+void pl_d3d11_flush_message_queue(struct d3d11_ctx *ctx, const char *header)
+{
+#ifdef PL_HAVE_DXGI_DEBUG
+ if (!ctx->iqueue)
+ return;
+
+ static const enum pl_log_level severity_map[] = {
+ [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION] = PL_LOG_FATAL,
+ [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR] = PL_LOG_ERR,
+ [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_WARNING] = PL_LOG_WARN,
+ [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_INFO] = PL_LOG_DEBUG,
+ [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_MESSAGE] = PL_LOG_DEBUG,
+ };
+
+ enum pl_log_level header_printed = PL_LOG_NONE;
+
+    // After the storage limit is reached and ID3D11InfoQueue::ClearStoredMessages
+    // is called, the message counter appears to be reset to -1, which is a very
+    // large number when read as uint64_t. Any subsequent call to
+    // ID3D11InfoQueue::GetNumStoredMessages will then be off by one.
+    // Use ID3D11InfoQueue_GetNumStoredMessagesAllowedByRetrievalFilter without
+    // any filter set, which appears to be unaffected by this bug and returns the
+    // correct number of messages.
+    // IDXGIInfoQueue seems to be unaffected, but keep the same method of retrieval.
+ uint64_t messages = IDXGIInfoQueue_GetNumStoredMessagesAllowedByRetrievalFilters(ctx->iqueue, DXGI_DEBUG_ALL);
+
+ // Just to be on the safe side, check also for the mentioned -1 value...
+ if (!messages || messages == UINT64_C(-1))
+ return;
+
+ uint64_t discarded =
+ IDXGIInfoQueue_GetNumMessagesDiscardedByMessageCountLimit(ctx->iqueue, DXGI_DEBUG_ALL);
+ if (discarded > ctx->last_discarded) {
+ PL_WARN(ctx, "%s:", header);
+ header_printed = PL_LOG_WARN;
+
+ // Notify number of messages skipped due to the message count limit
+ PL_WARN(ctx, " (skipped %"PRIu64" debug layer messages)",
+ discarded - ctx->last_discarded);
+ ctx->last_discarded = discarded;
+ }
+
+ // Copy debug layer messages to libplacebo's log output
+ for (uint64_t i = 0; i < messages; i++) {
+ SIZE_T len;
+ if (FAILED(IDXGIInfoQueue_GetMessage(ctx->iqueue, DXGI_DEBUG_ALL, i, NULL, &len)))
+ goto error;
+
+ pl_grow((void *) ctx->d3d11, &ctx->dxgi_msg, len);
+ DXGI_INFO_QUEUE_MESSAGE *dxgi_msg = ctx->dxgi_msg;
+
+ if (FAILED(IDXGIInfoQueue_GetMessage(ctx->iqueue, DXGI_DEBUG_ALL, i, dxgi_msg, &len)))
+ goto error;
+
+ enum pl_log_level level = PL_LOG_NONE;
+ if (IsEqualGUID(&dxgi_msg->Producer, &DXGI_DEBUG_D3D11))
+ level = log_level_override(dxgi_msg->ID);
+ if (level == PL_LOG_NONE)
+ level = severity_map[dxgi_msg->Severity];
+
+ if (pl_msg_test(ctx->log, level)) {
+ // If the header hasn't been printed, or it was printed for a lower
+ // log level than the current message, print it (again)
+ if (header_printed == PL_LOG_NONE || header_printed > level) {
+ PL_MSG(ctx, level, "%s:", header);
+ pl_log_stack_trace(ctx->log, level);
+ header_printed = level;
+ }
+
+ PL_MSG(ctx, level, " %d: %.*s", (int) dxgi_msg->ID,
+ (int) dxgi_msg->DescriptionByteLength, dxgi_msg->pDescription);
+ }
+
+ if (dxgi_msg->Severity <= DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR)
+ pl_debug_abort();
+ }
+
+error:
+ IDXGIInfoQueue_ClearStoredMessages(ctx->iqueue, DXGI_DEBUG_ALL);
+#endif
+}
+
+HRESULT pl_d3d11_check_device_removed(struct d3d11_ctx *ctx, HRESULT hr)
+{
+ // This can be called before we have a device
+ if (!ctx->dev)
+ return hr;
+
+ switch (hr) {
+ case DXGI_ERROR_DEVICE_HUNG:
+ case DXGI_ERROR_DEVICE_RESET:
+ case DXGI_ERROR_DRIVER_INTERNAL_ERROR:
+ ctx->is_failed = true;
+ break;
+ case D3DDDIERR_DEVICEREMOVED:
+ case DXGI_ERROR_DEVICE_REMOVED:
+ hr = ID3D11Device_GetDeviceRemovedReason(ctx->dev);
+ ctx->is_failed = true;
+ break;
+ }
+ if (ctx->is_failed)
+ PL_ERR(ctx, "Device lost!");
+ return hr;
+}
+
+HRESULT pl_d3d11_after_error(struct d3d11_ctx *ctx, HRESULT hr)
+{
+ hr = pl_d3d11_check_device_removed(ctx, hr);
+ pl_d3d11_flush_message_queue(ctx, "After error");
+ return hr;
+}
+
+struct dll_version pl_get_dll_version(const wchar_t *name)
+{
+ void *data = NULL;
+ struct dll_version ret = {0};
+
+ DWORD size = GetFileVersionInfoSizeW(name, &(DWORD) {0});
+ if (!size)
+ goto error;
+ data = pl_alloc(NULL, size);
+
+ if (!GetFileVersionInfoW(name, 0, size, data))
+ goto error;
+
+ VS_FIXEDFILEINFO *ffi;
+ UINT ffi_len;
+ if (!VerQueryValueW(data, L"\\", (void**)&ffi, &ffi_len))
+ goto error;
+ if (ffi_len < sizeof(*ffi))
+ goto error;
+
+ ret = (struct dll_version) {
+ .major = HIWORD(ffi->dwFileVersionMS),
+ .minor = LOWORD(ffi->dwFileVersionMS),
+ .build = HIWORD(ffi->dwFileVersionLS),
+ .revision = LOWORD(ffi->dwFileVersionLS),
+ };
+
+error:
+ pl_free(data);
+ return ret;
+}
+
+wchar_t *pl_from_utf8(void *ctx, const char *str)
+{
+ int count = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0);
+ pl_assert(count > 0);
+ wchar_t *ret = pl_calloc_ptr(ctx, count, ret);
+ MultiByteToWideChar(CP_UTF8, 0, str, -1, ret, count);
+ return ret;
+}
+
+char *pl_to_utf8(void *ctx, const wchar_t *str)
+{
+ int count = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);
+ pl_assert(count > 0);
+ char *ret = pl_calloc_ptr(ctx, count, ret);
+ WideCharToMultiByte(CP_UTF8, 0, str, -1, ret, count, NULL, NULL);
+ return ret;
+}
+
+static const char *hresult_str(HRESULT hr)
+{
+ switch (hr) {
+#define CASE(name) case name: return #name
+ CASE(S_OK);
+ CASE(S_FALSE);
+ CASE(E_ABORT);
+ CASE(E_ACCESSDENIED);
+ CASE(E_FAIL);
+ CASE(E_HANDLE);
+ CASE(E_INVALIDARG);
+ CASE(E_NOINTERFACE);
+ CASE(E_NOTIMPL);
+ CASE(E_OUTOFMEMORY);
+ CASE(E_POINTER);
+ CASE(E_UNEXPECTED);
+
+ CASE(DXGI_ERROR_ACCESS_DENIED);
+ CASE(DXGI_ERROR_ACCESS_LOST);
+ CASE(DXGI_ERROR_CANNOT_PROTECT_CONTENT);
+ CASE(DXGI_ERROR_DEVICE_HUNG);
+ CASE(DXGI_ERROR_DEVICE_REMOVED);
+ CASE(DXGI_ERROR_DEVICE_RESET);
+ CASE(DXGI_ERROR_DRIVER_INTERNAL_ERROR);
+ CASE(DXGI_ERROR_FRAME_STATISTICS_DISJOINT);
+ CASE(DXGI_ERROR_GRAPHICS_VIDPN_SOURCE_IN_USE);
+ CASE(DXGI_ERROR_INVALID_CALL);
+ CASE(DXGI_ERROR_MORE_DATA);
+ CASE(DXGI_ERROR_NAME_ALREADY_EXISTS);
+ CASE(DXGI_ERROR_NONEXCLUSIVE);
+ CASE(DXGI_ERROR_NOT_CURRENTLY_AVAILABLE);
+ CASE(DXGI_ERROR_NOT_FOUND);
+ CASE(DXGI_ERROR_REMOTE_CLIENT_DISCONNECTED);
+ CASE(DXGI_ERROR_REMOTE_OUTOFMEMORY);
+ CASE(DXGI_ERROR_RESTRICT_TO_OUTPUT_STALE);
+ CASE(DXGI_ERROR_SDK_COMPONENT_MISSING);
+ CASE(DXGI_ERROR_SESSION_DISCONNECTED);
+ CASE(DXGI_ERROR_UNSUPPORTED);
+ CASE(DXGI_ERROR_WAIT_TIMEOUT);
+ CASE(DXGI_ERROR_WAS_STILL_DRAWING);
+#undef CASE
+
+ default:
+ return "Unknown error";
+ }
+}
+
+static char *format_error(void *ctx, DWORD error)
+{
+ wchar_t *wstr;
+ if (!FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+ FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS, NULL, error,
+ MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPWSTR)&wstr, 0, NULL))
+ {
+ return NULL;
+ }
+
+ // Trim any trailing newline from the message
+ for (int i = wcslen(wstr) - 1; i >= 0; i--) {
+ if (wstr[i] != '\r' && wstr[i] != '\n') {
+ wstr[i + 1] = '\0';
+ break;
+ }
+ }
+
+ char *str = pl_to_utf8(ctx, wstr);
+ LocalFree(wstr);
+ return str;
+}
+
+char *pl_hresult_to_str_buf(char *buf, size_t buf_size, HRESULT hr)
+{
+ char *fmsg = format_error(NULL, hr);
+ const char *code = hresult_str(hr);
+ if (fmsg) {
+ snprintf(buf, buf_size, "%s (%s, 0x%08lx)", fmsg, code, hr);
+ } else {
+ snprintf(buf, buf_size, "%s, 0x%08lx", code, hr);
+ }
+ pl_free(fmsg);
+ return buf;
+}
+
+#define D3D11_DXGI_ENUM(prefix, define) { case prefix ## define: return #define; }
+
+const char *pl_get_dxgi_format_name(DXGI_FORMAT fmt)
+{
+ switch (fmt) {
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, UNKNOWN);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_SNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G8X24_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, D32_FLOAT_S8X24_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_FLOAT_X8X24_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, X32_TYPELESS_G8X24_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R11G11B10_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UNORM_SRGB);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_SNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_SNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, D32_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R24G8_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, D24_UNORM_S8_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R24_UNORM_X8_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, X24_TYPELESS_G8_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_SNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_FLOAT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, D16_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_SNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_UINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_SNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_SINT);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, A8_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R1_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R9G9B9E5_SHAREDEXP);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_B8G8_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, G8R8_G8B8_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_UNORM_SRGB);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_UNORM_SRGB);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_UNORM_SRGB);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_SNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_SNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B5G6R5_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B5G5R5A1_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8X8_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10_XR_BIAS_A2_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_UNORM_SRGB);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8X8_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8X8_UNORM_SRGB);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_UF16);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_SF16);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_TYPELESS);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_UNORM_SRGB);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, AYUV);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, Y410);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, Y416);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, NV12);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, P010);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, P016);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, 420_OPAQUE);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, YUY2);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, Y210);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, Y216);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, NV11);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, AI44);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, IA44);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, P8);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, A8P8);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, B4G4R4A4_UNORM);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, P208);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, V208);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, V408);
+ D3D11_DXGI_ENUM(DXGI_FORMAT_, FORCE_UINT);
+ }
+
+ return "<unknown>";
+}
+
+const char *pl_get_dxgi_csp_name(DXGI_COLOR_SPACE_TYPE csp)
+{
+ switch ((int) csp) {
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G22_NONE_P709);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G10_NONE_P709);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G22_NONE_P709);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G22_NONE_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RESERVED);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_NONE_P709_X601);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P601);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P601);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P709);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P709);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G2084_NONE_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G2084_LEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G2084_NONE_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_TOPLEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G2084_TOPLEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G22_NONE_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_GHLG_TOPLEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_GHLG_TOPLEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G24_NONE_P709);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G24_NONE_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_LEFT_P709);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_LEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_TOPLEFT_P2020);
+ D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, CUSTOM);
+ }
+
+ return "<unknown>";
+}
diff --git a/src/d3d11/utils.h b/src/d3d11/utils.h
new file mode 100644
index 0000000..86b4072
--- /dev/null
+++ b/src/d3d11/utils.h
@@ -0,0 +1,88 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#define DXGI_COLOR_SPACE_RGB_STUDIO_G24_NONE_P709 ((DXGI_COLOR_SPACE_TYPE)20)
+#define DXGI_COLOR_SPACE_RGB_STUDIO_G24_NONE_P2020 ((DXGI_COLOR_SPACE_TYPE)21)
+#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_LEFT_P709 ((DXGI_COLOR_SPACE_TYPE)22)
+#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_LEFT_P2020 ((DXGI_COLOR_SPACE_TYPE)23)
+#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_TOPLEFT_P2020 ((DXGI_COLOR_SPACE_TYPE)24)
+
+// Flush debug messages from D3D11's info queue to libplacebo's log output.
+// Should be called regularly.
+void pl_d3d11_flush_message_queue(struct d3d11_ctx *ctx, const char *header);
+
+// Some D3D11 functions can fail with a set of HRESULT codes which indicate the
+// device has been removed. This is equivalent to libplacebo's gpu_is_failed
+// state and indicates that the pl_gpu needs to be recreated. This function
+// checks for one of those HRESULTs, sets the failed state, and returns a
+// specific HRESULT that indicates why the device was removed (e.g. GPU hang,
+// driver crash, etc.)
+HRESULT pl_d3d11_check_device_removed(struct d3d11_ctx *ctx, HRESULT hr);
+
+// Helper function for the D3D() macro, though it can be called directly when
+// handling D3D11 errors if the D3D() macro isn't suitable for some reason.
+// Calls `pl_d3d11_check_device_removed` and `pl_d3d11_flush_message_queue` and
+// returns the specific HRESULT from `pl_d3d11_check_device_removed` for logging
+// purposes.
+HRESULT pl_d3d11_after_error(struct d3d11_ctx *ctx, HRESULT hr);
+
+// Convenience macro for running DXGI/D3D11 functions and performing appropriate
+// actions on failure. Can also be used for any HRESULT-returning function.
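+// Requires a `struct d3d11_ctx *ctx` in scope and an `error:` label to jump
+// to on failure. Example usage, as seen in e.g. `get_backbuffer`:
+//
+//   D3D(IDXGISwapChain_GetBuffer(p->swapchain, 0, &IID_ID3D11Texture2D,
+//                                (void **) &backbuffer));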
+#define D3D(call) \
+ do { \
+ HRESULT hr_ = (call); \
+ if (FAILED(hr_)) { \
+ hr_ = pl_d3d11_after_error(ctx, hr_); \
+ PL_ERR(ctx, "%s: %s (%s:%d)", #call, pl_hresult_to_str(hr_), \
+ __FILE__, __LINE__); \
+ goto error; \
+ } \
+ } while (0);
+
+// Conditionally release a COM interface and set the pointer to NULL
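+// (e.g. `SAFE_RELEASE(swapchain4);`). The interface is only released when
+// non-NULL and the pointer is always reset, so it is safe to call
+// unconditionally on cleanup paths.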
+#define SAFE_RELEASE(iface) \
+ do { \
+ if (iface) \
+ (iface)->lpVtbl->Release(iface); \
+ (iface) = NULL; \
+ } while (0)
+
+struct dll_version {
+ uint16_t major;
+ uint16_t minor;
+ uint16_t build;
+ uint16_t revision;
+};
+
+// Get the version number of a DLL. This calls GetFileVersionInfoW, which should
+// call LoadLibraryExW internally, so it should get the same copy of the DLL
+// that is loaded into memory if there is a copy in System32 and a copy in the
+// %PATH% or application directory.
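+//
+// Usage sketch (hypothetical DLL name; all fields are 0 if the query fails):
+//
+//   struct dll_version ver = pl_get_dll_version(L"dxgi.dll");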
+struct dll_version pl_get_dll_version(const wchar_t *name);
+
+wchar_t *pl_from_utf8(void *ctx, const char *str);
+char *pl_to_utf8(void *ctx, const wchar_t *str);
+
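+// Convert an HRESULT into a human-readable string for log output. Example
+// usage, as seen in e.g. `d3d11_sw_resize`:
+//
+//   PL_ERR(sw, "Reconfiguring the swapchain failed with error: %s",
+//          pl_hresult_to_str(hr));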
+#define pl_hresult_to_str(hr) pl_hresult_to_str_buf((char[256]){0}, 256, (hr))
+char *pl_hresult_to_str_buf(char *buf, size_t buf_size, HRESULT hr);
+
+const char *pl_get_dxgi_csp_name(DXGI_COLOR_SPACE_TYPE csp);
+const char *pl_get_dxgi_format_name(DXGI_FORMAT fmt);
diff --git a/src/dispatch.c b/src/dispatch.c
new file mode 100644
index 0000000..308dd56
--- /dev/null
+++ b/src/dispatch.c
@@ -0,0 +1,1615 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "log.h"
+#include "shaders.h"
+#include "dispatch.h"
+#include "gpu.h"
+#include "pl_thread.h"
+
+// Maximum number of passes to keep around at once. If full, passes older than
+// MIN_AGE are evicted to make room. (Failing that, the passes array doubles)
+#define MAX_PASSES 100
+#define MIN_AGE 10
+
+enum {
+ TMP_PRELUDE, // GLSL version, global definitions, etc.
+ TMP_MAIN, // main GLSL shader body
+ TMP_VERT_HEAD, // vertex shader inputs/outputs
+ TMP_VERT_BODY, // vertex shader body
+ TMP_COUNT,
+};
+
+struct pl_dispatch_t {
+ pl_mutex lock;
+ pl_log log;
+ pl_gpu gpu;
+ uint8_t current_ident;
+ uint8_t current_index;
+ bool dynamic_constants;
+ int max_passes;
+
+ void (*info_callback)(void *, const struct pl_dispatch_info *);
+ void *info_priv;
+
+ PL_ARRAY(pl_shader) shaders; // to avoid re-allocations
+ PL_ARRAY(struct pass *) passes; // compiled passes
+
+    // temporary buffers to help avoid re-allocations during pass creation
+ PL_ARRAY(const struct pl_buffer_var *) buf_tmp;
+ pl_str_builder tmp[TMP_COUNT];
+ uint8_t *ubo_tmp;
+};
+
+enum pass_var_type {
+ PASS_VAR_NONE = 0,
+ PASS_VAR_GLOBAL, // regular/global uniforms
+ PASS_VAR_UBO, // uniform buffers
+ PASS_VAR_PUSHC // push constants
+};
+
+// Cached metadata about a variable's effective placement / update method
+struct pass_var {
+ int index; // for pl_var_update
+ enum pass_var_type type;
+ struct pl_var_layout layout;
+ void *cached_data;
+};
+
+struct pass {
+ uint64_t signature;
+ pl_pass pass;
+ int last_index;
+
+ // contains cached data and update metadata, same order as pl_shader
+ struct pass_var *vars;
+ int num_var_locs;
+
+ // for uniform buffer updates
+ struct pl_shader_desc ubo_desc; // temporary
+ int ubo_index;
+ pl_buf ubo;
+
+ // Cached pl_pass_run_params. This will also contain mutable allocations
+ // for the push constants, descriptor bindings (including the binding for
+ // the UBO pre-filled), vertex array and variable updates
+ struct pl_pass_run_params run_params;
+
+ // for pl_dispatch_info
+ pl_timer timer;
+ uint64_t ts_last;
+ uint64_t ts_peak;
+ uint64_t ts_sum;
+ uint64_t samples[PL_ARRAY_SIZE(((struct pl_dispatch_info *) NULL)->samples)];
+ int ts_idx;
+};
+
+static void pass_destroy(pl_dispatch dp, struct pass *pass)
+{
+ if (!pass)
+ return;
+
+ pl_buf_destroy(dp->gpu, &pass->ubo);
+ pl_pass_destroy(dp->gpu, &pass->pass);
+ pl_timer_destroy(dp->gpu, &pass->timer);
+ pl_free(pass);
+}
+
+pl_dispatch pl_dispatch_create(pl_log log, pl_gpu gpu)
+{
+ struct pl_dispatch_t *dp = pl_zalloc_ptr(NULL, dp);
+ pl_mutex_init(&dp->lock);
+ dp->log = log;
+ dp->gpu = gpu;
+ dp->max_passes = MAX_PASSES;
+ for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
+ dp->tmp[i] = pl_str_builder_alloc(dp);
+
+ return dp;
+}
+
+void pl_dispatch_destroy(pl_dispatch *ptr)
+{
+ pl_dispatch dp = *ptr;
+ if (!dp)
+ return;
+
+ for (int i = 0; i < dp->passes.num; i++)
+ pass_destroy(dp, dp->passes.elem[i]);
+ for (int i = 0; i < dp->shaders.num; i++)
+ pl_shader_free(&dp->shaders.elem[i]);
+
+ pl_mutex_destroy(&dp->lock);
+ pl_free(dp);
+ *ptr = NULL;
+}
+
+pl_shader pl_dispatch_begin_ex(pl_dispatch dp, bool unique)
+{
+ pl_mutex_lock(&dp->lock);
+
+ struct pl_shader_params params = {
+ .id = unique ? dp->current_ident++ : 0,
+ .gpu = dp->gpu,
+ .index = dp->current_index,
+ .dynamic_constants = dp->dynamic_constants,
+ };
+
+ pl_shader sh = NULL;
+ PL_ARRAY_POP(dp->shaders, &sh);
+ pl_mutex_unlock(&dp->lock);
+
+ if (sh) {
+ pl_shader_reset(sh, &params);
+ return sh;
+ }
+
+ return pl_shader_alloc(dp->log, &params);
+}
+
+void pl_dispatch_mark_dynamic(pl_dispatch dp, bool dynamic)
+{
+ dp->dynamic_constants = dynamic;
+}
+
+void pl_dispatch_callback(pl_dispatch dp, void *priv,
+ void (*cb)(void *priv, const struct pl_dispatch_info *))
+{
+ dp->info_callback = cb;
+ dp->info_priv = priv;
+}
+
+pl_shader pl_dispatch_begin(pl_dispatch dp)
+{
+ return pl_dispatch_begin_ex(dp, false);
+}
+
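+// Decide how a shader input variable gets delivered to the pass. Roughly, the
+// order of preference is: push constants (favoring small and/or dynamic
+// values), then a uniform buffer, then plain global uniforms, falling back as
+// each size/count budget is exhausted.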
+static bool add_pass_var(pl_dispatch dp, void *tmp, struct pass *pass,
+ struct pl_pass_params *params,
+ const struct pl_shader_var *sv, struct pass_var *pv,
+ bool greedy)
+{
+ pl_gpu gpu = dp->gpu;
+ if (pv->type)
+ return true;
+
+ // Try not to use push constants for "large" values like matrices in the
+ // first pass, since this is likely to exceed the VGPR/pushc size budgets
+ bool try_pushc = greedy || (sv->var.dim_m == 1 && sv->var.dim_a == 1) || sv->dynamic;
+ if (try_pushc && gpu->glsl.vulkan && gpu->limits.max_pushc_size) {
+ pv->layout = pl_std430_layout(params->push_constants_size, &sv->var);
+ size_t new_size = pv->layout.offset + pv->layout.size;
+ if (new_size <= gpu->limits.max_pushc_size) {
+ params->push_constants_size = new_size;
+ pv->type = PASS_VAR_PUSHC;
+ return true;
+ }
+ }
+
+ // If we haven't placed all PCs yet, don't place anything else, since
+ // we want to try and fit more stuff into PCs before "giving up"
+ if (!greedy)
+ return true;
+
+ int num_locs = sv->var.dim_v * sv->var.dim_m * sv->var.dim_a;
+ bool can_var = pass->num_var_locs + num_locs <= gpu->limits.max_variable_comps;
+
+    // Attempt using a uniform buffer next. The GLSL version 440 check is due
+    // to explicit offsets on UBO entries. In theory we could leave out the
+    // offsets and support UBOs for older GL as well, but this is a nice
+    // safety net for driver bugs (and also rules out potentially buggy drivers).
+    // Also avoid UBOs for highly dynamic stuff, since that requires
+    // synchronizing the UBO writes every frame.
+ bool try_ubo = !can_var || !sv->dynamic;
+ if (try_ubo && gpu->glsl.version >= 440 && gpu->limits.max_ubo_size) {
+ if (sh_buf_desc_append(tmp, gpu, &pass->ubo_desc, &pv->layout, sv->var)) {
+ pv->type = PASS_VAR_UBO;
+ return true;
+ }
+ }
+
+ // Otherwise, use global uniforms
+ if (can_var) {
+ pv->type = PASS_VAR_GLOBAL;
+ pv->index = params->num_variables;
+ pv->layout = pl_var_host_layout(0, &sv->var);
+ PL_ARRAY_APPEND_RAW(tmp, params->variables, params->num_variables, sv->var);
+ pass->num_var_locs += num_locs;
+ return true;
+ }
+
+ // Ran out of variable binding methods. The most likely scenario in which
+ // this can happen is if we're using a GPU that does not support global
+ // input vars and we've exhausted the UBO size limits.
+ PL_ERR(dp, "Unable to add input variable: possibly exhausted "
+ "variable count / UBO size limits?");
+ return false;
+}
+
+#define ADD(b, ...) pl_str_builder_addf(b, __VA_ARGS__)
+#define ADD_CAT(b, cat) pl_str_builder_concat(b, cat)
+#define ADD_CONST(b, s) pl_str_builder_const_str(b, s)
+
+static void add_var(pl_str_builder body, const struct pl_var *var)
+{
+ const char *type = pl_var_glsl_type_name(*var);
+ if (var->dim_a > 1) {
+ ADD(body, "%s "$"[%d];\n", type, sh_ident_unpack(var->name), var->dim_a);
+ } else {
+ ADD(body, "%s "$";\n", type, sh_ident_unpack(var->name));
+ }
+}
+
+static int cmp_buffer_var(const void *pa, const void *pb)
+{
+ const struct pl_buffer_var * const *a = pa, * const *b = pb;
+ return PL_CMP((*a)->layout.offset, (*b)->layout.offset);
+}
+
+static void add_buffer_vars(pl_dispatch dp, void *tmp, pl_str_builder body,
+ const struct pl_buffer_var *vars, int num)
+{
+ // Sort buffer vars by offset
+ PL_ARRAY_RESIZE(dp, dp->buf_tmp, num);
+ for (int i = 0; i < num; i++)
+ dp->buf_tmp.elem[i] = &vars[i];
+ qsort(dp->buf_tmp.elem, num, sizeof(&vars[0]), cmp_buffer_var);
+
+ ADD(body, "{\n");
+ for (int i = 0; i < num; i++) {
+ const struct pl_buffer_var *bv = dp->buf_tmp.elem[i];
+ // Add an explicit offset wherever possible
+ if (dp->gpu->glsl.version >= 440)
+ ADD(body, " layout(offset=%zu) ", bv->layout.offset);
+ add_var(body, &bv->var);
+ }
+ ADD(body, "};\n");
+}
+
+struct generate_params {
+ void *tmp;
+ pl_shader sh;
+ struct pass *pass;
+ struct pl_pass_params *pass_params;
+ ident_t out_mat;
+ ident_t out_off;
+ int vert_idx;
+};
+
+static void generate_shaders(pl_dispatch dp,
+ const struct generate_params *params,
+ pl_str_builder *out_vert_builder,
+ pl_str_builder *out_glsl_builder)
+{
+ pl_gpu gpu = dp->gpu;
+ pl_shader sh = params->sh;
+ void *tmp = params->tmp;
+ struct pass *pass = params->pass;
+ struct pl_pass_params *pass_params = params->pass_params;
+ pl_str_builder shader_body = sh_finalize_internal(sh);
+
+ pl_str_builder pre = dp->tmp[TMP_PRELUDE];
+ ADD(pre, "#version %d%s\n", gpu->glsl.version,
+ (gpu->glsl.gles && gpu->glsl.version > 100) ? " es" : "");
+ if (pass_params->type == PL_PASS_COMPUTE)
+ ADD(pre, "#extension GL_ARB_compute_shader : enable\n");
+
+ // Enable this unconditionally if the GPU supports it, since we have no way
+ // of knowing whether subgroups are being used or not
+ if (gpu->glsl.subgroup_size) {
+ ADD(pre, "#extension GL_KHR_shader_subgroup_basic : enable \n"
+ "#extension GL_KHR_shader_subgroup_vote : enable \n"
+ "#extension GL_KHR_shader_subgroup_arithmetic : enable \n"
+ "#extension GL_KHR_shader_subgroup_ballot : enable \n"
+ "#extension GL_KHR_shader_subgroup_shuffle : enable \n"
+ "#extension GL_KHR_shader_subgroup_clustered : enable \n"
+ "#extension GL_KHR_shader_subgroup_quad : enable \n");
+ }
+
+ // Enable all extensions needed for different types of input
+ bool has_ssbo = false, has_ubo = false, has_img = false, has_texel = false,
+ has_ext = false, has_nofmt = false, has_gather = false;
+ for (int i = 0; i < sh->descs.num; i++) {
+ switch (sh->descs.elem[i].desc.type) {
+ case PL_DESC_BUF_UNIFORM: has_ubo = true; break;
+ case PL_DESC_BUF_STORAGE: has_ssbo = true; break;
+ case PL_DESC_BUF_TEXEL_UNIFORM: has_texel = true; break;
+ case PL_DESC_BUF_TEXEL_STORAGE: {
+ pl_buf buf = sh->descs.elem[i].binding.object;
+ has_nofmt |= !buf->params.format->glsl_format;
+ has_texel = true;
+ break;
+ }
+ case PL_DESC_STORAGE_IMG: {
+ pl_tex tex = sh->descs.elem[i].binding.object;
+ has_nofmt |= !tex->params.format->glsl_format;
+ has_img = true;
+ break;
+ }
+ case PL_DESC_SAMPLED_TEX: {
+ pl_tex tex = sh->descs.elem[i].binding.object;
+ has_gather |= tex->params.format->gatherable;
+ switch (tex->sampler_type) {
+ case PL_SAMPLER_NORMAL: break;
+ case PL_SAMPLER_RECT: break;
+ case PL_SAMPLER_EXTERNAL: has_ext = true; break;
+ case PL_SAMPLER_TYPE_COUNT: pl_unreachable();
+ }
+ break;
+ }
+
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+ }
+
+ if (has_img)
+ ADD(pre, "#extension GL_ARB_shader_image_load_store : enable\n");
+ if (has_ubo)
+ ADD(pre, "#extension GL_ARB_uniform_buffer_object : enable\n");
+ if (has_ssbo)
+ ADD(pre, "#extension GL_ARB_shader_storage_buffer_object : enable\n");
+ if (has_texel)
+ ADD(pre, "#extension GL_ARB_texture_buffer_object : enable\n");
+ if (has_ext) {
+ if (gpu->glsl.version >= 300) {
+ ADD(pre, "#extension GL_OES_EGL_image_external_essl3 : enable\n");
+ } else {
+ ADD(pre, "#extension GL_OES_EGL_image_external : enable\n");
+ }
+ }
+ if (has_nofmt)
+ ADD(pre, "#extension GL_EXT_shader_image_load_formatted : enable\n");
+ if (has_gather)
+ ADD(pre, "#extension GL_ARB_texture_gather : enable\n");
+
+ if (gpu->glsl.gles) {
+ // Use 32-bit precision for floats if possible
+ ADD(pre, "#ifdef GL_FRAGMENT_PRECISION_HIGH \n"
+ "precision highp float; \n"
+ "#else \n"
+ "precision mediump float; \n"
+ "#endif \n");
+
+ // Always use 16-bit precision for samplers
+ ADD(pre, "precision mediump sampler2D; \n");
+ if (gpu->limits.max_tex_1d_dim)
+ ADD(pre, "precision mediump sampler1D; \n");
+ if (gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100)
+ ADD(pre, "precision mediump sampler3D; \n");
+
+ // Integer math has a good chance of caring about precision
+ ADD(pre, "precision highp int; \n");
+ }
+
+    // textureLod() doesn't work on external/rect samplers, so simply disable
+    // LOD sampling in this case. We don't currently support mipmaps anyway.
+ for (int i = 0; i < sh->descs.num; i++) {
+ if (pass_params->descriptors[i].type != PL_DESC_SAMPLED_TEX)
+ continue;
+ pl_tex tex = sh->descs.elem[i].binding.object;
+ if (tex->sampler_type != PL_SAMPLER_NORMAL) {
+ ADD(pre, "#define textureLod(t, p, b) texture(t, p) \n"
+ "#define textureLodOffset(t, p, b, o) \\\n"
+ " textureOffset(t, p, o) \n");
+ break;
+ }
+ }
+
+ // Add all of the push constants as their own element
+ if (pass_params->push_constants_size) {
+        // We re-use add_buffer_vars to make sure variables are sorted; this
+        // is important because the push constants can be out-of-order in
+        // `pass->vars`
+ PL_ARRAY(struct pl_buffer_var) pc_bvars = {0};
+ for (int i = 0; i < sh->vars.num; i++) {
+ if (pass->vars[i].type != PASS_VAR_PUSHC)
+ continue;
+
+ PL_ARRAY_APPEND(tmp, pc_bvars, (struct pl_buffer_var) {
+ .var = sh->vars.elem[i].var,
+ .layout = pass->vars[i].layout,
+ });
+ }
+
+ ADD(pre, "layout(std430, push_constant) uniform PushC ");
+ add_buffer_vars(dp, tmp, pre, pc_bvars.elem, pc_bvars.num);
+ }
+
+ // Add all of the specialization constants
+ for (int i = 0; i < sh->consts.num; i++) {
+ static const char *types[PL_VAR_TYPE_COUNT] = {
+ [PL_VAR_SINT] = "int",
+ [PL_VAR_UINT] = "uint",
+ [PL_VAR_FLOAT] = "float",
+ };
+
+ const struct pl_shader_const *sc = &sh->consts.elem[i];
+ ADD(pre, "layout(constant_id=%"PRIu32") const %s "$" = 1; \n",
+ pass_params->constants[i].id, types[sc->type],
+ sh_ident_unpack(sc->name));
+ }
+
+ static const char sampler_prefixes[PL_FMT_TYPE_COUNT] = {
+ [PL_FMT_FLOAT] = ' ',
+ [PL_FMT_UNORM] = ' ',
+ [PL_FMT_SNORM] = ' ',
+ [PL_FMT_UINT] = 'u',
+ [PL_FMT_SINT] = 'i',
+ };
+
+ // Add all of the required descriptors
+ for (int i = 0; i < sh->descs.num; i++) {
+ const struct pl_shader_desc *sd = &sh->descs.elem[i];
+ const struct pl_desc *desc = &pass_params->descriptors[i];
+
+ switch (desc->type) {
+ case PL_DESC_SAMPLED_TEX: {
+ static const char *types[][4] = {
+ [PL_SAMPLER_NORMAL][1] = "sampler1D",
+ [PL_SAMPLER_NORMAL][2] = "sampler2D",
+ [PL_SAMPLER_NORMAL][3] = "sampler3D",
+ [PL_SAMPLER_RECT][2] = "sampler2DRect",
+ [PL_SAMPLER_EXTERNAL][2] = "samplerExternalOES",
+ };
+
+ pl_tex tex = sd->binding.object;
+ int dims = pl_tex_params_dimension(tex->params);
+ const char *type = types[tex->sampler_type][dims];
+ char prefix = sampler_prefixes[tex->params.format->type];
+ ident_t id = sh_ident_unpack(desc->name);
+ pl_assert(type && prefix);
+
+ // Vulkan requires explicit bindings; GL always sets the
+ // bindings manually to avoid relying on the user doing so
+ if (gpu->glsl.vulkan) {
+ ADD(pre, "layout(binding=%d) uniform %c%s "$";\n",
+ desc->binding, prefix, type, id);
+ } else if (gpu->glsl.gles && prefix != ' ') {
+ ADD(pre, "uniform highp %c%s "$";\n", prefix, type, id);
+ } else {
+ ADD(pre, "uniform %c%s "$";\n", prefix, type, id);
+ }
+ break;
+ }
+
+ case PL_DESC_STORAGE_IMG: {
+ static const char *types[] = {
+ [1] = "image1D",
+ [2] = "image2D",
+ [3] = "image3D",
+ };
+
+ // For better compatibility, we have to explicitly label the
+ // type of data we will be reading/writing to this image.
+ pl_tex tex = sd->binding.object;
+ const char *format = tex->params.format->glsl_format;
+ int dims = pl_tex_params_dimension(tex->params);
+ if (gpu->glsl.vulkan) {
+ if (format) {
+ ADD(pre, "layout(binding=%d, %s) ", desc->binding, format);
+ } else {
+ ADD(pre, "layout(binding=%d) ", desc->binding);
+ }
+ } else if (format) {
+ ADD(pre, "layout(%s) ", format);
+ }
+
+ ADD_CONST(pre, pl_desc_access_glsl_name(desc->access));
+ if (sd->memory & PL_MEMORY_COHERENT)
+ ADD(pre, " coherent");
+ if (sd->memory & PL_MEMORY_VOLATILE)
+ ADD(pre, " volatile");
+ ADD(pre, " restrict uniform %s "$";\n",
+ types[dims], sh_ident_unpack(desc->name));
+ break;
+ }
+
+ case PL_DESC_BUF_UNIFORM:
+ if (gpu->glsl.vulkan) {
+ ADD(pre, "layout(std140, binding=%d) ", desc->binding);
+ } else {
+ ADD(pre, "layout(std140) ");
+ }
+ ADD(pre, "uniform "$" ", sh_ident_unpack(desc->name));
+ add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars);
+ break;
+
+ case PL_DESC_BUF_STORAGE:
+ if (gpu->glsl.version >= 140)
+ ADD(pre, "layout(std430, binding=%d) ", desc->binding);
+ ADD_CONST(pre, pl_desc_access_glsl_name(desc->access));
+ if (sd->memory & PL_MEMORY_COHERENT)
+ ADD(pre, " coherent");
+ if (sd->memory & PL_MEMORY_VOLATILE)
+ ADD(pre, " volatile");
+ ADD(pre, " restrict buffer "$" ", sh_ident_unpack(desc->name));
+ add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars);
+ break;
+
+ case PL_DESC_BUF_TEXEL_UNIFORM: {
+ pl_buf buf = sd->binding.object;
+ char prefix = sampler_prefixes[buf->params.format->type];
+ if (gpu->glsl.vulkan)
+ ADD(pre, "layout(binding=%d) ", desc->binding);
+ ADD(pre, "uniform %csamplerBuffer "$";\n", prefix,
+ sh_ident_unpack(desc->name));
+ break;
+ }
+
+ case PL_DESC_BUF_TEXEL_STORAGE: {
+ pl_buf buf = sd->binding.object;
+ const char *format = buf->params.format->glsl_format;
+ char prefix = sampler_prefixes[buf->params.format->type];
+ if (gpu->glsl.vulkan) {
+ if (format) {
+ ADD(pre, "layout(binding=%d, %s) ", desc->binding, format);
+ } else {
+ ADD(pre, "layout(binding=%d) ", desc->binding);
+ }
+ } else if (format) {
+ ADD(pre, "layout(%s) ", format);
+ }
+
+ ADD_CONST(pre, pl_desc_access_glsl_name(desc->access));
+ if (sd->memory & PL_MEMORY_COHERENT)
+ ADD(pre, " coherent");
+ if (sd->memory & PL_MEMORY_VOLATILE)
+ ADD(pre, " volatile");
+ ADD(pre, " restrict uniform %cimageBuffer "$";\n",
+ prefix, sh_ident_unpack(desc->name));
+ break;
+ }
+
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+ }
+
+ // Add all of the remaining variables
+ for (int i = 0; i < sh->vars.num; i++) {
+ const struct pl_var *var = &sh->vars.elem[i].var;
+ const struct pass_var *pv = &pass->vars[i];
+ if (pv->type != PASS_VAR_GLOBAL)
+ continue;
+ ADD(pre, "uniform ");
+ add_var(pre, var);
+ }
+
+ pl_str_builder glsl = dp->tmp[TMP_MAIN];
+ ADD_CAT(glsl, pre);
+
+    switch (pass_params->type) {
+ case PL_PASS_RASTER: {
+ pl_assert(params->vert_idx >= 0);
+ pl_str_builder vert_head = dp->tmp[TMP_VERT_HEAD];
+ pl_str_builder vert_body = dp->tmp[TMP_VERT_BODY];
+
+ // Older GLSL doesn't support the use of explicit locations
+ bool has_loc = gpu->glsl.version >= 430;
+
+ // Set up a trivial vertex shader
+ ADD_CAT(vert_head, pre);
+ ADD(vert_body, "void main() {\n");
+ for (int i = 0; i < sh->vas.num; i++) {
+ const struct pl_vertex_attrib *va = &pass_params->vertex_attribs[i];
+ const struct pl_shader_va *sva = &sh->vas.elem[i];
+ const char *type = va->fmt->glsl_type;
+
+ // Use the pl_shader_va for the name in the fragment shader since
+ // the pl_vertex_attrib is already mangled for the vertex shader
+ ident_t id = sh_ident_unpack(sva->attr.name);
+
+ if (has_loc) {
+ ADD(vert_head, "layout(location=%d) in %s "$";\n",
+ va->location, type, sh_ident_unpack(va->name));
+ } else {
+ ADD(vert_head, "in %s "$";\n", type, sh_ident_unpack(va->name));
+ }
+
+ if (i == params->vert_idx) {
+ pl_assert(va->fmt->num_components == 2);
+ ADD(vert_body, "vec2 va_pos = "$"; \n", sh_ident_unpack(va->name));
+ if (params->out_mat)
+ ADD(vert_body, "va_pos = "$" * va_pos; \n", params->out_mat);
+ if (params->out_off)
+ ADD(vert_body, "va_pos += "$"; \n", params->out_off);
+ ADD(vert_body, "gl_Position = vec4(va_pos, 0.0, 1.0); \n");
+ } else {
+ // Everything else is just blindly passed through
+ if (has_loc) {
+ ADD(vert_head, "layout(location=%d) out %s "$";\n",
+ va->location, type, id);
+ ADD(glsl, "layout(location=%d) in %s "$";\n",
+ va->location, type, id);
+ } else {
+ ADD(vert_head, "out %s "$";\n", type, id);
+ ADD(glsl, "in %s "$";\n", type, id);
+ }
+ ADD(vert_body, $" = "$";\n", id, sh_ident_unpack(va->name));
+ }
+ }
+
+ ADD(vert_body, "}");
+ ADD_CAT(vert_head, vert_body);
+ pl_hash_merge(&pass->signature, pl_str_builder_hash(vert_head));
+ *out_vert_builder = vert_head;
+
+ if (has_loc) {
+ ADD(glsl, "layout(location=0) out vec4 out_color;\n");
+ } else {
+ ADD(glsl, "out vec4 out_color;\n");
+ }
+ break;
+ }
+ case PL_PASS_COMPUTE:
+ ADD(glsl, "layout (local_size_x = %d, local_size_y = %d) in;\n",
+ sh->group_size[0], sh->group_size[1]);
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ // Set up the main shader body
+ ADD_CAT(glsl, shader_body);
+ ADD(glsl, "void main() {\n");
+
+ pl_assert(sh->input == PL_SHADER_SIG_NONE);
+ switch (pass_params->type) {
+ case PL_PASS_RASTER:
+ pl_assert(sh->output == PL_SHADER_SIG_COLOR);
+ ADD(glsl, "out_color = "$"();\n", sh->name);
+ break;
+ case PL_PASS_COMPUTE:
+ ADD(glsl, $"();\n", sh->name);
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ ADD(glsl, "}");
+
+ pl_hash_merge(&pass->signature, pl_str_builder_hash(glsl));
+ *out_glsl_builder = glsl;
+}
+
+#undef ADD
+#undef ADD_CAT
+
+#define pass_age(pass) (dp->current_index - (pass)->last_index)
+
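+// Sort comparator: most recently used passes (highest last_index) first,
+// oldest passes last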
+static int cmp_pass_age(const void *ptra, const void *ptrb)
+{
+ const struct pass *a = *(const struct pass **) ptra;
+ const struct pass *b = *(const struct pass **) ptrb;
+ return b->last_index - a->last_index;
+}
+
+static void garbage_collect_passes(pl_dispatch dp)
+{
+ if (dp->passes.num <= dp->max_passes)
+ return;
+
+    // Garbage collect the oldest passes: only evict passes that are at least
+    // MIN_AGE old, and never more than the oldest half of the cache
+ qsort(dp->passes.elem, dp->passes.num, sizeof(struct pass *), cmp_pass_age);
+ int idx = dp->passes.num / 2;
+ while (idx < dp->passes.num && pass_age(dp->passes.elem[idx]) < MIN_AGE)
+ idx++;
+
+ for (int i = idx; i < dp->passes.num; i++)
+ pass_destroy(dp, dp->passes.elem[i]);
+
+ int num_evicted = dp->passes.num - idx;
+ dp->passes.num = idx;
+
+ if (num_evicted) {
+ PL_DEBUG(dp, "Evicted %d passes from dispatch cache, consider "
+ "using more dynamic shaders", num_evicted);
+ } else {
+ dp->max_passes *= 2;
+ }
+}
+
+static struct pass *finalize_pass(pl_dispatch dp, pl_shader sh,
+ pl_tex target, int vert_idx,
+ const struct pl_blend_params *blend, bool load,
+ const struct pl_dispatch_vertex_params *vparams,
+ const pl_transform2x2 *proj)
+{
+ struct pass *pass = pl_alloc_ptr(dp, pass);
+ *pass = (struct pass) {
+ .signature = 0x0, // updated incrementally below
+ .last_index = dp->current_index,
+ .ubo_desc = {
+ .desc = {
+ .name = sh_ident_pack(sh_fresh(sh, "UBO")),
+ .type = PL_DESC_BUF_UNIFORM,
+ },
+ },
+ };
+
+ // For identifiers tied to the lifetime of this shader
+ void *tmp = sh->tmp;
+
+ struct pl_pass_params params = {
+ .type = pl_shader_is_compute(sh) ? PL_PASS_COMPUTE : PL_PASS_RASTER,
+ .num_descriptors = sh->descs.num,
+ .vertex_type = vparams ? vparams->vertex_type : PL_PRIM_TRIANGLE_STRIP,
+ .vertex_stride = vparams ? vparams->vertex_stride : 0,
+ .blend_params = blend,
+ };
+
+ struct generate_params gen_params = {
+ .tmp = tmp,
+ .pass = pass,
+ .pass_params = &params,
+ .sh = sh,
+ .vert_idx = vert_idx,
+ };
+
+ if (params.type == PL_PASS_RASTER) {
+ assert(target);
+ params.target_format = target->params.format;
+ params.load_target = load;
+
+ // Fill in the vertex attributes array
+ params.num_vertex_attribs = sh->vas.num;
+ params.vertex_attribs = pl_calloc_ptr(tmp, sh->vas.num, params.vertex_attribs);
+
+ int va_loc = 0;
+ for (int i = 0; i < sh->vas.num; i++) {
+ struct pl_vertex_attrib *va = &params.vertex_attribs[i];
+ *va = sh->vas.elem[i].attr;
+
+            // Mangle the name to make sure it doesn't conflict with the
+            // fragment shader input; it will be converted back to a legal
+            // string by the shader compilation code
+ va->name = sh_ident_pack(sh_fresh(sh, "va"));
+
+ // Place the vertex attribute
+ va->location = va_loc;
+ if (!vparams) {
+ va->offset = params.vertex_stride;
+ params.vertex_stride += va->fmt->texel_size;
+ }
+
+ // The number of vertex attribute locations consumed by a vertex
+ // attribute is the number of vec4s it consumes, rounded up
+ const size_t va_loc_size = sizeof(float[4]);
+ va_loc += PL_DIV_UP(va->fmt->texel_size, va_loc_size);
+ }
+
+ // Hash in the raster state configuration
+ pl_hash_merge(&pass->signature, (uint64_t) params.vertex_type);
+ pl_hash_merge(&pass->signature, (uint64_t) params.vertex_stride);
+ pl_hash_merge(&pass->signature, (uint64_t) params.load_target);
+ pl_hash_merge(&pass->signature, target->params.format->signature);
+ if (blend) {
+ pl_static_assert(sizeof(*blend) == sizeof(enum pl_blend_mode) * 4);
+ pl_hash_merge(&pass->signature, pl_var_hash(*blend));
+ }
+
+ // Load projection matrix if required
+ if (proj && memcmp(&proj->mat, &pl_matrix2x2_identity, sizeof(proj->mat)) != 0) {
+ gen_params.out_mat = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat2("proj"),
+ .data = PL_TRANSPOSE_2X2(proj->mat.m),
+ });
+ }
+
+ if (proj && (proj->c[0] || proj->c[1])) {
+ gen_params.out_off = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("offset"),
+ .data = proj->c,
+ });
+ }
+ }
+
+ // Place all of the compile-time constants
+ uint8_t *constant_data = NULL;
+ if (sh->consts.num) {
+ params.num_constants = sh->consts.num;
+ params.constants = pl_alloc(tmp, sh->consts.num * sizeof(struct pl_constant));
+
+ // Compute offsets
+ size_t total_size = 0;
+ uint32_t const_id = 0;
+ for (int i = 0; i < sh->consts.num; i++) {
+ params.constants[i] = (struct pl_constant) {
+ .type = sh->consts.elem[i].type,
+ .id = const_id++,
+ .offset = total_size,
+ };
+ total_size += pl_var_type_size(sh->consts.elem[i].type);
+ }
+
+ // Write values into the constants buffer
+ params.constant_data = constant_data = pl_alloc(pass, total_size);
+ for (int i = 0; i < sh->consts.num; i++) {
+ const struct pl_shader_const *sc = &sh->consts.elem[i];
+ void *data = constant_data + params.constants[i].offset;
+ memcpy(data, sc->data, pl_var_type_size(sc->type));
+ }
+ }
+
+ // Place all the variables; these will dynamically end up in different
+ // locations based on what the underlying GPU supports (UBOs, pushc, etc.)
+ //
+    // We go through the list twice: once to place the variables we definitely
+    // want inside push constants, and a second time to opportunistically
+    // place the rest.
+ pass->vars = pl_calloc_ptr(pass, sh->vars.num, pass->vars);
+ for (int i = 0; i < sh->vars.num; i++) {
+ if (!add_pass_var(dp, tmp, pass, &params, &sh->vars.elem[i], &pass->vars[i], false))
+ goto error;
+ }
+ for (int i = 0; i < sh->vars.num; i++) {
+ if (!add_pass_var(dp, tmp, pass, &params, &sh->vars.elem[i], &pass->vars[i], true))
+ goto error;
+ }
+
+ // Now that we know the variable placement, finalize pushc/UBO sizes
+ params.push_constants_size = PL_ALIGN2(params.push_constants_size, 4);
+ size_t ubo_size = sh_buf_desc_size(&pass->ubo_desc);
+ if (ubo_size) {
+ pass->ubo_index = sh->descs.num;
+ PL_ARRAY_APPEND(sh, sh->descs, pass->ubo_desc); // don't mangle names
+    }
+
+ // Place and fill in the descriptors
+ const int num_descs = sh->descs.num;
+ int binding[PL_DESC_TYPE_COUNT] = {0};
+ params.num_descriptors = num_descs;
+ params.descriptors = pl_calloc_ptr(tmp, num_descs, params.descriptors);
+ for (int i = 0; i < num_descs; i++) {
+ struct pl_desc *desc = &params.descriptors[i];
+ *desc = sh->descs.elem[i].desc;
+ desc->binding = binding[pl_desc_namespace(dp->gpu, desc->type)]++;
+ }
+
+ // Finalize the shader and look it up in the pass cache
+ pl_str_builder vert_builder = NULL, glsl_builder = NULL;
+ generate_shaders(dp, &gen_params, &vert_builder, &glsl_builder);
+ for (int i = 0; i < dp->passes.num; i++) {
+ struct pass *p = dp->passes.elem[i];
+ if (p->signature != pass->signature)
+ continue;
+
+ // Found existing shader, re-use directly
+ if (p->ubo)
+ sh->descs.elem[p->ubo_index].binding.object = p->ubo;
+ pl_free(p->run_params.constant_data);
+ p->run_params.constant_data = pl_steal(p, constant_data);
+ p->last_index = dp->current_index;
+ pl_free(pass);
+ return p;
+ }
+
+ // Need to compile new shader, execute templates now
+ if (vert_builder) {
+ pl_str vert = pl_str_builder_exec(vert_builder);
+ params.vertex_shader = (char *) vert.buf;
+ }
+ pl_str glsl = pl_str_builder_exec(glsl_builder);
+ params.glsl_shader = (char *) glsl.buf;
+
+ // Turn all shader identifiers into actual strings before passing it
+ // to the `pl_gpu`
+#define FIX_IDENT(name) \
+ name = sh_ident_tostr(sh_ident_unpack(name))
+ for (int i = 0; i < params.num_variables; i++)
+ FIX_IDENT(params.variables[i].name);
+ for (int i = 0; i < params.num_descriptors; i++)
+ FIX_IDENT(params.descriptors[i].name);
+ for (int i = 0; i < params.num_vertex_attribs; i++)
+ FIX_IDENT(params.vertex_attribs[i].name);
+#undef FIX_IDENT
+
+ pass->pass = pl_pass_create(dp->gpu, &params);
+ if (!pass->pass) {
+ PL_ERR(dp, "Failed creating render pass for dispatch");
+        // Add it to the cache anyway, so we don't pointlessly retry (and
+        // fail) compiling the same shader on subsequent dispatches
+ }
+
+ struct pl_pass_run_params *rparams = &pass->run_params;
+ rparams->pass = pass->pass;
+ rparams->constant_data = constant_data;
+ rparams->push_constants = pl_zalloc(pass, params.push_constants_size);
+ rparams->desc_bindings = pl_calloc_ptr(pass, params.num_descriptors,
+ rparams->desc_bindings);
+
+ if (ubo_size && pass->pass) {
+ // Create the UBO
+ pass->ubo = pl_buf_create(dp->gpu, pl_buf_params(
+ .size = ubo_size,
+ .uniform = true,
+ .host_writable = true,
+ ));
+
+ if (!pass->ubo) {
+ PL_ERR(dp, "Failed creating uniform buffer for dispatch");
+ goto error;
+ }
+
+ sh->descs.elem[pass->ubo_index].binding.object = pass->ubo;
+ }
+
+ if (params.type == PL_PASS_RASTER && !vparams) {
+ // Generate the vertex array placeholder
+ rparams->vertex_count = 4; // single quad
+ size_t vert_size = rparams->vertex_count * params.vertex_stride;
+ rparams->vertex_data = pl_zalloc(pass, vert_size);
+ }
+
+ pass->timer = pl_timer_create(dp->gpu);
+
+ PL_ARRAY_APPEND(dp, dp->passes, pass);
+ return pass;
+
+error:
+ pass_destroy(dp, pass);
+ return NULL;
+}
+
+static void update_pass_var(pl_dispatch dp, struct pass *pass,
+ const struct pl_shader_var *sv, struct pass_var *pv)
+{
+ struct pl_var_layout host_layout = pl_var_host_layout(0, &sv->var);
+ pl_assert(host_layout.size);
+
+ // Use the cache to skip updates if possible
+ if (pv->cached_data && !memcmp(sv->data, pv->cached_data, host_layout.size))
+ return;
+ if (!pv->cached_data)
+ pv->cached_data = pl_alloc(pass, host_layout.size);
+ memcpy(pv->cached_data, sv->data, host_layout.size);
+
+ struct pl_pass_run_params *rparams = &pass->run_params;
+ switch (pv->type) {
+ case PASS_VAR_NONE:
+ pl_unreachable();
+ case PASS_VAR_GLOBAL: {
+ struct pl_var_update vu = {
+ .index = pv->index,
+ .data = sv->data,
+ };
+ PL_ARRAY_APPEND_RAW(pass, rparams->var_updates, rparams->num_var_updates, vu);
+ break;
+ }
+ case PASS_VAR_UBO: {
+ pl_assert(pass->ubo);
+ const size_t offset = pv->layout.offset;
+ if (host_layout.stride == pv->layout.stride) {
+ pl_assert(host_layout.size == pv->layout.size);
+ pl_buf_write(dp->gpu, pass->ubo, offset, sv->data, host_layout.size);
+ } else {
+ // Coalesce strided UBO write into a single pl_buf_write to avoid
+ // unnecessary synchronization overhead by assembling the correctly
+ // strided upload in RAM
+ pl_grow(dp, &dp->ubo_tmp, pv->layout.size);
+ uint8_t * const tmp = dp->ubo_tmp;
+ const uint8_t *src = sv->data;
+ const uint8_t *end = src + host_layout.size;
+ uint8_t *dst = tmp;
+ while (src < end) {
+ memcpy(dst, src, host_layout.stride);
+ src += host_layout.stride;
+ dst += pv->layout.stride;
+ }
+ pl_buf_write(dp->gpu, pass->ubo, offset, tmp, pv->layout.size);
+ }
+ break;
+ }
+ case PASS_VAR_PUSHC:
+ pl_assert(rparams->push_constants);
+ memcpy_layout(rparams->push_constants, pv->layout, sv->data, host_layout);
+ break;
+    }
+}
+
+static void compute_vertex_attribs(pl_dispatch dp, pl_shader sh,
+ int width, int height, ident_t *out_scale)
+{
+ // Simulate vertex attributes using global definitions
+ *out_scale = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("out_scale"),
+ .data = &(float[2]){ 1.0 / width, 1.0 / height },
+ .dynamic = true,
+ });
+
+ GLSLP("#define frag_pos(id) (vec2(id) + vec2(0.5)) \n"
+ "#define frag_map(id) ("$" * frag_pos(id)) \n"
+ "#define gl_FragCoord vec4(frag_pos(gl_GlobalInvocationID), 0.0, 1.0) \n",
+ *out_scale);
+
+ for (int n = 0; n < sh->vas.num; n++) {
+ const struct pl_shader_va *sva = &sh->vas.elem[n];
+
+ ident_t points[4];
+ for (int i = 0; i < PL_ARRAY_SIZE(points); i++) {
+ points[i] = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_from_fmt(sva->attr.fmt, "pt"),
+ .data = sva->data[i],
+ });
+ }
+
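+        // Bilinearly interpolate the four corner values based on the
+        // normalized position, emulating what the rasterizer would produce
+        // for a fullscreen quad's vertex attributes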
+ GLSLP("#define "$"_map(id) "
+ "(mix(mix("$", "$", frag_map(id).x), "
+ " mix("$", "$", frag_map(id).x), "
+ "frag_map(id).y)) \n"
+ "#define "$" ("$"_map(gl_GlobalInvocationID)) \n",
+ sh_ident_unpack(sva->attr.name),
+ points[0], points[1], points[2], points[3],
+ sh_ident_unpack(sva->attr.name),
+ sh_ident_unpack(sva->attr.name));
+ }
+}
+
+static void translate_compute_shader(pl_dispatch dp, pl_shader sh,
+ const pl_rect2d *rc,
+ const struct pl_dispatch_params *params)
+{
+ int width = abs(pl_rect_w(*rc)), height = abs(pl_rect_h(*rc));
+ if (sh->transpose)
+ PL_SWAP(width, height);
+ ident_t out_scale;
+ compute_vertex_attribs(dp, sh, width, height, &out_scale);
+
+ // Simulate a framebuffer using storage images
+ pl_assert(params->target->params.storable);
+ pl_assert(sh->output == PL_SHADER_SIG_COLOR);
+ ident_t fbo = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->target,
+ .desc = {
+ .name = "out_image",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = params->blend_params ? PL_DESC_ACCESS_READWRITE
+ : PL_DESC_ACCESS_WRITEONLY,
+ },
+ });
+
+ ident_t base = sh_var(sh, (struct pl_shader_var) {
+ .data = &(int[2]){ rc->x0, rc->y0 },
+ .dynamic = true,
+ .var = {
+ .name = "base",
+ .type = PL_VAR_SINT,
+ .dim_v = 2,
+ .dim_m = 1,
+ .dim_a = 1,
+ },
+ });
+
+ int dx = rc->x0 > rc->x1 ? -1 : 1, dy = rc->y0 > rc->y1 ? -1 : 1;
+ GLSL("ivec2 dir = ivec2(%d, %d);\n", dx, dy); // hard-code, not worth var
+ GLSL("ivec2 pos = "$" + dir * ivec2(gl_GlobalInvocationID).%c%c;\n",
+ base, sh->transpose ? 'y' : 'x', sh->transpose ? 'x' : 'y');
+ GLSL("vec2 fpos = "$" * vec2(gl_GlobalInvocationID);\n", out_scale);
+ GLSL("if (fpos.x < 1.0 && fpos.y < 1.0) {\n");
+ if (params->blend_params) {
+ GLSL("vec4 orig = imageLoad("$", pos);\n", fbo);
+
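+        // Compute shaders have no fixed-function blend stage, so emulate the
+        // requested blend equation manually using the previous contents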
+ static const char *modes[] = {
+ [PL_BLEND_ZERO] = "0.0",
+ [PL_BLEND_ONE] = "1.0",
+ [PL_BLEND_SRC_ALPHA] = "color.a",
+ [PL_BLEND_ONE_MINUS_SRC_ALPHA] = "(1.0 - color.a)",
+ };
+
+ GLSL("color = vec4(color.rgb * vec3(%s), color.a * %s) \n"
+ " + vec4(orig.rgb * vec3(%s), orig.a * %s);\n",
+ modes[params->blend_params->src_rgb],
+ modes[params->blend_params->src_alpha],
+ modes[params->blend_params->dst_rgb],
+ modes[params->blend_params->dst_alpha]);
+ }
+ GLSL("imageStore("$", pos, color);\n", fbo);
+ GLSL("}\n");
+ sh->output = PL_SHADER_SIG_NONE;
+}
+
+static void run_pass(pl_dispatch dp, pl_shader sh, struct pass *pass)
+{
+ pl_shader_info shader = &sh->info->info;
+ pl_pass_run(dp->gpu, &pass->run_params);
+
+ for (uint64_t ts; (ts = pl_timer_query(dp->gpu, pass->timer));) {
+ PL_TRACE(dp, "Spent %.3f ms on shader: %s", ts / 1e6, shader->description);
+
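+        // Insert the new sample into the ring buffer, keeping the running
+        // sum and peak up-to-date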
+ uint64_t old = pass->samples[pass->ts_idx];
+ pass->samples[pass->ts_idx] = ts;
+ pass->ts_last = ts;
+ pass->ts_peak = PL_MAX(pass->ts_peak, ts);
+ pass->ts_sum += ts;
+ pass->ts_idx = (pass->ts_idx + 1) % PL_ARRAY_SIZE(pass->samples);
+
+ if (old) {
+ pass->ts_sum -= old;
+ if (old == pass->ts_peak) {
+ uint64_t new_peak = 0;
+ for (int i = 0; i < PL_ARRAY_SIZE(pass->samples); i++)
+ new_peak = PL_MAX(new_peak, pass->samples[i]);
+ pass->ts_peak = new_peak;
+ }
+ }
+ }
+
+ if (!dp->info_callback)
+ return;
+
+ struct pl_dispatch_info info;
+ info.signature = pass->signature;
+ info.shader = shader;
+
+ // Test to see if the ring buffer already wrapped around once
+ if (pass->samples[pass->ts_idx]) {
+ info.num_samples = PL_ARRAY_SIZE(pass->samples);
+ int num_wrapped = info.num_samples - pass->ts_idx;
+ memcpy(info.samples, &pass->samples[pass->ts_idx],
+ num_wrapped * sizeof(info.samples[0]));
+ memcpy(&info.samples[num_wrapped], pass->samples,
+ pass->ts_idx * sizeof(info.samples[0]));
+ } else {
+ info.num_samples = pass->ts_idx;
+ memcpy(info.samples, pass->samples,
+ pass->ts_idx * sizeof(info.samples[0]));
+ }
+
+ info.last = pass->ts_last;
+ info.peak = pass->ts_peak;
+ info.average = pass->ts_sum / PL_MAX(info.num_samples, 1);
+ dp->info_callback(dp->info_priv, &info);
+}
+
+bool pl_dispatch_finish(pl_dispatch dp, const struct pl_dispatch_params *params)
+{
+ pl_shader sh = *params->shader;
+ bool ret = false;
+ pl_mutex_lock(&dp->lock);
+
+ if (sh->failed) {
+ PL_ERR(sh, "Trying to dispatch a failed shader.");
+ goto error;
+ }
+
+ if (!sh->mutable) {
+ PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+ goto error;
+ }
+
+ if (sh->input != PL_SHADER_SIG_NONE || sh->output != PL_SHADER_SIG_COLOR) {
+ PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+ goto error;
+ }
+
+ const struct pl_tex_params *tpars = &params->target->params;
+ if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
+ PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
+ "texture. The target must be a renderable 2D texture.");
+ goto error;
+ }
+
+ const struct pl_gpu_limits *limits = &dp->gpu->limits;
+ bool can_compute = tpars->storable;
+ if (can_compute && params->blend_params)
+ can_compute = tpars->format->caps & PL_FMT_CAP_READWRITE;
+
+ if (pl_shader_is_compute(sh) && !can_compute) {
+ PL_ERR(dp, "Trying to dispatch using a compute shader with a "
+ "non-storable or incompatible target texture.");
+ goto error;
+ } else if (can_compute && limits->compute_queues > limits->fragment_queues) {
+ if (sh_try_compute(sh, 16, 16, true, 0))
+ PL_TRACE(dp, "Upgrading fragment shader to compute shader.");
+ }
+
+ pl_rect2d rc = params->rect;
+ if (!pl_rect_w(rc)) {
+ rc.x0 = 0;
+ rc.x1 = tpars->w;
+ }
+ if (!pl_rect_h(rc)) {
+ rc.y0 = 0;
+ rc.y1 = tpars->h;
+ }
+
+ int w, h, tw = abs(pl_rect_w(rc)), th = abs(pl_rect_h(rc));
+    if (pl_shader_output_size(sh, &w, &h) && (w != tw || h != th)) {
+ PL_ERR(dp, "Trying to dispatch a shader with explicit output size "
+ "requirements %dx%d%s using a target rect of size %dx%d.",
+ w, h, sh->transpose ? " (transposed)" : "", tw, th);
+ goto error;
+ }
+
+ int vert_idx = -1;
+ const pl_transform2x2 *proj = NULL;
+ if (pl_shader_is_compute(sh)) {
+ // Translate the compute shader to simulate vertices etc.
+ translate_compute_shader(dp, sh, &rc, params);
+ } else {
+ // Add the vertex information encoding the position
+ pl_rect2df vert_rect = {
+ .x0 = 2.0 * rc.x0 / tpars->w - 1.0,
+ .y0 = 2.0 * rc.y0 / tpars->h - 1.0,
+ .x1 = 2.0 * rc.x1 / tpars->w - 1.0,
+ .y1 = 2.0 * rc.y1 / tpars->h - 1.0,
+ };
+
+ if (sh->transpose) {
+ static const pl_transform2x2 transpose_proj = {{{
+ { 0, 1 },
+ { 1, 0 },
+ }}};
+ proj = &transpose_proj;
+ PL_SWAP(vert_rect.x0, vert_rect.y0);
+ PL_SWAP(vert_rect.x1, vert_rect.y1);
+ }
+
+ sh_attr_vec2(sh, "position", &vert_rect);
+ vert_idx = sh->vas.num - 1;
+ }
+
+ // We need to set pl_pass_params.load_target when either blending is
+ // enabled or we're drawing to some scissored sub-rect of the texture
+ pl_rect2d full = { 0, 0, tpars->w, tpars->h };
+ pl_rect2d rc_norm = rc;
+ pl_rect2d_normalize(&rc_norm);
+ rc_norm.x0 = PL_MAX(rc_norm.x0, 0);
+ rc_norm.y0 = PL_MAX(rc_norm.y0, 0);
+ rc_norm.x1 = PL_MIN(rc_norm.x1, tpars->w);
+ rc_norm.y1 = PL_MIN(rc_norm.y1, tpars->h);
+ bool load = params->blend_params || !pl_rect2d_eq(rc_norm, full);
+
+ struct pass *pass = finalize_pass(dp, sh, params->target, vert_idx,
+ params->blend_params, load, NULL, proj);
+
+ // Silently return on failed passes
+ if (!pass || !pass->pass)
+ goto error;
+
+ struct pl_pass_run_params *rparams = &pass->run_params;
+
+ // Update the descriptor bindings
+ for (int i = 0; i < sh->descs.num; i++)
+ rparams->desc_bindings[i] = sh->descs.elem[i].binding;
+
+ // Update all of the variables (if needed)
+ rparams->num_var_updates = 0;
+ for (int i = 0; i < sh->vars.num; i++)
+ update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);
+
+ // Update the vertex data
+ if (rparams->vertex_data) {
+ uintptr_t vert_base = (uintptr_t) rparams->vertex_data;
+ size_t stride = rparams->pass->params.vertex_stride;
+ for (int i = 0; i < sh->vas.num; i++) {
+ const struct pl_shader_va *sva = &sh->vas.elem[i];
+ struct pl_vertex_attrib *va = &rparams->pass->params.vertex_attribs[i];
+
+ size_t size = sva->attr.fmt->texel_size;
+ uintptr_t va_base = vert_base + va->offset; // use placed offset
+ for (int n = 0; n < 4; n++)
+ memcpy((void *) (va_base + n * stride), sva->data[n], size);
+ }
+ }
+
+ // For compute shaders: also update the dispatch dimensions
+ if (pl_shader_is_compute(sh)) {
+ int width = abs(pl_rect_w(rc)),
+ height = abs(pl_rect_h(rc));
+ if (sh->transpose)
+ PL_SWAP(width, height);
+ // Round up to make sure we don't leave off a part of the target
+ int block_w = sh->group_size[0],
+ block_h = sh->group_size[1],
+ num_x = PL_DIV_UP(width, block_w),
+ num_y = PL_DIV_UP(height, block_h);
+
+ rparams->compute_groups[0] = num_x;
+ rparams->compute_groups[1] = num_y;
+ rparams->compute_groups[2] = 1;
+ } else {
+ // Update the scissors for performance
+ rparams->scissors = rc_norm;
+ }
+
+ // Dispatch the actual shader
+ rparams->target = params->target;
+ rparams->timer = PL_DEF(params->timer, pass->timer);
+ run_pass(dp, sh, pass);
+
+ ret = true;
+ // fall through
+
+error:
+ // Reset the temporary buffers which we use to build the shader
+ for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
+ pl_str_builder_reset(dp->tmp[i]);
+
+ pl_mutex_unlock(&dp->lock);
+ pl_dispatch_abort(dp, params->shader);
+ return ret;
+}
+
+bool pl_dispatch_compute(pl_dispatch dp, const struct pl_dispatch_compute_params *params)
+{
+ pl_shader sh = *params->shader;
+ bool ret = false;
+ pl_mutex_lock(&dp->lock);
+
+ if (sh->failed) {
+ PL_ERR(sh, "Trying to dispatch a failed shader.");
+ goto error;
+ }
+
+ if (!sh->mutable) {
+ PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+ goto error;
+ }
+
+ if (sh->input != PL_SHADER_SIG_NONE) {
+ PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+ goto error;
+ }
+
+ if (!pl_shader_is_compute(sh)) {
+ PL_ERR(dp, "Trying to dispatch a non-compute shader using "
+ "`pl_dispatch_compute`!");
+ goto error;
+ }
+
+ if (sh->vas.num) {
+ if (!params->width || !params->height) {
+ PL_ERR(dp, "Trying to dispatch a targetless compute shader that "
+ "uses vertex attributes, this requires specifying the size "
+ "of the effective rendering area!");
+ goto error;
+ }
+
+ compute_vertex_attribs(dp, sh, params->width, params->height,
+ &(ident_t){0});
+ }
+
+ struct pass *pass = finalize_pass(dp, sh, NULL, -1, NULL, false, NULL, NULL);
+
+ // Silently return on failed passes
+ if (!pass || !pass->pass)
+ goto error;
+
+ struct pl_pass_run_params *rparams = &pass->run_params;
+
+ // Update the descriptor bindings
+ for (int i = 0; i < sh->descs.num; i++)
+ rparams->desc_bindings[i] = sh->descs.elem[i].binding;
+
+ // Update all of the variables (if needed)
+ rparams->num_var_updates = 0;
+ for (int i = 0; i < sh->vars.num; i++)
+ update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);
+
+ // Update the dispatch size
+ int groups = 1;
+ for (int i = 0; i < 3; i++) {
+ groups *= params->dispatch_size[i];
+ rparams->compute_groups[i] = params->dispatch_size[i];
+ }
+
+ if (!groups) {
+ pl_assert(params->width && params->height);
+ int block_w = sh->group_size[0],
+ block_h = sh->group_size[1],
+ num_x = PL_DIV_UP(params->width, block_w),
+ num_y = PL_DIV_UP(params->height, block_h);
+
+ rparams->compute_groups[0] = num_x;
+ rparams->compute_groups[1] = num_y;
+ rparams->compute_groups[2] = 1;
+ }
+
+ // Dispatch the actual shader
+ rparams->timer = PL_DEF(params->timer, pass->timer);
+ run_pass(dp, sh, pass);
+
+ ret = true;
+ // fall through
+
+error:
+ // Reset the temporary buffers which we use to build the shader
+ for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
+ pl_str_builder_reset(dp->tmp[i]);
+
+ pl_mutex_unlock(&dp->lock);
+ pl_dispatch_abort(dp, params->shader);
+ return ret;
+}
+
+bool pl_dispatch_vertex(pl_dispatch dp, const struct pl_dispatch_vertex_params *params)
+{
+ pl_shader sh = *params->shader;
+ bool ret = false;
+ pl_mutex_lock(&dp->lock);
+
+ if (sh->failed) {
+ PL_ERR(sh, "Trying to dispatch a failed shader.");
+ goto error;
+ }
+
+ if (!sh->mutable) {
+ PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+ goto error;
+ }
+
+ if (sh->input != PL_SHADER_SIG_NONE || sh->output != PL_SHADER_SIG_COLOR) {
+ PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+ goto error;
+ }
+
+ const struct pl_tex_params *tpars = &params->target->params;
+ if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
+ PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
+ "texture. The target must be a renderable 2D texture.");
+ goto error;
+ }
+
+ if (pl_shader_is_compute(sh)) {
+ PL_ERR(dp, "Trying to dispatch a compute shader using pl_dispatch_vertex.");
+ goto error;
+ }
+
+ if (sh->vas.num) {
+ PL_ERR(dp, "Trying to dispatch a custom vertex shader with already "
+ "attached vertex attributes.");
+ goto error;
+ }
+
+ if (sh->transpose) {
+ PL_ERR(dp, "Trying to dispatch a transposed shader using "
+ "pl_dispatch_vertex, unlikely to be correct. Erroring as a "
+ "safety precaution!");
+ goto error;
+ }
+
+ int pos_idx = params->vertex_position_idx;
+ if (pos_idx < 0 || pos_idx >= params->num_vertex_attribs) {
+ PL_ERR(dp, "Vertex position index out of range?");
+ goto error;
+ }
+
+ // Attach all of the vertex attributes to the shader manually
+ sh->vas.num = params->num_vertex_attribs;
+ PL_ARRAY_RESIZE(sh, sh->vas, sh->vas.num);
+ for (int i = 0; i < params->num_vertex_attribs; i++) {
+ ident_t id = sh_fresh(sh, params->vertex_attribs[i].name);
+ sh->vas.elem[i].attr = params->vertex_attribs[i];
+ sh->vas.elem[i].attr.name = sh_ident_pack(id);
+ GLSLP("#define %s "$"\n", params->vertex_attribs[i].name, id);
+ }
+
+ // Compute the coordinate projection matrix
+ pl_transform2x2 proj = pl_transform2x2_identity;
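+    // Each case falls through, composing the mapping step by step:
+    // absolute pixel coordinates -> relative [0,1] -> normalized [-1,1]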
+ switch (params->vertex_coords) {
+ case PL_COORDS_ABSOLUTE:
+ proj.mat.m[0][0] /= tpars->w;
+ proj.mat.m[1][1] /= tpars->h;
+ // fall through
+ case PL_COORDS_RELATIVE:
+ proj.mat.m[0][0] *= 2.0;
+ proj.mat.m[1][1] *= 2.0;
+ proj.c[0] -= 1.0;
+ proj.c[1] -= 1.0;
+ // fall through
+ case PL_COORDS_NORMALIZED:
+ if (params->vertex_flipped) {
+ proj.mat.m[1][1] = -proj.mat.m[1][1];
+ proj.c[1] += 2.0;
+ }
+ break;
+ }
+
+ struct pass *pass = finalize_pass(dp, sh, params->target, pos_idx,
+ params->blend_params, true, params, &proj);
+
+ // Silently return on failed passes
+ if (!pass || !pass->pass)
+ goto error;
+
+ struct pl_pass_run_params *rparams = &pass->run_params;
+
+ // Update the descriptor bindings
+ for (int i = 0; i < sh->descs.num; i++)
+ rparams->desc_bindings[i] = sh->descs.elem[i].binding;
+
+ // Update all of the variables (if needed)
+ rparams->num_var_updates = 0;
+ for (int i = 0; i < sh->vars.num; i++)
+ update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);
+
+ // Update the scissors
+ rparams->scissors = params->scissors;
+ if (params->vertex_flipped) {
+ rparams->scissors.y0 = tpars->h - rparams->scissors.y0;
+ rparams->scissors.y1 = tpars->h - rparams->scissors.y1;
+ }
+ pl_rect2d_normalize(&rparams->scissors);
+
+ // Dispatch the actual shader
+ rparams->target = params->target;
+ rparams->vertex_count = params->vertex_count;
+ rparams->vertex_data = params->vertex_data;
+ rparams->vertex_buf = params->vertex_buf;
+ rparams->buf_offset = params->buf_offset;
+ rparams->index_data = params->index_data;
+ rparams->index_fmt = params->index_fmt;
+ rparams->index_buf = params->index_buf;
+ rparams->index_offset = params->index_offset;
+ rparams->timer = PL_DEF(params->timer, pass->timer);
+ run_pass(dp, sh, pass);
+
+ ret = true;
+ // fall through
+
+error:
+ // Reset the temporary buffers which we use to build the shader
+ for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
+ pl_str_builder_reset(dp->tmp[i]);
+
+ pl_mutex_unlock(&dp->lock);
+ pl_dispatch_abort(dp, params->shader);
+ return ret;
+}
+
+void pl_dispatch_abort(pl_dispatch dp, pl_shader *psh)
+{
+ pl_shader sh = *psh;
+ if (!sh)
+ return;
+
+ // Free unused memory as early as possible
+ sh_deref(sh);
+
+ // Re-add the shader to the internal pool of shaders
+ pl_mutex_lock(&dp->lock);
+ PL_ARRAY_APPEND(dp, dp->shaders, sh);
+ pl_mutex_unlock(&dp->lock);
+ *psh = NULL;
+}
+
+void pl_dispatch_reset_frame(pl_dispatch dp)
+{
+ pl_mutex_lock(&dp->lock);
+
+ dp->current_ident = 0;
+ dp->current_index++;
+ garbage_collect_passes(dp);
+
+ pl_mutex_unlock(&dp->lock);
+}
+
+size_t pl_dispatch_save(pl_dispatch dp, uint8_t *out)
+{
+ return pl_cache_save(pl_gpu_cache(dp->gpu), out, out ? SIZE_MAX : 0);
+}
+
+void pl_dispatch_load(pl_dispatch dp, const uint8_t *cache)
+{
+ pl_cache_load(pl_gpu_cache(dp->gpu), cache, SIZE_MAX);
+}
diff --git a/src/dispatch.h b/src/dispatch.h
new file mode 100644
index 0000000..66c10f6
--- /dev/null
+++ b/src/dispatch.h
@@ -0,0 +1,31 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Like `pl_dispatch_begin`, but has an extra `unique` parameter. If this is
+// true, the generated shader is given its own unique namespace and may be
+// freely merged with other shaders (`sh_subpass`). Otherwise, all shaders
+// share the same namespace and merging them is an error.
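+//
+// A minimal usage sketch (illustrative only; assumes a valid `dp` and a
+// renderable 2D texture `tex`):
+//
+//     pl_shader sh = pl_dispatch_begin_ex(dp, true);
+//     // ... build the shader using the pl_shader_* / sh_* helpers ...
+//     pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+//         .shader = &sh,
+//         .target = tex,
+//     });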
+pl_shader pl_dispatch_begin_ex(pl_dispatch dp, bool unique);
+
+// Set the `dynamic_constants` field for newly created `pl_shader` objects.
+//
+// This is a private API because it's sort of clunky/stateful.
+void pl_dispatch_mark_dynamic(pl_dispatch dp, bool dynamic);
diff --git a/src/dither.c b/src/dither.c
new file mode 100644
index 0000000..13f68e4
--- /dev/null
+++ b/src/dither.c
@@ -0,0 +1,317 @@
+/*
+ * Generate a noise texture for dithering images.
+ * Copyright © 2013 Wessel Dankers <wsl@fruit.je>
+ *
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * The original code is taken from mpv, under the same license.
+ */
+
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+#include "common.h"
+
+#include <libplacebo/dither.h>
+
+void pl_generate_bayer_matrix(float *data, int size)
+{
+ pl_assert(size >= 0);
+
+ // Start with a single entry of 0
+ data[0] = 0;
+
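+    // (For example, size == 2 yields the classic 2x2 Bayer matrix
+    //  {{0, 0.5}, {0.75, 0.25}}, already scaled to the [0,1) range)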
+ for (int sz = 1; sz < size; sz *= 2) {
+ // Make three copies of the current, appropriately shifted and scaled
+ for (int y = 0; y < sz; y ++) {
+ for (int x = 0; x < sz; x++) {
+ int offsets[] = {0, sz * size + sz, sz, sz * size};
+ int pos = y * size + x;
+
+ for (int i = 1; i < 4; i++)
+ data[pos + offsets[i]] = data[pos] + i / (4.0 * sz * sz);
+ }
+ }
+ }
+}
+
+#define MAX_SIZEB 8
+#define MAX_SIZE (1 << MAX_SIZEB)
+#define MAX_SIZE2 (MAX_SIZE * MAX_SIZE)
+
+typedef uint_fast32_t index_t;
+
+#define WRAP_SIZE2(k, x) ((index_t)((index_t)(x) & ((k)->size2 - 1)))
+#define XY(k, x, y) ((index_t)(((x) | ((y) << (k)->sizeb))))
+
+struct ctx {
+ unsigned int sizeb, size, size2;
+ unsigned int gauss_radius;
+ unsigned int gauss_middle;
+ uint64_t gauss[MAX_SIZE2];
+ index_t randomat[MAX_SIZE2];
+ bool calcmat[MAX_SIZE2];
+ uint64_t gaussmat[MAX_SIZE2];
+ index_t unimat[MAX_SIZE2];
+};
+
+static void makegauss(struct ctx *k, unsigned int sizeb)
+{
+ pl_assert(sizeb >= 1 && sizeb <= MAX_SIZEB);
+
+ k->sizeb = sizeb;
+ k->size = 1 << k->sizeb;
+ k->size2 = k->size * k->size;
+
+ k->gauss_radius = k->size / 2 - 1;
+ k->gauss_middle = XY(k, k->gauss_radius, k->gauss_radius);
+
+ unsigned int gauss_size = k->gauss_radius * 2 + 1;
+ unsigned int gauss_size2 = gauss_size * gauss_size;
+
+ for (index_t c = 0; c < k->size2; c++)
+ k->gauss[c] = 0;
+
+ double sigma = -log(1.5 / (double) UINT64_MAX * gauss_size2) / k->gauss_radius;
+
+ for (index_t gy = 0; gy <= k->gauss_radius; gy++) {
+ for (index_t gx = 0; gx <= gy; gx++) {
+ int cx = (int)gx - k->gauss_radius;
+ int cy = (int)gy - k->gauss_radius;
+ int sq = cx * cx + cy * cy;
+ double e = exp(-sqrt(sq) * sigma);
+ uint64_t v = e / gauss_size2 * (double) UINT64_MAX;
+ k->gauss[XY(k, gx, gy)] =
+ k->gauss[XY(k, gy, gx)] =
+ k->gauss[XY(k, gx, gauss_size - 1 - gy)] =
+ k->gauss[XY(k, gy, gauss_size - 1 - gx)] =
+ k->gauss[XY(k, gauss_size - 1 - gx, gy)] =
+ k->gauss[XY(k, gauss_size - 1 - gy, gx)] =
+ k->gauss[XY(k, gauss_size - 1 - gx, gauss_size - 1 - gy)] =
+ k->gauss[XY(k, gauss_size - 1 - gy, gauss_size - 1 - gx)] = v;
+ }
+ }
+
+#ifndef NDEBUG
+ uint64_t total = 0;
+ for (index_t c = 0; c < k->size2; c++) {
+ uint64_t oldtotal = total;
+ total += k->gauss[c];
+ assert(total >= oldtotal);
+ }
+#endif
+}
+
+static void setbit(struct ctx *k, index_t c)
+{
+ if (k->calcmat[c])
+ return;
+ k->calcmat[c] = true;
+ uint64_t *m = k->gaussmat;
+ uint64_t *me = k->gaussmat + k->size2;
+ uint64_t *g = k->gauss + WRAP_SIZE2(k, k->gauss_middle + k->size2 - c);
+ uint64_t *ge = k->gauss + k->size2;
+ while (g < ge)
+ *m++ += *g++;
+ g = k->gauss;
+ while (m < me)
+ *m++ += *g++;
+}
+
+static index_t getmin(struct ctx *k)
+{
+ uint64_t min = UINT64_MAX;
+ index_t resnum = 0;
+ unsigned int size2 = k->size2;
+ for (index_t c = 0; c < size2; c++) {
+ if (k->calcmat[c])
+ continue;
+ uint64_t total = k->gaussmat[c];
+ if (total <= min) {
+ if (total != min) {
+ min = total;
+ resnum = 0;
+ }
+ k->randomat[resnum++] = c;
+ }
+ }
+ assert(resnum > 0);
+ if (resnum == 1)
+ return k->randomat[0];
+ if (resnum == size2)
+ return size2 / 2;
+ return k->randomat[rand() % resnum];
+}
+
+static void makeuniform(struct ctx *k)
+{
+ unsigned int size2 = k->size2;
+ for (index_t c = 0; c < size2; c++) {
+ index_t r = getmin(k);
+ setbit(k, r);
+ k->unimat[r] = c;
+ }
+}
+
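+// Generate the blue noise matrix with a void-and-cluster style approach:
+// repeatedly pick the free cell with the lowest accumulated (toroidally
+// wrapped) gaussian energy, assign it the next rank, and add a gaussian
+// centered on it; finally, normalize all ranks to the range [0,1).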
+void pl_generate_blue_noise(float *data, int size)
+{
+ pl_assert(size > 0);
+ int shift = PL_LOG2(size);
+
+ pl_assert((1 << shift) == size);
+ struct ctx *k = pl_zalloc_ptr(NULL, k);
+ makegauss(k, shift);
+ makeuniform(k);
+ float invscale = k->size2;
+    for (index_t y = 0; y < k->size; y++) {
+        for (index_t x = 0; x < k->size; x++)
+ data[x + y * k->size] = k->unimat[XY(k, x, y)] / invscale;
+ }
+ pl_free(k);
+}
+
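+// Error diffusion kernels. The weight diffused to the pixel `y` rows below
+// and `(x - 2)` columns to the right of the current pixel is given by
+// pattern[y][x] / divisor; `shift` relates to the computational cost of the
+// kernel (higher values are more expensive to evaluate in a shader).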
+const struct pl_error_diffusion_kernel pl_error_diffusion_simple = {
+ .name = "simple",
+ .description = "Simple error diffusion",
+ .shift = 1,
+ .pattern = {{0, 0, 0, 1, 0},
+ {0, 0, 1, 0, 0},
+ {0, 0, 0, 0, 0}},
+ .divisor = 2,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_false_fs = {
+ .name = "false-fs",
+ .description = "False Floyd-Steinberg kernel",
+ .shift = 1,
+ .pattern = {{0, 0, 0, 3, 0},
+ {0, 0, 3, 2, 0},
+ {0, 0, 0, 0, 0}},
+ .divisor = 8,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_sierra_lite = {
+ .name = "sierra-lite",
+ .description = "Sierra Lite kernel",
+ .shift = 2,
+ .pattern = {{0, 0, 0, 2, 0},
+ {0, 1, 1, 0, 0},
+ {0, 0, 0, 0, 0}},
+ .divisor = 4,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_floyd_steinberg = {
+ .name = "floyd-steinberg",
+ .description = "Floyd Steinberg kernel",
+ .shift = 2,
+ .pattern = {{0, 0, 0, 7, 0},
+ {0, 3, 5, 1, 0},
+ {0, 0, 0, 0, 0}},
+ .divisor = 16,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_atkinson = {
+ .name = "atkinson",
+ .description = "Atkinson kernel",
+ .shift = 2,
+ .pattern = {{0, 0, 0, 1, 1},
+ {0, 1, 1, 1, 0},
+ {0, 0, 1, 0, 0}},
+ .divisor = 8,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_jarvis_judice_ninke = {
+ .name = "jarvis-judice-ninke",
+ .description = "Jarvis, Judice & Ninke kernel",
+ .shift = 3,
+ .pattern = {{0, 0, 0, 7, 5},
+ {3, 5, 7, 5, 3},
+ {1, 3, 5, 3, 1}},
+ .divisor = 48,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_stucki = {
+ .name = "stucki",
+ .description = "Stucki kernel",
+ .shift = 3,
+ .pattern = {{0, 0, 0, 8, 4},
+ {2, 4, 8, 4, 2},
+ {1, 2, 4, 2, 1}},
+ .divisor = 42,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_burkes = {
+ .name = "burkes",
+ .description = "Burkes kernel",
+ .shift = 3,
+ .pattern = {{0, 0, 0, 8, 4},
+ {2, 4, 8, 4, 2},
+ {0, 0, 0, 0, 0}},
+ .divisor = 32,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_sierra2 = {
+ .name = "sierra-2",
+ .description = "Two-row Sierra",
+ .shift = 3,
+ .pattern = {{0, 0, 0, 4, 3},
+ {1, 2, 3, 2, 1},
+ {0, 0, 0, 0, 0}},
+ .divisor = 16,
+};
+
+const struct pl_error_diffusion_kernel pl_error_diffusion_sierra3 = {
+ .name = "sierra-3",
+ .description = "Three-row Sierra",
+ .shift = 3,
+ .pattern = {{0, 0, 0, 5, 3},
+ {2, 4, 5, 4, 2},
+ {0, 2, 3, 2, 0}},
+ .divisor = 32,
+};
+
+const struct pl_error_diffusion_kernel * const pl_error_diffusion_kernels[] = {
+ &pl_error_diffusion_simple,
+ &pl_error_diffusion_false_fs,
+ &pl_error_diffusion_sierra_lite,
+ &pl_error_diffusion_floyd_steinberg,
+ &pl_error_diffusion_atkinson,
+ &pl_error_diffusion_jarvis_judice_ninke,
+ &pl_error_diffusion_stucki,
+ &pl_error_diffusion_burkes,
+ &pl_error_diffusion_sierra2,
+ &pl_error_diffusion_sierra3,
+ NULL
+};
+
+const int pl_num_error_diffusion_kernels = PL_ARRAY_SIZE(pl_error_diffusion_kernels) - 1;
+
+// Find the error diffusion kernel with the given name, or NULL on failure.
+const struct pl_error_diffusion_kernel *pl_find_error_diffusion_kernel(const char *name)
+{
+ for (int i = 0; i < pl_num_error_diffusion_kernels; i++) {
+ if (strcmp(name, pl_error_diffusion_kernels[i]->name) == 0)
+ return pl_error_diffusion_kernels[i];
+ }
+
+ return NULL;
+}
diff --git a/src/dummy.c b/src/dummy.c
new file mode 100644
index 0000000..cd80080
--- /dev/null
+++ b/src/dummy.c
@@ -0,0 +1,348 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <limits.h>
+#include <string.h>
+
+#include "gpu.h"
+
+#include <libplacebo/dummy.h>
+
+const struct pl_gpu_dummy_params pl_gpu_dummy_default_params = { PL_GPU_DUMMY_DEFAULTS };
+static const struct pl_gpu_fns pl_fns_dummy;
+
+struct priv {
+ struct pl_gpu_fns impl;
+ struct pl_gpu_dummy_params params;
+};
+
+pl_gpu pl_gpu_dummy_create(pl_log log, const struct pl_gpu_dummy_params *params)
+{
+ params = PL_DEF(params, &pl_gpu_dummy_default_params);
+
+ struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct priv);
+ gpu->log = log;
+ gpu->glsl = params->glsl;
+ gpu->limits = params->limits;
+
+ struct priv *p = PL_PRIV(gpu);
+ p->impl = pl_fns_dummy;
+ p->params = *params;
+
+ // Forcibly override these, because we know for sure what the values are
+ gpu->limits.align_tex_xfer_pitch = 1;
+ gpu->limits.align_tex_xfer_offset = 1;
+ gpu->limits.align_vertex_stride = 1;
+
+ // Set up the dummy formats, add one for each possible format type that we
+ // can represent on the host
+ PL_ARRAY(pl_fmt) formats = {0};
+ for (enum pl_fmt_type type = 1; type < PL_FMT_TYPE_COUNT; type++) {
+ for (int comps = 1; comps <= 4; comps++) {
+ for (int depth = 8; depth < 128; depth *= 2) {
+ if (type == PL_FMT_FLOAT && depth < 16)
+ continue;
+
+ static const char *cnames[] = {
+ [1] = "r",
+ [2] = "rg",
+ [3] = "rgb",
+ [4] = "rgba",
+ };
+
+ static const char *tnames[] = {
+ [PL_FMT_UNORM] = "",
+ [PL_FMT_SNORM] = "s",
+ [PL_FMT_UINT] = "u",
+ [PL_FMT_SINT] = "i",
+ [PL_FMT_FLOAT] = "f",
+ };
+
+ const char *tname = tnames[type];
+ if (type == PL_FMT_FLOAT && depth == 16)
+ tname = "hf";
+
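+                // This yields format names like "r8", "rg16u" or "rgba32f"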
+ struct pl_fmt_t *fmt = pl_alloc_ptr(gpu, fmt);
+ *fmt = (struct pl_fmt_t) {
+ .name = pl_asprintf(fmt, "%s%d%s", cnames[comps], depth, tname),
+ .type = type,
+ .num_components = comps,
+ .opaque = false,
+ .gatherable = true,
+ .internal_size = comps * depth / 8,
+ .texel_size = comps * depth / 8,
+ .texel_align = 1,
+ .caps = PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR |
+ PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLENDABLE |
+ PL_FMT_CAP_VERTEX | PL_FMT_CAP_HOST_READABLE,
+ };
+
+ for (int i = 0; i < comps; i++) {
+ fmt->component_depth[i] = depth;
+ fmt->host_bits[i] = depth;
+ fmt->sample_order[i] = i;
+ }
+
+ if (gpu->glsl.compute)
+ fmt->caps |= PL_FMT_CAP_STORABLE;
+ if (gpu->limits.max_buffer_texels && gpu->limits.max_ubo_size)
+ fmt->caps |= PL_FMT_CAP_TEXEL_UNIFORM;
+ if (gpu->limits.max_buffer_texels && gpu->limits.max_ssbo_size)
+ fmt->caps |= PL_FMT_CAP_TEXEL_STORAGE;
+
+ fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, ""));
+ fmt->glsl_format = pl_fmt_glsl_format(fmt, comps);
+ fmt->fourcc = pl_fmt_fourcc(fmt);
+ if (!fmt->glsl_format)
+ fmt->caps &= ~(PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE);
+ PL_ARRAY_APPEND(gpu, formats, fmt);
+ }
+ }
+ }
+
+ gpu->formats = formats.elem;
+ gpu->num_formats = formats.num;
+ return pl_gpu_finalize(gpu);
+}
+
+static void dumb_destroy(pl_gpu gpu)
+{
+ pl_free((void *) gpu);
+}
+
+void pl_gpu_dummy_destroy(pl_gpu *gpu)
+{
+ pl_gpu_destroy(*gpu);
+ *gpu = NULL;
+}
+
+struct buf_priv {
+ uint8_t *data;
+};
+
+static pl_buf dumb_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+ struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct buf_priv);
+ buf->params = *params;
+ buf->params.initial_data = NULL;
+
+ struct buf_priv *p = PL_PRIV(buf);
+ p->data = malloc(params->size);
+ if (!p->data) {
+ PL_ERR(gpu, "Failed allocating memory for dummy buffer!");
+ pl_free(buf);
+ return NULL;
+ }
+
+ if (params->initial_data)
+ memcpy(p->data, params->initial_data, params->size);
+ if (params->host_mapped)
+ buf->data = p->data;
+
+ return buf;
+}
+
+static void dumb_buf_destroy(pl_gpu gpu, pl_buf buf)
+{
+ struct buf_priv *p = PL_PRIV(buf);
+ free(p->data);
+ pl_free((void *) buf);
+}
+
+uint8_t *pl_buf_dummy_data(pl_buf buf)
+{
+ struct buf_priv *p = PL_PRIV(buf);
+ return p->data;
+}
+
+static void dumb_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset,
+ const void *data, size_t size)
+{
+ struct buf_priv *p = PL_PRIV(buf);
+ memcpy(p->data + buf_offset, data, size);
+}
+
+static bool dumb_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset,
+ void *dest, size_t size)
+{
+ struct buf_priv *p = PL_PRIV(buf);
+ memcpy(dest, p->data + buf_offset, size);
+ return true;
+}
+
+static void dumb_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size)
+{
+ struct buf_priv *dstp = PL_PRIV(dst);
+ struct buf_priv *srcp = PL_PRIV(src);
+ memcpy(dstp->data + dst_offset, srcp->data + src_offset, size);
+}
+
+struct tex_priv {
+ void *data;
+};
+
+static size_t tex_size(pl_gpu gpu, pl_tex tex)
+{
+ size_t size = tex->params.format->texel_size * tex->params.w;
+ size *= PL_DEF(tex->params.h, 1);
+ size *= PL_DEF(tex->params.d, 1);
+ return size;
+}
+
+static pl_tex dumb_tex_create(pl_gpu gpu, const struct pl_tex_params *params)
+{
+    struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct tex_priv);
+ tex->params = *params;
+ tex->params.initial_data = NULL;
+
+ struct tex_priv *p = PL_PRIV(tex);
+ p->data = malloc(tex_size(gpu, tex));
+ if (!p->data) {
+ PL_ERR(gpu, "Failed allocating memory for dummy texture!");
+ pl_free(tex);
+ return NULL;
+ }
+
+ if (params->initial_data)
+ memcpy(p->data, params->initial_data, tex_size(gpu, tex));
+
+ return tex;
+}
+
+pl_tex pl_tex_dummy_create(pl_gpu gpu, const struct pl_tex_dummy_params *params)
+{
+ // Only do minimal sanity checking, since this is just a dummy texture
+ pl_assert(params->format && params->w >= 0 && params->h >= 0 && params->d >= 0);
+
+ struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct tex_priv);
+ tex->sampler_type = params->sampler_type;
+ tex->params = (struct pl_tex_params) {
+ .w = params->w,
+ .h = params->h,
+ .d = params->d,
+ .format = params->format,
+ .sampleable = true,
+ .user_data = params->user_data,
+ };
+
+ return tex;
+}
+
+static void dumb_tex_destroy(pl_gpu gpu, pl_tex tex)
+{
+ struct tex_priv *p = PL_PRIV(tex);
+ if (p->data)
+ free(p->data);
+ pl_free((void *) tex);
+}
+
+uint8_t *pl_tex_dummy_data(pl_tex tex)
+{
+ struct tex_priv *p = PL_PRIV(tex);
+ return p->data;
+}
+
+static bool dumb_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ pl_tex tex = params->tex;
+ struct tex_priv *p = PL_PRIV(tex);
+ pl_assert(p->data);
+
+ const uint8_t *src = params->ptr;
+ uint8_t *dst = p->data;
+ if (params->buf) {
+ struct buf_priv *bufp = PL_PRIV(params->buf);
+ src = (uint8_t *) bufp->data + params->buf_offset;
+ }
+
+ size_t texel_size = tex->params.format->texel_size;
+ size_t row_size = pl_rect_w(params->rc) * texel_size;
+ for (int z = params->rc.z0; z < params->rc.z1; z++) {
+ size_t src_plane = z * params->depth_pitch;
+ size_t dst_plane = z * tex->params.h * tex->params.w * texel_size;
+ for (int y = params->rc.y0; y < params->rc.y1; y++) {
+ size_t src_row = src_plane + y * params->row_pitch;
+ size_t dst_row = dst_plane + y * tex->params.w * texel_size;
+ size_t pos = params->rc.x0 * texel_size;
+ memcpy(&dst[dst_row + pos], &src[src_row + pos], row_size);
+ }
+ }
+
+ return true;
+}
+
+static bool dumb_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ pl_tex tex = params->tex;
+ struct tex_priv *p = PL_PRIV(tex);
+ pl_assert(p->data);
+
+ const uint8_t *src = p->data;
+ uint8_t *dst = params->ptr;
+ if (params->buf) {
+ struct buf_priv *bufp = PL_PRIV(params->buf);
+ dst = (uint8_t *) bufp->data + params->buf_offset;
+ }
+
+ size_t texel_size = tex->params.format->texel_size;
+ size_t row_size = pl_rect_w(params->rc) * texel_size;
+ for (int z = params->rc.z0; z < params->rc.z1; z++) {
+ size_t src_plane = z * tex->params.h * tex->params.w * texel_size;
+ size_t dst_plane = z * params->depth_pitch;
+ for (int y = params->rc.y0; y < params->rc.y1; y++) {
+ size_t src_row = src_plane + y * tex->params.w * texel_size;
+ size_t dst_row = dst_plane + y * params->row_pitch;
+ size_t pos = params->rc.x0 * texel_size;
+ memcpy(&dst[dst_row + pos], &src[src_row + pos], row_size);
+ }
+ }
+
+ return true;
+}
+
+static int dumb_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+ return 0; // safest behavior: never alias bindings
+}
+
+static pl_pass dumb_pass_create(pl_gpu gpu, const struct pl_pass_params *params)
+{
+ PL_ERR(gpu, "Creating render passes is not supported for dummy GPUs");
+ return NULL;
+}
+
+static void dumb_gpu_finish(pl_gpu gpu)
+{
+ // no-op
+}
+
+static const struct pl_gpu_fns pl_fns_dummy = {
+ .destroy = dumb_destroy,
+ .buf_create = dumb_buf_create,
+ .buf_destroy = dumb_buf_destroy,
+ .buf_write = dumb_buf_write,
+ .buf_read = dumb_buf_read,
+ .buf_copy = dumb_buf_copy,
+ .tex_create = dumb_tex_create,
+ .tex_destroy = dumb_tex_destroy,
+ .tex_upload = dumb_tex_upload,
+ .tex_download = dumb_tex_download,
+ .desc_namespace = dumb_desc_namespace,
+ .pass_create = dumb_pass_create,
+ .gpu_finish = dumb_gpu_finish,
+};
diff --git a/src/filters.c b/src/filters.c
new file mode 100644
index 0000000..cc4871f
--- /dev/null
+++ b/src/filters.c
@@ -0,0 +1,1015 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Some of the filter code originally derives (via mpv) from Glumpy:
+ * # Copyright (c) 2009-2016 Nicolas P. Rougier. All rights reserved.
+ * # Distributed under the (new) BSD License.
+ * (https://github.com/glumpy/glumpy/blob/master/glumpy/library/build-spatial-filters.py)
+ *
+ * The math underlying each filter function was written from scratch, with
+ * some algorithms coming from a number of different sources, including:
+ * - https://en.wikipedia.org/wiki/Window_function
+ * - https://en.wikipedia.org/wiki/Jinc
+ * - http://vector-agg.cvs.sourceforge.net/viewvc/vector-agg/agg-2.5/include/agg_image_filters.h
+ * - Vapoursynth plugin fmtconv (WTFPL Licensed), which is based on
+ * dither plugin for avisynth from the same author:
+ * https://github.com/vapoursynth/fmtconv/tree/master/src/fmtc
+ * - Paul Heckbert's "zoom"
+ * - XBMC: ConvolutionKernels.cpp etc.
+ * - https://github.com/AviSynth/jinc-resize (only used to verify the math)
+ */
+
+#include <math.h>
+
+#include "common.h"
+#include "filters.h"
+#include "log.h"
+
+#ifdef PL_HAVE_WIN32
+#define j1 _j1
+#endif
+
+bool pl_filter_function_eq(const struct pl_filter_function *a,
+ const struct pl_filter_function *b)
+{
+ return (a ? a->weight : NULL) == (b ? b->weight : NULL);
+}
+
+bool pl_filter_config_eq(const struct pl_filter_config *a,
+ const struct pl_filter_config *b)
+{
+ if (!a || !b)
+ return a == b;
+
+ bool eq = pl_filter_function_eq(a->kernel, b->kernel) &&
+ pl_filter_function_eq(a->window, b->window) &&
+ a->radius == b->radius &&
+ a->clamp == b->clamp &&
+ a->blur == b->blur &&
+ a->taper == b->taper &&
+ a->polar == b->polar &&
+ a->antiring == b->antiring;
+
+ for (int i = 0; i < PL_FILTER_MAX_PARAMS; i++) {
+ if (a->kernel->tunable[i])
+ eq &= a->params[i] == b->params[i];
+ if (a->window && a->window->tunable[i])
+ eq &= a->wparams[i] == b->wparams[i];
+ }
+
+ return eq;
+}
+
+double pl_filter_sample(const struct pl_filter_config *c, double x)
+{
+ const float radius = pl_filter_radius_bound(c);
+
+ // All filters are symmetric, and in particular only need to be defined
+ // for [0, radius].
+ x = fabs(x);
+
+ // Return early for values outside of the kernel radius, since the functions
+ // are not necessarily valid outside of this interval. No such check is
+ // needed for the window, because it's always stretched to fit.
+ if (x > radius)
+ return 0.0;
+
+ // Apply the blur and taper coefficients as needed
+ double kx = x <= c->taper ? 0.0 : (x - c->taper) / (1.0 - c->taper / radius);
+ if (c->blur > 0.0)
+ kx /= c->blur;
+
+ pl_assert(!c->kernel->opaque);
+ double k = c->kernel->weight(&(const struct pl_filter_ctx) {
+ .radius = radius,
+ .params = {
+ c->kernel->tunable[0] ? c->params[0] : c->kernel->params[0],
+ c->kernel->tunable[1] ? c->params[1] : c->kernel->params[1],
+ },
+ }, kx);
+
+ // Apply the optional windowing function
+ if (c->window) {
+ pl_assert(!c->window->opaque);
+ double wx = x / radius * c->window->radius;
+ k *= c->window->weight(&(struct pl_filter_ctx) {
+ .radius = c->window->radius,
+ .params = {
+ c->window->tunable[0] ? c->wparams[0] : c->window->params[0],
+ c->window->tunable[1] ? c->wparams[1] : c->window->params[1],
+ },
+ }, wx);
+ }
+
+ return k < 0 ? (1 - c->clamp) * k : k;
+}
+
+static void filter_cutoffs(const struct pl_filter_config *c, float cutoff,
+ float *out_radius, float *out_radius_zero)
+{
+ const float bound = pl_filter_radius_bound(c);
+ float prev = 0.0, fprev = pl_filter_sample(c, prev);
+ bool found_root = false;
+
+ const float step = 1e-2f;
+ for (float x = 0.0; x < bound + step; x += step) {
+ float fx = pl_filter_sample(c, x);
+ if ((fprev > cutoff && fx <= cutoff) || (fprev < -cutoff && fx >= -cutoff)) {
+ // Found zero crossing
+ float root = x - fx * (x - prev) / (fx - fprev); // secant method
+ root = fminf(root, bound);
+ *out_radius = root;
+ if (!found_root) // first root
+ *out_radius_zero = root;
+ found_root = true;
+ }
+ prev = x;
+ fprev = fx;
+ }
+
+ if (!found_root)
+ *out_radius_zero = *out_radius = bound;
+}
+
+// Compute a single row of weights for a given filter in one dimension, indexed
+// by the indicated subpixel offset. Writes `f->row_size` values to `out`.
+static void compute_row(struct pl_filter_t *f, double offset, float *out)
+{
+ double wsum = 0.0;
+ for (int i = 0; i < f->row_size; i++) {
+ // For the example of a filter with row size 4 and offset 0.3, we have:
+ //
+ // 0 1 * 2 3
+ //
+ // * indicates the sampled position. What we want to compute is the
+ // distance from each index to that sampled position.
+ pl_assert(f->row_size % 2 == 0);
+ const int base = f->row_size / 2 - 1; // index to the left of the center
+ const double center = base + offset; // offset of center relative to idx 0
+ double w = pl_filter_sample(&f->params.config, i - center);
+ out[i] = w;
+ wsum += w;
+ }
+
+ // Readjust weights to preserve energy
+ pl_assert(wsum > 0);
+ for (int i = 0; i < f->row_size; i++)
+ out[i] /= wsum;
+}
+
+// Needed for backwards compatibility with v1 configuration API
+static struct pl_filter_function *dupfilter(void *alloc,
+ const struct pl_filter_function *f)
+{
+ return f ? pl_memdup(alloc, (void *)f, sizeof(*f)) : NULL;
+}
+
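+// Rough usage sketch (illustrative only; assumes the caller already owns a
+// `pl_log` handle named `log`): generating a separable Lanczos LUT with 64
+// phase steps might look like this:
+//
+//     pl_filter flt = pl_filter_generate(log, &(struct pl_filter_params) {
+//         .config      = pl_filter_lanczos,
+//         .lut_entries = 64,
+//     });
+//     if (flt) {
+//         // for separable filters, flt->weights holds `lut_entries` rows of
+//         // `row_stride` floats each
+//         pl_filter_free(&flt);
+//     }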
+pl_filter pl_filter_generate(pl_log log, const struct pl_filter_params *params)
+{
+ pl_assert(params);
+ if (params->lut_entries <= 0 || !params->config.kernel) {
+ pl_fatal(log, "Invalid params: missing lut_entries or config.kernel");
+ return NULL;
+ }
+
+ if (params->config.kernel->opaque) {
+ pl_err(log, "Trying to use opaque kernel '%s' in non-opaque context!",
+ params->config.kernel->name);
+ return NULL;
+ }
+
+ if (params->config.window && params->config.window->opaque) {
+ pl_err(log, "Trying to use opaque window '%s' in non-opaque context!",
+ params->config.window->name);
+ return NULL;
+ }
+
+ struct pl_filter_t *f = pl_zalloc_ptr(NULL, f);
+ f->params = *params;
+ f->params.config.kernel = dupfilter(f, params->config.kernel);
+ f->params.config.window = dupfilter(f, params->config.window);
+
+ // Compute main lobe and total filter size
+ filter_cutoffs(&params->config, params->cutoff, &f->radius, &f->radius_zero);
+ f->radius_cutoff = f->radius; // backwards compatibility
+
+ float *weights;
+ if (params->config.polar) {
+ // Compute a 1D array indexed by radius
+ weights = pl_alloc(f, params->lut_entries * sizeof(float));
+ for (int i = 0; i < params->lut_entries; i++) {
+ double x = f->radius * i / (params->lut_entries - 1);
+ weights[i] = pl_filter_sample(&params->config, x);
+ }
+ } else {
+ // Pick the most appropriate row size
+ f->row_size = ceilf(f->radius) * 2;
+ if (params->max_row_size && f->row_size > params->max_row_size) {
+ pl_info(log, "Required filter size %d exceeds the maximum allowed "
+ "size of %d. This may result in adverse effects (aliasing, "
+ "or moiré artifacts).", f->row_size, params->max_row_size);
+ f->row_size = params->max_row_size;
+ f->insufficient = true;
+ }
+ f->row_stride = PL_ALIGN(f->row_size, params->row_stride_align);
+
+ // Compute a 2D array indexed by the subpixel position
+ weights = pl_calloc(f, params->lut_entries * f->row_stride, sizeof(float));
+ for (int i = 0; i < params->lut_entries; i++) {
+ compute_row(f, i / (double)(params->lut_entries - 1),
+ weights + f->row_stride * i);
+ }
+ }
+
+ f->weights = weights;
+ return f;
+}
+
+void pl_filter_free(pl_filter *filter)
+{
+ pl_free_ptr((void **) filter);
+}
+
+// Built-in filter functions
+
+static double box(const struct pl_filter_ctx *f, double x)
+{
+ return 1.0;
+}
+
+const struct pl_filter_function pl_filter_function_box = {
+ .name = "box",
+ .weight = box,
+ .radius = 1.0,
+ .resizable = true,
+};
+
+static const struct pl_filter_function filter_function_dirichlet = {
+ .name = "dirichlet", // alias
+ .weight = box,
+ .radius = 1.0,
+ .resizable = true,
+};
+
+static double triangle(const struct pl_filter_ctx *f, double x)
+{
+ return 1.0 - x / f->radius;
+}
+
+const struct pl_filter_function pl_filter_function_triangle = {
+ .name = "triangle",
+ .weight = triangle,
+ .radius = 1.0,
+ .resizable = true,
+};
+
+static double cosine(const struct pl_filter_ctx *f, double x)
+{
+ return cos(x);
+}
+
+const struct pl_filter_function pl_filter_function_cosine = {
+ .name = "cosine",
+ .weight = cosine,
+ .radius = M_PI / 2.0,
+};
+
+static double hann(const struct pl_filter_ctx *f, double x)
+{
+ return 0.5 + 0.5 * cos(M_PI * x);
+}
+
+const struct pl_filter_function pl_filter_function_hann = {
+ .name = "hann",
+ .weight = hann,
+ .radius = 1.0,
+};
+
+static const struct pl_filter_function filter_function_hanning = {
+ .name = "hanning", // alias
+ .weight = hann,
+ .radius = 1.0,
+};
+
+static double hamming(const struct pl_filter_ctx *f, double x)
+{
+ return 0.54 + 0.46 * cos(M_PI * x);
+}
+
+const struct pl_filter_function pl_filter_function_hamming = {
+ .name = "hamming",
+ .weight = hamming,
+ .radius = 1.0,
+};
+
+static double welch(const struct pl_filter_ctx *f, double x)
+{
+ return 1.0 - x * x;
+}
+
+const struct pl_filter_function pl_filter_function_welch = {
+ .name = "welch",
+ .weight = welch,
+ .radius = 1.0,
+};
+
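+// Power series approximation of I0(x), the zeroth-order modified Bessel
+// function of the first kind (needed for the Kaiser window)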
+static double bessel_i0(double x)
+{
+ double s = 1.0;
+ double y = x * x / 4.0;
+ double t = y;
+ int i = 2;
+ while (t > 1e-12) {
+ s += t;
+ t *= y / (i * i);
+ i += 1;
+ }
+ return s;
+}
+
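+// Kaiser window; params[0] is the shape parameter (beta), normalized so that
+// kaiser(0) == 1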
+static double kaiser(const struct pl_filter_ctx *f, double x)
+{
+ double alpha = fmax(f->params[0], 0.0);
+ double scale = bessel_i0(alpha);
+ return bessel_i0(alpha * sqrt(1.0 - x * x)) / scale;
+}
+
+const struct pl_filter_function pl_filter_function_kaiser = {
+ .name = "kaiser",
+ .weight = kaiser,
+ .radius = 1.0,
+ .params = {2.0},
+ .tunable = {true},
+};
+
+static double blackman(const struct pl_filter_ctx *f, double x)
+{
+ double a = f->params[0];
+ double a0 = (1 - a) / 2.0, a1 = 1 / 2.0, a2 = a / 2.0;
+ x *= M_PI;
+ return a0 + a1 * cos(x) + a2 * cos(2 * x);
+}
+
+const struct pl_filter_function pl_filter_function_blackman = {
+ .name = "blackman",
+ .weight = blackman,
+ .radius = 1.0,
+ .params = {0.16},
+ .tunable = {true},
+};
+
+static double bohman(const struct pl_filter_ctx *f, double x)
+{
+ double pix = M_PI * x;
+ return (1.0 - x) * cos(pix) + sin(pix) / M_PI;
+}
+
+const struct pl_filter_function pl_filter_function_bohman = {
+ .name = "bohman",
+ .weight = bohman,
+ .radius = 1.0,
+};
+
+static double gaussian(const struct pl_filter_ctx *f, double x)
+{
+ return exp(-2.0 * x * x / f->params[0]);
+}
+
+const struct pl_filter_function pl_filter_function_gaussian = {
+ .name = "gaussian",
+ .weight = gaussian,
+ .radius = 2.0,
+ .resizable = true,
+ .params = {1.0},
+ .tunable = {true},
+};
+
+static double quadratic(const struct pl_filter_ctx *f, double x)
+{
+ if (x < 0.5) {
+ return 1.0 - 4.0/3.0 * (x * x);
+ } else {
+ return 2.0 / 3.0 * (x - 1.5) * (x - 1.5);
+ }
+}
+
+const struct pl_filter_function pl_filter_function_quadratic = {
+ .name = "quadratic",
+ .weight = quadratic,
+ .radius = 1.5,
+};
+
+static const struct pl_filter_function filter_function_quadric = {
+ .name = "quadric", // alias
+ .weight = quadratic,
+ .radius = 1.5,
+};
+
+static double sinc(const struct pl_filter_ctx *f, double x)
+{
+ if (x < 1e-8)
+ return 1.0;
+ x *= M_PI;
+ return sin(x) / x;
+}
+
+const struct pl_filter_function pl_filter_function_sinc = {
+ .name = "sinc",
+ .weight = sinc,
+ .radius = 1.0,
+ .resizable = true,
+};
+
+static double jinc(const struct pl_filter_ctx *f, double x)
+{
+ if (x < 1e-8)
+ return 1.0;
+ x *= M_PI;
+ return 2.0 * j1(x) / x;
+}
+
+const struct pl_filter_function pl_filter_function_jinc = {
+ .name = "jinc",
+ .weight = jinc,
+ .radius = 1.2196698912665045, // first zero
+ .resizable = true,
+};
+
+static double sphinx(const struct pl_filter_ctx *f, double x)
+{
+ if (x < 1e-8)
+ return 1.0;
+ x *= M_PI;
+ return 3.0 * (sin(x) - x * cos(x)) / (x * x * x);
+}
+
+const struct pl_filter_function pl_filter_function_sphinx = {
+ .name = "sphinx",
+ .weight = sphinx,
+ .radius = 1.4302966531242027, // first zero
+ .resizable = true,
+};
+
+static double cubic(const struct pl_filter_ctx *f, double x)
+{
+ const double b = f->params[0], c = f->params[1];
+ double p0 = 6.0 - 2.0 * b,
+ p2 = -18.0 + 12.0 * b + 6.0 * c,
+ p3 = 12.0 - 9.0 * b - 6.0 * c,
+ q0 = 8.0 * b + 24.0 * c,
+ q1 = -12.0 * b - 48.0 * c,
+ q2 = 6.0 * b + 30.0 * c,
+ q3 = -b - 6.0 * c;
+
+ if (x < 1.0) {
+ return (p0 + x * x * (p2 + x * p3)) / p0;
+ } else {
+ return (q0 + x * (q1 + x * (q2 + x * q3))) / p0;
+ }
+}
+
+const struct pl_filter_function pl_filter_function_cubic = {
+ .name = "cubic",
+ .weight = cubic,
+ .radius = 2.0,
+ .params = {1.0, 0.0},
+ .tunable = {true, true},
+};
+
+static const struct pl_filter_function filter_function_bicubic = {
+ .name = "bicubic", // alias
+ .weight = cubic,
+ .radius = 2.0,
+ .params = {1.0, 0.0},
+ .tunable = {true, true},
+};
+
+static const struct pl_filter_function filter_function_bcspline = {
+ .name = "bcspline", // alias
+ .weight = cubic,
+ .radius = 2.0,
+ .params = {1.0, 0.0},
+ .tunable = {true, true},
+};
+
+const struct pl_filter_function pl_filter_function_hermite = {
+ .name = "hermite",
+ .weight = cubic,
+ .radius = 1.0,
+ .params = {0.0, 0.0},
+};
+
+static double spline16(const struct pl_filter_ctx *f, double x)
+{
+ if (x < 1.0) {
+ return ((x - 9.0/5.0 ) * x - 1.0/5.0 ) * x + 1.0;
+ } else {
+ return ((-1.0/3.0 * (x-1) + 4.0/5.0) * (x-1) - 7.0/15.0 ) * (x-1);
+ }
+}
+
+const struct pl_filter_function pl_filter_function_spline16 = {
+ .name = "spline16",
+ .weight = spline16,
+ .radius = 2.0,
+};
+
+static double spline36(const struct pl_filter_ctx *f, double x)
+{
+ if (x < 1.0) {
+ return ((13.0/11.0 * x - 453.0/209.0) * x - 3.0/209.0) * x + 1.0;
+ } else if (x < 2.0) {
+ return ((-6.0/11.0 * (x-1) + 270.0/209.0) * (x-1) - 156.0/ 209.0) * (x-1);
+ } else {
+ return ((1.0/11.0 * (x-2) - 45.0/209.0) * (x-2) + 26.0/209.0) * (x-2);
+ }
+}
+
+const struct pl_filter_function pl_filter_function_spline36 = {
+ .name = "spline36",
+ .weight = spline36,
+ .radius = 3.0,
+};
+
+static double spline64(const struct pl_filter_ctx *f, double x)
+{
+ if (x < 1.0) {
+ return ((49.0/41.0 * x - 6387.0/2911.0) * x - 3.0/2911.0) * x + 1.0;
+ } else if (x < 2.0) {
+ return ((-24.0/41.0 * (x-1) + 4032.0/2911.0) * (x-1) - 2328.0/2911.0) * (x-1);
+ } else if (x < 3.0) {
+ return ((6.0/41.0 * (x-2) - 1008.0/2911.0) * (x-2) + 582.0/2911.0) * (x-2);
+ } else {
+ return ((-1.0/41.0 * (x-3) + 168.0/2911.0) * (x-3) - 97.0/2911.0) * (x-3);
+ }
+}
+
+const struct pl_filter_function pl_filter_function_spline64 = {
+ .name = "spline64",
+ .weight = spline64,
+ .radius = 4.0,
+};
+
+static double oversample(const struct pl_filter_ctx *f, double x)
+{
+ return 0.0;
+}
+
+const struct pl_filter_function pl_filter_function_oversample = {
+ .name = "oversample",
+ .weight = oversample,
+ .params = {0.0},
+ .tunable = {true},
+ .opaque = true,
+};
+
+const struct pl_filter_function * const pl_filter_functions[] = {
+ &pl_filter_function_box,
+ &filter_function_dirichlet, // alias
+ &pl_filter_function_triangle,
+ &pl_filter_function_cosine,
+ &pl_filter_function_hann,
+ &filter_function_hanning, // alias
+ &pl_filter_function_hamming,
+ &pl_filter_function_welch,
+ &pl_filter_function_kaiser,
+ &pl_filter_function_blackman,
+ &pl_filter_function_bohman,
+ &pl_filter_function_gaussian,
+ &pl_filter_function_quadratic,
+ &filter_function_quadric, // alias
+ &pl_filter_function_sinc,
+ &pl_filter_function_jinc,
+ &pl_filter_function_sphinx,
+ &pl_filter_function_cubic,
+ &filter_function_bicubic, // alias
+ &filter_function_bcspline, // alias
+ &pl_filter_function_hermite,
+ &pl_filter_function_spline16,
+ &pl_filter_function_spline36,
+ &pl_filter_function_spline64,
+ &pl_filter_function_oversample,
+ NULL,
+};
+
+const int pl_num_filter_functions = PL_ARRAY_SIZE(pl_filter_functions) - 1;
+
+const struct pl_filter_function *pl_find_filter_function(const char *name)
+{
+ if (!name)
+ return NULL;
+
+ for (int i = 0; i < pl_num_filter_functions; i++) {
+ if (strcmp(name, pl_filter_functions[i]->name) == 0)
+ return pl_filter_functions[i];
+ }
+
+ return NULL;
+}
+
+// Built-in filter function configs
+
+const struct pl_filter_config pl_filter_spline16 = {
+ .name = "spline16",
+ .description = "Spline (2 taps)",
+ .kernel = &pl_filter_function_spline16,
+ .allowed = PL_FILTER_ALL,
+};
+
+const struct pl_filter_config pl_filter_spline36 = {
+ .name = "spline36",
+ .description = "Spline (3 taps)",
+ .kernel = &pl_filter_function_spline36,
+ .allowed = PL_FILTER_ALL,
+};
+
+const struct pl_filter_config pl_filter_spline64 = {
+ .name = "spline64",
+ .description = "Spline (4 taps)",
+ .kernel = &pl_filter_function_spline64,
+ .allowed = PL_FILTER_ALL,
+};
+
+const struct pl_filter_config pl_filter_nearest = {
+ .name = "nearest",
+ .description = "Nearest neighbor",
+ .kernel = &pl_filter_function_box,
+ .radius = 0.5,
+ .allowed = PL_FILTER_UPSCALING,
+ .recommended = PL_FILTER_UPSCALING,
+};
+
+const struct pl_filter_config pl_filter_box = {
+ .name = "box",
+ .description = "Box averaging",
+ .kernel = &pl_filter_function_box,
+ .radius = 0.5,
+ .allowed = PL_FILTER_SCALING,
+ .recommended = PL_FILTER_DOWNSCALING,
+};
+
+const struct pl_filter_config pl_filter_bilinear = {
+ .name = "bilinear",
+ .description = "Bilinear",
+ .kernel = &pl_filter_function_triangle,
+ .allowed = PL_FILTER_ALL,
+ .recommended = PL_FILTER_SCALING,
+};
+
+static const struct pl_filter_config filter_linear = {
+ .name = "linear",
+ .description = "Linear mixing",
+ .kernel = &pl_filter_function_triangle,
+ .allowed = PL_FILTER_FRAME_MIXING,
+ .recommended = PL_FILTER_FRAME_MIXING,
+};
+
+static const struct pl_filter_config filter_triangle = {
+ .name = "triangle",
+ .kernel = &pl_filter_function_triangle,
+ .allowed = PL_FILTER_SCALING,
+};
+
+const struct pl_filter_config pl_filter_gaussian = {
+ .name = "gaussian",
+ .description = "Gaussian",
+ .kernel = &pl_filter_function_gaussian,
+ .params = {1.0},
+ .allowed = PL_FILTER_ALL,
+ .recommended = PL_FILTER_SCALING,
+};
+
+const struct pl_filter_config pl_filter_sinc = {
+ .name = "sinc",
+ .description = "Sinc (unwindowed)",
+ .kernel = &pl_filter_function_sinc,
+ .radius = 3.0,
+ .allowed = PL_FILTER_ALL,
+};
+
+const struct pl_filter_config pl_filter_lanczos = {
+ .name = "lanczos",
+ .description = "Lanczos",
+ .kernel = &pl_filter_function_sinc,
+ .window = &pl_filter_function_sinc,
+ .radius = 3.0,
+ .allowed = PL_FILTER_ALL,
+ .recommended = PL_FILTER_SCALING,
+};
+
+const struct pl_filter_config pl_filter_ginseng = {
+ .name = "ginseng",
+ .description = "Ginseng (Jinc-Sinc)",
+ .kernel = &pl_filter_function_sinc,
+ .window = &pl_filter_function_jinc,
+ .radius = 3.0,
+ .allowed = PL_FILTER_ALL,
+};
+
+#define JINC_ZERO3 3.2383154841662362076499
+#define JINC_ZERO4 4.2410628637960698819573
+
+const struct pl_filter_config pl_filter_ewa_jinc = {
+ .name = "ewa_jinc",
+ .description = "EWA Jinc (unwindowed)",
+ .kernel = &pl_filter_function_jinc,
+ .radius = JINC_ZERO3,
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+};
+
+const struct pl_filter_config pl_filter_ewa_lanczos = {
+ .name = "ewa_lanczos",
+ .description = "Jinc (EWA Lanczos)",
+ .kernel = &pl_filter_function_jinc,
+ .window = &pl_filter_function_jinc,
+ .radius = JINC_ZERO3,
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+ .recommended = PL_FILTER_UPSCALING,
+};
+
+const struct pl_filter_config pl_filter_ewa_lanczossharp = {
+ .name = "ewa_lanczossharp",
+ .description = "Sharpened Jinc",
+ .kernel = &pl_filter_function_jinc,
+ .window = &pl_filter_function_jinc,
+ .radius = JINC_ZERO3,
+ .blur = 0.98125058372237073562493,
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+ .recommended = PL_FILTER_UPSCALING,
+};
+
+const struct pl_filter_config pl_filter_ewa_lanczos4sharpest = {
+ .name = "ewa_lanczos4sharpest",
+ .description = "Sharpened Jinc-AR, 4 taps",
+ .kernel = &pl_filter_function_jinc,
+ .window = &pl_filter_function_jinc,
+ .radius = JINC_ZERO4,
+ .blur = 0.88451209326050047745788,
+ .antiring = 0.8,
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+ .recommended = PL_FILTER_UPSCALING,
+};
+
+const struct pl_filter_config pl_filter_ewa_ginseng = {
+ .name = "ewa_ginseng",
+ .description = "EWA Ginseng",
+ .kernel = &pl_filter_function_jinc,
+ .window = &pl_filter_function_sinc,
+ .radius = JINC_ZERO3,
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+};
+
+const struct pl_filter_config pl_filter_ewa_hann = {
+ .name = "ewa_hann",
+ .description = "EWA Hann",
+ .kernel = &pl_filter_function_jinc,
+ .window = &pl_filter_function_hann,
+ .radius = JINC_ZERO3,
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+};
+
+static const struct pl_filter_config filter_ewa_hanning = {
+ .name = "ewa_hanning",
+ .kernel = &pl_filter_function_jinc,
+ .window = &pl_filter_function_hann,
+ .radius = JINC_ZERO3,
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+};
+
+// Spline family
+const struct pl_filter_config pl_filter_bicubic = {
+ .name = "bicubic",
+ .description = "Bicubic",
+ .kernel = &pl_filter_function_cubic,
+ .params = {1.0, 0.0},
+ .allowed = PL_FILTER_SCALING,
+ .recommended = PL_FILTER_SCALING,
+};
+
+static const struct pl_filter_config filter_cubic = {
+ .name = "cubic",
+ .description = "Cubic",
+ .kernel = &pl_filter_function_cubic,
+ .params = {1.0, 0.0},
+ .allowed = PL_FILTER_FRAME_MIXING,
+};
+
+const struct pl_filter_config pl_filter_hermite = {
+ .name = "hermite",
+ .description = "Hermite",
+ .kernel = &pl_filter_function_hermite,
+ .allowed = PL_FILTER_ALL,
+ .recommended = PL_FILTER_DOWNSCALING | PL_FILTER_FRAME_MIXING,
+};
+
+const struct pl_filter_config pl_filter_catmull_rom = {
+ .name = "catmull_rom",
+ .description = "Catmull-Rom",
+ .kernel = &pl_filter_function_cubic,
+ .params = {0.0, 0.5},
+ .allowed = PL_FILTER_ALL,
+ .recommended = PL_FILTER_SCALING,
+};
+
+const struct pl_filter_config pl_filter_mitchell = {
+ .name = "mitchell",
+ .description = "Mitchell-Netravali",
+ .kernel = &pl_filter_function_cubic,
+ .params = {1/3.0, 1/3.0},
+ .allowed = PL_FILTER_ALL,
+ .recommended = PL_FILTER_DOWNSCALING,
+};
+
+const struct pl_filter_config pl_filter_mitchell_clamp = {
+ .name = "mitchell_clamp",
+ .description = "Mitchell (clamped)",
+ .kernel = &pl_filter_function_cubic,
+ .params = {1/3.0, 1/3.0},
+ .clamp = 1.0,
+ .allowed = PL_FILTER_ALL,
+};
+
+const struct pl_filter_config pl_filter_robidoux = {
+ .name = "robidoux",
+ .description = "Robidoux",
+ .kernel = &pl_filter_function_cubic,
+ .params = {12 / (19 + 9 * M_SQRT2), 113 / (58 + 216 * M_SQRT2)},
+ .allowed = PL_FILTER_ALL,
+};
+
+const struct pl_filter_config pl_filter_robidouxsharp = {
+ .name = "robidouxsharp",
+ .description = "RobidouxSharp",
+ .kernel = &pl_filter_function_cubic,
+ .params = {6 / (13 + 7 * M_SQRT2), 7 / (2 + 12 * M_SQRT2)},
+ .allowed = PL_FILTER_ALL,
+};
+
+const struct pl_filter_config pl_filter_ewa_robidoux = {
+ .name = "ewa_robidoux",
+ .description = "EWA Robidoux",
+ .kernel = &pl_filter_function_cubic,
+ .params = {12 / (19 + 9 * M_SQRT2), 113 / (58 + 216 * M_SQRT2)},
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+};
+
+const struct pl_filter_config pl_filter_ewa_robidouxsharp = {
+ .name = "ewa_robidouxsharp",
+ .description = "EWA RobidouxSharp",
+ .kernel = &pl_filter_function_cubic,
+ .params = {6 / (13 + 7 * M_SQRT2), 7 / (2 + 12 * M_SQRT2)},
+ .polar = true,
+ .allowed = PL_FILTER_SCALING,
+};
+
+const struct pl_filter_config pl_filter_oversample = {
+ .name = "oversample",
+ .description = "Oversampling",
+ .kernel = &pl_filter_function_oversample,
+ .params = {0.0},
+ .allowed = PL_FILTER_UPSCALING | PL_FILTER_FRAME_MIXING,
+ .recommended = PL_FILTER_UPSCALING | PL_FILTER_FRAME_MIXING,
+};
+
+const struct pl_filter_config * const pl_filter_configs[] = {
+ // Sorted roughly in terms of priority / relevance
+ &pl_filter_bilinear,
+ &filter_triangle, // alias
+ &filter_linear, // pseudo-alias (frame mixing only)
+ &pl_filter_nearest,
+ &pl_filter_spline16,
+ &pl_filter_spline36,
+ &pl_filter_spline64,
+ &pl_filter_lanczos,
+ &pl_filter_ewa_lanczos,
+ &pl_filter_ewa_lanczossharp,
+ &pl_filter_ewa_lanczos4sharpest,
+ &pl_filter_bicubic,
+ &filter_cubic, // pseudo-alias (frame mixing only)
+ &pl_filter_hermite,
+ &pl_filter_gaussian,
+ &pl_filter_oversample,
+ &pl_filter_mitchell,
+ &pl_filter_mitchell_clamp,
+ &pl_filter_sinc,
+ &pl_filter_ginseng,
+ &pl_filter_ewa_jinc,
+ &pl_filter_ewa_ginseng,
+ &pl_filter_ewa_hann,
+ &filter_ewa_hanning, // alias
+ &pl_filter_catmull_rom,
+ &pl_filter_robidoux,
+ &pl_filter_robidouxsharp,
+ &pl_filter_ewa_robidoux,
+ &pl_filter_ewa_robidouxsharp,
+
+ NULL,
+};
+
+const int pl_num_filter_configs = PL_ARRAY_SIZE(pl_filter_configs) - 1;
+
+const struct pl_filter_config *
+pl_find_filter_config(const char *name, enum pl_filter_usage usage)
+{
+ if (!name)
+ return NULL;
+
+ for (int i = 0; i < pl_num_filter_configs; i++) {
+ if ((pl_filter_configs[i]->allowed & usage) != usage)
+ continue;
+ if (strcmp(name, pl_filter_configs[i]->name) == 0)
+ return pl_filter_configs[i];
+ }
+
+ return NULL;
+}
+
+// Backwards compatibility with older API
+
+const struct pl_filter_function_preset pl_filter_function_presets[] = {
+ {"none", NULL},
+ {"box", &pl_filter_function_box},
+ {"dirichlet", &filter_function_dirichlet}, // alias
+ {"triangle", &pl_filter_function_triangle},
+ {"cosine", &pl_filter_function_cosine},
+ {"hann", &pl_filter_function_hann},
+ {"hanning", &filter_function_hanning}, // alias
+ {"hamming", &pl_filter_function_hamming},
+ {"welch", &pl_filter_function_welch},
+ {"kaiser", &pl_filter_function_kaiser},
+ {"blackman", &pl_filter_function_blackman},
+ {"bohman", &pl_filter_function_bohman},
+ {"gaussian", &pl_filter_function_gaussian},
+ {"quadratic", &pl_filter_function_quadratic},
+ {"quadric", &filter_function_quadric}, // alias
+ {"sinc", &pl_filter_function_sinc},
+ {"jinc", &pl_filter_function_jinc},
+ {"sphinx", &pl_filter_function_sphinx},
+ {"cubic", &pl_filter_function_cubic},
+ {"bicubic", &filter_function_bicubic}, // alias
+ {"bcspline", &filter_function_bcspline}, // alias
+ {"hermite", &pl_filter_function_hermite},
+ {"spline16", &pl_filter_function_spline16},
+ {"spline36", &pl_filter_function_spline36},
+ {"spline64", &pl_filter_function_spline64},
+ {0},
+};
+
+const int pl_num_filter_function_presets = PL_ARRAY_SIZE(pl_filter_function_presets) - 1;
+
+const struct pl_filter_function_preset *pl_find_filter_function_preset(const char *name)
+{
+ if (!name)
+ return NULL;
+
+ for (int i = 0; pl_filter_function_presets[i].name; i++) {
+ if (strcmp(pl_filter_function_presets[i].name, name) == 0)
+ return &pl_filter_function_presets[i];
+ }
+
+ return NULL;
+}
+
+const struct pl_filter_preset *pl_find_filter_preset(const char *name)
+{
+ if (!name)
+ return NULL;
+
+ for (int i = 0; pl_filter_presets[i].name; i++) {
+ if (strcmp(pl_filter_presets[i].name, name) == 0)
+ return &pl_filter_presets[i];
+ }
+
+ return NULL;
+}
+
+const struct pl_filter_preset pl_filter_presets[] = {
+ {"none", NULL, "Built-in sampling"},
+ COMMON_FILTER_PRESETS,
+ {0}
+};
+
+const int pl_num_filter_presets = PL_ARRAY_SIZE(pl_filter_presets) - 1;
diff --git a/src/filters.h b/src/filters.h
new file mode 100644
index 0000000..c3227db
--- /dev/null
+++ b/src/filters.h
@@ -0,0 +1,58 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <libplacebo/filters.h>
+
+static inline float pl_filter_radius_bound(const struct pl_filter_config *c)
+{
+ const float r = c->radius && c->kernel->resizable ? c->radius : c->kernel->radius;
+ return c->blur > 0.0 ? r * c->blur : r;
+}
+
+#define COMMON_FILTER_PRESETS \
+ /* Highest priority / recommended filters */ \
+ {"bilinear", &pl_filter_bilinear, "Bilinear"}, \
+ {"nearest", &pl_filter_nearest, "Nearest neighbour"}, \
+ {"bicubic", &pl_filter_bicubic, "Bicubic"}, \
+ {"lanczos", &pl_filter_lanczos, "Lanczos"}, \
+ {"ewa_lanczos", &pl_filter_ewa_lanczos, "Jinc (EWA Lanczos)"}, \
+ {"ewa_lanczossharp", &pl_filter_ewa_lanczossharp, "Sharpened Jinc"}, \
+ {"ewa_lanczos4sharpest",&pl_filter_ewa_lanczos4sharpest, "Sharpened Jinc-AR, 4 taps"},\
+ {"gaussian", &pl_filter_gaussian, "Gaussian"}, \
+ {"spline16", &pl_filter_spline16, "Spline (2 taps)"}, \
+ {"spline36", &pl_filter_spline36, "Spline (3 taps)"}, \
+ {"spline64", &pl_filter_spline64, "Spline (4 taps)"}, \
+ {"mitchell", &pl_filter_mitchell, "Mitchell-Netravali"}, \
+ \
+ /* Remaining filters */ \
+ {"sinc", &pl_filter_sinc, "Sinc (unwindowed)"}, \
+ {"ginseng", &pl_filter_ginseng, "Ginseng (Jinc-Sinc)"}, \
+ {"ewa_jinc", &pl_filter_ewa_jinc, "EWA Jinc (unwindowed)"}, \
+ {"ewa_ginseng", &pl_filter_ewa_ginseng, "EWA Ginseng"}, \
+ {"ewa_hann", &pl_filter_ewa_hann, "EWA Hann"}, \
+ {"hermite", &pl_filter_hermite, "Hermite"}, \
+ {"catmull_rom", &pl_filter_catmull_rom, "Catmull-Rom"}, \
+ {"robidoux", &pl_filter_robidoux, "Robidoux"}, \
+ {"robidouxsharp", &pl_filter_robidouxsharp, "RobidouxSharp"}, \
+ {"ewa_robidoux", &pl_filter_ewa_robidoux, "EWA Robidoux"}, \
+ {"ewa_robidouxsharp", &pl_filter_ewa_robidouxsharp, "EWA RobidouxSharp"}, \
+ \
+ /* Aliases */ \
+ {"triangle", &pl_filter_bilinear}, \
+ {"ewa_hanning", &pl_filter_ewa_hann}
diff --git a/src/format.c b/src/format.c
new file mode 100644
index 0000000..458d493
--- /dev/null
+++ b/src/format.c
@@ -0,0 +1,205 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "common.h"
+
+void pl_str_append_asprintf_c(void *alloc, pl_str *str, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ pl_str_append_vasprintf_c(alloc, str, fmt, ap);
+ va_end(ap);
+}
+
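+// Restricted printf-style formatter: only the conversions handled below are
+// supported (%%, %s, %.*s, %c, %d, %u, %hx, %llu, %lld, %zu, %f); anything
+// else aborts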
+void pl_str_append_vasprintf_c(void *alloc, pl_str *str, const char *fmt,
+ va_list ap)
+{
+ for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) {
+ // Append the preceding string literal
+ pl_str_append_raw(alloc, str, fmt, c - fmt);
+ c++; // skip '%'
+
+ char buf[32];
+ int len;
+
+ // The format character follows the % sign
+ switch (c[0]) {
+ case '%':
+ pl_str_append_raw(alloc, str, c, 1);
+ continue;
+ case 's': {
+ const char *arg = va_arg(ap, const char *);
+ pl_str_append_raw(alloc, str, arg, strlen(arg));
+ continue;
+ }
+ case '.': { // only used for %.*s
+ assert(c[1] == '*');
+ assert(c[2] == 's');
+ len = va_arg(ap, int);
+ pl_str_append_raw(alloc, str, va_arg(ap, char *), len);
+ c += 2; // skip '*s'
+ continue;
+ }
+ case 'c':
+ buf[0] = (char) va_arg(ap, int);
+ len = 1;
+ break;
+ case 'd':
+ len = pl_str_print_int(buf, sizeof(buf), va_arg(ap, int));
+ break;
+ case 'h': ; // only used for %hx
+ assert(c[1] == 'x');
+ len = pl_str_print_hex(buf, sizeof(buf), (unsigned short) va_arg(ap, unsigned int));
+ c++;
+ break;
+ case 'u':
+ len = pl_str_print_uint(buf, sizeof(buf), va_arg(ap, unsigned int));
+ break;
+ case 'l':
+ assert(c[1] == 'l');
+ switch (c[2]) {
+ case 'u':
+ len = pl_str_print_uint64(buf, sizeof(buf), va_arg(ap, unsigned long long));
+ break;
+ case 'd':
+ len = pl_str_print_int64(buf, sizeof(buf), va_arg(ap, long long));
+ break;
+ default: pl_unreachable();
+ }
+ c += 2;
+ break;
+ case 'z':
+ assert(c[1] == 'u');
+ len = pl_str_print_uint64(buf, sizeof(buf), va_arg(ap, size_t));
+ c++;
+ break;
+ case 'f':
+ len = pl_str_print_double(buf, sizeof(buf), va_arg(ap, double));
+ break;
+ default:
+ fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]);
+ abort();
+ }
+
+ pl_str_append_raw(alloc, str, buf, len);
+ }
+
+ // Append the remaining string literal
+ pl_str_append(alloc, str, pl_str0(fmt));
+}
+
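+// Like pl_str_append_vasprintf_c, but reads the arguments from a tightly
+// packed in-memory buffer instead of a va_list. Returns the number of bytes
+// consumed from `args`.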
+size_t pl_str_append_memprintf_c(void *alloc, pl_str *str, const char *fmt,
+ const void *args)
+{
+ const uint8_t *ptr = args;
+
+ for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) {
+ pl_str_append_raw(alloc, str, fmt, c - fmt);
+ c++;
+
+ char buf[32];
+ int len;
+
+#define LOAD(var) \
+ do { \
+ memcpy(&(var), ptr, sizeof(var)); \
+ ptr += sizeof(var); \
+ } while (0)
+
+ switch (c[0]) {
+ case '%':
+ pl_str_append_raw(alloc, str, c, 1);
+ continue;
+ case 's': {
+ len = strlen((const char *) ptr);
+ pl_str_append_raw(alloc, str, ptr, len);
+ ptr += len + 1; // also skip \0
+ continue;
+ }
+ case '.': {
+ assert(c[1] == '*');
+ assert(c[2] == 's');
+ LOAD(len);
+ pl_str_append_raw(alloc, str, ptr, len);
+ ptr += len; // no trailing \0
+ c += 2;
+ continue;
+ }
+ case 'c':
+ LOAD(buf[0]);
+ len = 1;
+ break;
+ case 'd': ;
+ int d;
+ LOAD(d);
+ len = pl_str_print_int(buf, sizeof(buf), d);
+ break;
+ case 'h': ;
+ assert(c[1] == 'x');
+ unsigned short hx;
+ LOAD(hx);
+ len = pl_str_print_hex(buf, sizeof(buf), hx);
+ c++;
+ break;
+ case 'u': ;
+ unsigned u;
+ LOAD(u);
+ len = pl_str_print_uint(buf, sizeof(buf), u);
+ break;
+ case 'l':
+ assert(c[1] == 'l');
+ switch (c[2]) {
+ case 'u': ;
+ long long unsigned llu;
+ LOAD(llu);
+ len = pl_str_print_uint64(buf, sizeof(buf), llu);
+ break;
+ case 'd': ;
+ long long int lld;
+ LOAD(lld);
+ len = pl_str_print_int64(buf, sizeof(buf), lld);
+ break;
+ default: pl_unreachable();
+ }
+ c += 2;
+ break;
+ case 'z': ;
+ assert(c[1] == 'u');
+ size_t zu;
+ LOAD(zu);
+ len = pl_str_print_uint64(buf, sizeof(buf), zu);
+ c++;
+ break;
+ case 'f': ;
+ double f;
+ LOAD(f);
+ len = pl_str_print_double(buf, sizeof(buf), f);
+ break;
+ default:
+ fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]);
+ abort();
+ }
+
+ pl_str_append_raw(alloc, str, buf, len);
+ }
+#undef LOAD
+
+ pl_str_append(alloc, str, pl_str0(fmt));
+ return (uintptr_t) ptr - (uintptr_t) args;
+}
diff --git a/src/gamut_mapping.c b/src/gamut_mapping.c
new file mode 100644
index 0000000..e80d0a7
--- /dev/null
+++ b/src/gamut_mapping.c
@@ -0,0 +1,1008 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "common.h"
+#include "pl_thread.h"
+
+#include <libplacebo/gamut_mapping.h>
+
+#define fclampf(x, lo, hi) fminf(fmaxf(x, lo), hi)
+static void fix_constants(struct pl_gamut_map_constants *c)
+{
+ c->perceptual_deadzone = fclampf(c->perceptual_deadzone, 0.0f, 1.0f);
+ c->perceptual_strength = fclampf(c->perceptual_strength, 0.0f, 1.0f);
+ c->colorimetric_gamma = fclampf(c->colorimetric_gamma, 0.0f, 10.0f);
+ c->softclip_knee = fclampf(c->softclip_knee, 0.0f, 1.0f);
+ c->softclip_desat = fclampf(c->softclip_desat, 0.0f, 1.0f);
+}
+
+static inline bool constants_equal(const struct pl_gamut_map_constants *a,
+ const struct pl_gamut_map_constants *b)
+{
+ pl_static_assert(sizeof(*a) % sizeof(float) == 0);
+ return !memcmp(a, b, sizeof(*a));
+}
+
+bool pl_gamut_map_params_equal(const struct pl_gamut_map_params *a,
+ const struct pl_gamut_map_params *b)
+{
+ return a->function == b->function &&
+ a->min_luma == b->min_luma &&
+ a->max_luma == b->max_luma &&
+ a->lut_size_I == b->lut_size_I &&
+ a->lut_size_C == b->lut_size_C &&
+ a->lut_size_h == b->lut_size_h &&
+ a->lut_stride == b->lut_stride &&
+ constants_equal(&a->constants, &b->constants) &&
+ pl_raw_primaries_equal(&a->input_gamut, &b->input_gamut) &&
+ pl_raw_primaries_equal(&a->output_gamut, &b->output_gamut);
+}
+
+#define FUN(params) (params->function ? *params->function : pl_gamut_map_clip)
+
+static void noop(float *lut, const struct pl_gamut_map_params *params);
+bool pl_gamut_map_params_noop(const struct pl_gamut_map_params *params)
+{
+ if (FUN(params).map == &noop)
+ return true;
+
+ struct pl_raw_primaries src = params->input_gamut, dst = params->output_gamut;
+ if (!pl_primaries_compatible(&dst, &src))
+ return true;
+
+ bool need_map = !pl_primaries_superset(&dst, &src);
+ need_map |= !pl_cie_xy_equal(&src.white, &dst.white);
+ if (FUN(params).bidirectional)
+ need_map |= !pl_raw_primaries_equal(&dst, &src);
+
+ return !need_map;
+}
+
+// For some minimal type safety, and code cleanliness
+struct RGB {
+ float R, G, B;
+};
+
+struct IPT {
+ float I, P, T;
+};
+
+struct ICh {
+ float I, C, h;
+};
+
+static inline struct ICh ipt2ich(struct IPT c)
+{
+ return (struct ICh) {
+ .I = c.I,
+ .C = sqrtf(c.P * c.P + c.T * c.T),
+ .h = atan2f(c.T, c.P),
+ };
+}
+
+static inline struct IPT ich2ipt(struct ICh c)
+{
+ return (struct IPT) {
+ .I = c.I,
+ .P = c.C * cosf(c.h),
+ .T = c.C * sinf(c.h),
+ };
+}
+
+static const float PQ_M1 = 2610./4096 * 1./4,
+ PQ_M2 = 2523./4096 * 128,
+ PQ_C1 = 3424./4096,
+ PQ_C2 = 2413./4096 * 32,
+ PQ_C3 = 2392./4096 * 32;
+
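+// Pre-computed PQ (ST 2084) EOTF, sampled at PQ_LUT_SIZE evenly spaced signal
+// values in [0,1]; the output is display-linear, normalized so that 1.0
+// corresponds to 10000 cd/m²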
+enum { PQ_LUT_SIZE = 1024 };
+static const float pq_eotf_lut[1024+1] = {
+ 0.0000000e+00f, 4.0422718e-09f, 1.3111372e-08f, 2.6236826e-08f, 4.3151495e-08f, 6.3746885e-08f, 8.7982383e-08f, 1.1585362e-07f,
+ 1.4737819e-07f, 1.8258818e-07f, 2.2152586e-07f, 2.6424098e-07f, 3.1078907e-07f, 3.6123021e-07f, 4.1562821e-07f, 4.7405001e-07f,
+ 5.3656521e-07f, 6.0324583e-07f, 6.7416568e-07f, 7.4940095e-07f, 8.2902897e-07f, 9.1312924e-07f, 1.0017822e-06f, 1.0950702e-06f,
+ 1.1930764e-06f, 1.2958861e-06f, 1.4035847e-06f, 1.5162600e-06f, 1.6340000e-06f, 1.7568948e-06f, 1.8850346e-06f, 2.0185119e-06f,
+ 2.1574192e-06f, 2.3018509e-06f, 2.4519029e-06f, 2.6076704e-06f, 2.7692516e-06f, 2.9367449e-06f, 3.1102509e-06f, 3.2898690e-06f,
+ 3.4757019e-06f, 3.6678526e-06f, 3.8664261e-06f, 4.0715262e-06f, 4.2832601e-06f, 4.5017354e-06f, 4.7270617e-06f, 4.9593473e-06f,
+ 5.1987040e-06f, 5.4452441e-06f, 5.6990819e-06f, 5.9603301e-06f, 6.2291055e-06f, 6.5055251e-06f, 6.7897080e-06f, 7.0817717e-06f,
+ 7.3818379e-06f, 7.6900283e-06f, 8.0064675e-06f, 8.3312774e-06f, 8.6645849e-06f, 9.0065169e-06f, 9.3572031e-06f, 9.7167704e-06f,
+ 1.0085351e-05f, 1.0463077e-05f, 1.0850082e-05f, 1.1246501e-05f, 1.1652473e-05f, 1.2068130e-05f, 1.2493614e-05f, 1.2929066e-05f,
+ 1.3374626e-05f, 1.3830439e-05f, 1.4296648e-05f, 1.4773401e-05f, 1.5260848e-05f, 1.5759132e-05f, 1.6268405e-05f, 1.6788821e-05f,
+ 1.7320534e-05f, 1.7863697e-05f, 1.8418467e-05f, 1.8985004e-05f, 1.9563470e-05f, 2.0154019e-05f, 2.0756818e-05f, 2.1372031e-05f,
+ 2.1999824e-05f, 2.2640365e-05f, 2.3293824e-05f, 2.3960372e-05f, 2.4640186e-05f, 2.5333431e-05f, 2.6040288e-05f, 2.6760935e-05f,
+ 2.7495552e-05f, 2.8244319e-05f, 2.9007421e-05f, 2.9785041e-05f, 3.0577373e-05f, 3.1384594e-05f, 3.2206899e-05f, 3.3044481e-05f,
+ 3.3897533e-05f, 3.4766253e-05f, 3.5650838e-05f, 3.6551487e-05f, 3.7468409e-05f, 3.8401794e-05f, 3.9351855e-05f, 4.0318799e-05f,
+ 4.1302836e-05f, 4.2304177e-05f, 4.3323036e-05f, 4.4359629e-05f, 4.5414181e-05f, 4.6486897e-05f, 4.7578006e-05f, 4.8687732e-05f,
+ 4.9816302e-05f, 5.0963944e-05f, 5.2130889e-05f, 5.3317369e-05f, 5.4523628e-05f, 5.5749886e-05f, 5.6996391e-05f, 5.8263384e-05f,
+ 5.9551111e-05f, 6.0859816e-05f, 6.2189750e-05f, 6.3541162e-05f, 6.4914307e-05f, 6.6309439e-05f, 6.7726819e-05f, 6.9166705e-05f,
+ 7.0629384e-05f, 7.2115077e-05f, 7.3624074e-05f, 7.5156646e-05f, 7.6713065e-05f, 7.8293608e-05f, 7.9898553e-05f, 8.1528181e-05f,
+ 8.3182776e-05f, 8.4862623e-05f, 8.6568012e-05f, 8.8299235e-05f, 9.0056585e-05f, 9.1840360e-05f, 9.3650860e-05f, 9.5488388e-05f,
+ 9.7353277e-05f, 9.9245779e-05f, 1.0116623e-04f, 1.0311496e-04f, 1.0509226e-04f, 1.0709847e-04f, 1.0913391e-04f, 1.1119889e-04f,
+ 1.1329376e-04f, 1.1541885e-04f, 1.1757448e-04f, 1.1976100e-04f, 1.2197875e-04f, 1.2422807e-04f, 1.2650931e-04f, 1.2882282e-04f,
+ 1.3116900e-04f, 1.3354812e-04f, 1.3596059e-04f, 1.3840676e-04f, 1.4088701e-04f, 1.4340170e-04f, 1.4595121e-04f, 1.4853593e-04f,
+ 1.5115622e-04f, 1.5381247e-04f, 1.5650507e-04f, 1.5923442e-04f, 1.6200090e-04f, 1.6480492e-04f, 1.6764688e-04f, 1.7052718e-04f,
+ 1.7344629e-04f, 1.7640451e-04f, 1.7940233e-04f, 1.8244015e-04f, 1.8551840e-04f, 1.8863752e-04f, 1.9179792e-04f, 1.9500006e-04f,
+ 1.9824437e-04f, 2.0153130e-04f, 2.0486129e-04f, 2.0823479e-04f, 2.1165227e-04f, 2.1511419e-04f, 2.1862101e-04f, 2.2217319e-04f,
+ 2.2577128e-04f, 2.2941563e-04f, 2.3310679e-04f, 2.3684523e-04f, 2.4063146e-04f, 2.4446597e-04f, 2.4834925e-04f, 2.5228182e-04f,
+ 2.5626417e-04f, 2.6029683e-04f, 2.6438031e-04f, 2.6851514e-04f, 2.7270184e-04f, 2.7694094e-04f, 2.8123299e-04f, 2.8557852e-04f,
+ 2.8997815e-04f, 2.9443230e-04f, 2.9894159e-04f, 3.0350657e-04f, 3.0812783e-04f, 3.1280593e-04f, 3.1754144e-04f, 3.2233495e-04f,
+ 3.2718705e-04f, 3.3209833e-04f, 3.3706938e-04f, 3.4210082e-04f, 3.4719324e-04f, 3.5234727e-04f, 3.5756351e-04f, 3.6284261e-04f,
+ 3.6818526e-04f, 3.7359195e-04f, 3.7906340e-04f, 3.8460024e-04f, 3.9020315e-04f, 3.9587277e-04f, 4.0160977e-04f, 4.0741483e-04f,
+ 4.1328861e-04f, 4.1923181e-04f, 4.2524511e-04f, 4.3132921e-04f, 4.3748480e-04f, 4.4371260e-04f, 4.5001332e-04f, 4.5638768e-04f,
+ 4.6283650e-04f, 4.6936032e-04f, 4.7595999e-04f, 4.8263624e-04f, 4.8938982e-04f, 4.9622151e-04f, 5.0313205e-04f, 5.1012223e-04f,
+ 5.1719283e-04f, 5.2434463e-04f, 5.3157843e-04f, 5.3889502e-04f, 5.4629521e-04f, 5.5377982e-04f, 5.6134968e-04f, 5.6900560e-04f,
+ 5.7674843e-04f, 5.8457900e-04f, 5.9249818e-04f, 6.0050682e-04f, 6.0860578e-04f, 6.1679595e-04f, 6.2507819e-04f, 6.3345341e-04f,
+ 6.4192275e-04f, 6.5048661e-04f, 6.5914616e-04f, 6.6790231e-04f, 6.7675600e-04f, 6.8570816e-04f, 6.9475975e-04f, 7.0391171e-04f,
+ 7.1316500e-04f, 7.2252060e-04f, 7.3197948e-04f, 7.4154264e-04f, 7.5121107e-04f, 7.6098577e-04f, 7.7086777e-04f, 7.8085807e-04f,
+ 7.9095772e-04f, 8.0116775e-04f, 8.1148922e-04f, 8.2192318e-04f, 8.3247071e-04f, 8.4313287e-04f, 8.5391076e-04f, 8.6480548e-04f,
+ 8.7581812e-04f, 8.8694982e-04f, 8.9820168e-04f, 9.0957485e-04f, 9.2107048e-04f, 9.3268971e-04f, 9.4443372e-04f, 9.5630368e-04f,
+ 9.6830115e-04f, 9.8042658e-04f, 9.9268155e-04f, 1.0050673e-03f, 1.0175850e-03f, 1.0302359e-03f, 1.0430213e-03f, 1.0559425e-03f,
+ 1.0690006e-03f, 1.0821970e-03f, 1.0955331e-03f, 1.1090100e-03f, 1.1226290e-03f, 1.1363917e-03f, 1.1502992e-03f, 1.1643529e-03f,
+ 1.1785542e-03f, 1.1929044e-03f, 1.2074050e-03f, 1.2220573e-03f, 1.2368628e-03f, 1.2518229e-03f, 1.2669390e-03f, 1.2822125e-03f,
+ 1.2976449e-03f, 1.3132377e-03f, 1.3289925e-03f, 1.3449105e-03f, 1.3609935e-03f, 1.3772429e-03f, 1.3936602e-03f, 1.4102470e-03f,
+ 1.4270054e-03f, 1.4439360e-03f, 1.4610407e-03f, 1.4783214e-03f, 1.4957794e-03f, 1.5134166e-03f, 1.5312345e-03f, 1.5492348e-03f,
+ 1.5674192e-03f, 1.5857894e-03f, 1.6043471e-03f, 1.6230939e-03f, 1.6420317e-03f, 1.6611622e-03f, 1.6804871e-03f, 1.7000083e-03f,
+ 1.7197275e-03f, 1.7396465e-03f, 1.7597672e-03f, 1.7800914e-03f, 1.8006210e-03f, 1.8213578e-03f, 1.8423038e-03f, 1.8634608e-03f,
+ 1.8848308e-03f, 1.9064157e-03f, 1.9282175e-03f, 1.9502381e-03f, 1.9724796e-03f, 1.9949439e-03f, 2.0176331e-03f, 2.0405492e-03f,
+ 2.0636950e-03f, 2.0870711e-03f, 2.1106805e-03f, 2.1345250e-03f, 2.1586071e-03f, 2.1829286e-03f, 2.2074919e-03f, 2.2322992e-03f,
+ 2.2573525e-03f, 2.2826542e-03f, 2.3082066e-03f, 2.3340118e-03f, 2.3600721e-03f, 2.3863900e-03f, 2.4129676e-03f, 2.4398074e-03f,
+ 2.4669117e-03f, 2.4942828e-03f, 2.5219233e-03f, 2.5498355e-03f, 2.5780219e-03f, 2.6064849e-03f, 2.6352271e-03f, 2.6642509e-03f,
+ 2.6935589e-03f, 2.7231536e-03f, 2.7530377e-03f, 2.7832137e-03f, 2.8136843e-03f, 2.8444520e-03f, 2.8755196e-03f, 2.9068898e-03f,
+ 2.9385662e-03f, 2.9705496e-03f, 3.0028439e-03f, 3.0354517e-03f, 3.0683758e-03f, 3.1016192e-03f, 3.1351846e-03f, 3.1690750e-03f,
+ 3.2032932e-03f, 3.2378422e-03f, 3.2727250e-03f, 3.3079445e-03f, 3.3435038e-03f, 3.3794058e-03f, 3.4156537e-03f, 3.4522505e-03f,
+ 3.4891993e-03f, 3.5265034e-03f, 3.5641658e-03f, 3.6021897e-03f, 3.6405785e-03f, 3.6793353e-03f, 3.7184634e-03f, 3.7579661e-03f,
+ 3.7978468e-03f, 3.8381088e-03f, 3.8787555e-03f, 3.9197904e-03f, 3.9612169e-03f, 4.0030385e-03f, 4.0452587e-03f, 4.0878810e-03f,
+ 4.1309104e-03f, 4.1743478e-03f, 4.2181981e-03f, 4.2624651e-03f, 4.3071525e-03f, 4.3522639e-03f, 4.3978031e-03f, 4.4437739e-03f,
+ 4.4901803e-03f, 4.5370259e-03f, 4.5843148e-03f, 4.6320508e-03f, 4.6802379e-03f, 4.7288801e-03f, 4.7779815e-03f, 4.8275461e-03f,
+ 4.8775780e-03f, 4.9280813e-03f, 4.9790603e-03f, 5.0305191e-03f, 5.0824620e-03f, 5.1348933e-03f, 5.1878172e-03f, 5.2412382e-03f,
+ 5.2951607e-03f, 5.3495890e-03f, 5.4045276e-03f, 5.4599811e-03f, 5.5159540e-03f, 5.5724510e-03f, 5.6294765e-03f, 5.6870353e-03f,
+ 5.7451339e-03f, 5.8037735e-03f, 5.8629606e-03f, 5.9227001e-03f, 5.9829968e-03f, 6.0438557e-03f, 6.1052818e-03f, 6.1672799e-03f,
+ 6.2298552e-03f, 6.2930128e-03f, 6.3567578e-03f, 6.4210953e-03f, 6.4860306e-03f, 6.5515690e-03f, 6.6177157e-03f, 6.6844762e-03f,
+ 6.7518558e-03f, 6.8198599e-03f, 6.8884942e-03f, 6.9577641e-03f, 7.0276752e-03f, 7.0982332e-03f, 7.1694438e-03f, 7.2413127e-03f,
+ 7.3138457e-03f, 7.3870486e-03f, 7.4609273e-03f, 7.5354878e-03f, 7.6107361e-03f, 7.6866782e-03f, 7.7633203e-03f, 7.8406684e-03f,
+ 7.9187312e-03f, 7.9975101e-03f, 8.0770139e-03f, 8.1572490e-03f, 8.2382216e-03f, 8.3199385e-03f, 8.4024059e-03f, 8.4856307e-03f,
+ 8.5696193e-03f, 8.6543786e-03f, 8.7399153e-03f, 8.8262362e-03f, 8.9133482e-03f, 9.0012582e-03f, 9.0899733e-03f, 9.1795005e-03f,
+ 9.2698470e-03f, 9.3610199e-03f, 9.4530265e-03f, 9.5458741e-03f, 9.6395701e-03f, 9.7341219e-03f, 9.8295370e-03f, 9.9258231e-03f,
+ 1.0022988e-02f, 1.0121039e-02f, 1.0219984e-02f, 1.0319830e-02f, 1.0420587e-02f, 1.0522261e-02f, 1.0624862e-02f, 1.0728396e-02f,
+ 1.0832872e-02f, 1.0938299e-02f, 1.1044684e-02f, 1.1152036e-02f, 1.1260365e-02f, 1.1369677e-02f, 1.1479982e-02f, 1.1591288e-02f,
+ 1.1703605e-02f, 1.1816941e-02f, 1.1931305e-02f, 1.2046706e-02f, 1.2163153e-02f, 1.2280656e-02f, 1.2399223e-02f, 1.2518864e-02f,
+ 1.2639596e-02f, 1.2761413e-02f, 1.2884333e-02f, 1.3008365e-02f, 1.3133519e-02f, 1.3259804e-02f, 1.3387231e-02f, 1.3515809e-02f,
+ 1.3645549e-02f, 1.3776461e-02f, 1.3908555e-02f, 1.4041841e-02f, 1.4176331e-02f, 1.4312034e-02f, 1.4448961e-02f, 1.4587123e-02f,
+ 1.4726530e-02f, 1.4867194e-02f, 1.5009126e-02f, 1.5152336e-02f, 1.5296837e-02f, 1.5442638e-02f, 1.5589753e-02f, 1.5738191e-02f,
+ 1.5887965e-02f, 1.6039087e-02f, 1.6191567e-02f, 1.6345419e-02f, 1.6500655e-02f, 1.6657285e-02f, 1.6815323e-02f, 1.6974781e-02f,
+ 1.7135672e-02f, 1.7298007e-02f, 1.7461800e-02f, 1.7627063e-02f, 1.7793810e-02f, 1.7962053e-02f, 1.8131805e-02f, 1.8303080e-02f,
+ 1.8475891e-02f, 1.8650252e-02f, 1.8826176e-02f, 1.9003676e-02f, 1.9182767e-02f, 1.9363463e-02f, 1.9545777e-02f, 1.9729724e-02f,
+ 1.9915319e-02f, 2.0102575e-02f, 2.0291507e-02f, 2.0482131e-02f, 2.0674460e-02f, 2.0868510e-02f, 2.1064296e-02f, 2.1261833e-02f,
+ 2.1461136e-02f, 2.1662222e-02f, 2.1865105e-02f, 2.2069802e-02f, 2.2276328e-02f, 2.2484700e-02f, 2.2694934e-02f, 2.2907045e-02f,
+ 2.3121064e-02f, 2.3336982e-02f, 2.3554827e-02f, 2.3774618e-02f, 2.3996370e-02f, 2.4220102e-02f, 2.4445831e-02f, 2.4673574e-02f,
+ 2.4903349e-02f, 2.5135174e-02f, 2.5369067e-02f, 2.5605046e-02f, 2.5843129e-02f, 2.6083336e-02f, 2.6325684e-02f, 2.6570192e-02f,
+ 2.6816880e-02f, 2.7065767e-02f, 2.7316872e-02f, 2.7570215e-02f, 2.7825815e-02f, 2.8083692e-02f, 2.8343867e-02f, 2.8606359e-02f,
+ 2.8871189e-02f, 2.9138378e-02f, 2.9407946e-02f, 2.9679914e-02f, 2.9954304e-02f, 3.0231137e-02f, 3.0510434e-02f, 3.0792217e-02f,
+ 3.1076508e-02f, 3.1363330e-02f, 3.1652704e-02f, 3.1944653e-02f, 3.2239199e-02f, 3.2536367e-02f, 3.2836178e-02f, 3.3138657e-02f,
+ 3.3443826e-02f, 3.3751710e-02f, 3.4062333e-02f, 3.4375718e-02f, 3.4691890e-02f, 3.5010874e-02f, 3.5332694e-02f, 3.5657377e-02f,
+ 3.5984946e-02f, 3.6315428e-02f, 3.6648848e-02f, 3.6985233e-02f, 3.7324608e-02f, 3.7667000e-02f, 3.8012436e-02f, 3.8360942e-02f,
+ 3.8712547e-02f, 3.9067276e-02f, 3.9425159e-02f, 3.9786223e-02f, 4.0150496e-02f, 4.0518006e-02f, 4.0888783e-02f, 4.1262855e-02f,
+ 4.1640274e-02f, 4.2021025e-02f, 4.2405159e-02f, 4.2792707e-02f, 4.3183699e-02f, 4.3578166e-02f, 4.3976138e-02f, 4.4377647e-02f,
+ 4.4782724e-02f, 4.5191401e-02f, 4.5603709e-02f, 4.6019681e-02f, 4.6439350e-02f, 4.6862749e-02f, 4.7289910e-02f, 4.7720867e-02f,
+ 4.8155654e-02f, 4.8594305e-02f, 4.9036854e-02f, 4.9483336e-02f, 4.9933787e-02f, 5.0388240e-02f, 5.0846733e-02f, 5.1309301e-02f,
+ 5.1775981e-02f, 5.2246808e-02f, 5.2721821e-02f, 5.3201056e-02f, 5.3684551e-02f, 5.4172344e-02f, 5.4664473e-02f, 5.5160978e-02f,
+ 5.5661897e-02f, 5.6167269e-02f, 5.6677135e-02f, 5.7191535e-02f, 5.7710508e-02f, 5.8234097e-02f, 5.8762342e-02f, 5.9295285e-02f,
+ 5.9832968e-02f, 6.0375433e-02f, 6.0922723e-02f, 6.1474882e-02f, 6.2031952e-02f, 6.2593979e-02f, 6.3161006e-02f, 6.3733078e-02f,
+ 6.4310241e-02f, 6.4892540e-02f, 6.5480021e-02f, 6.6072730e-02f, 6.6670715e-02f, 6.7274023e-02f, 6.7882702e-02f, 6.8496800e-02f,
+ 6.9116365e-02f, 6.9741447e-02f, 7.0372096e-02f, 7.1008361e-02f, 7.1650293e-02f, 7.2297942e-02f, 7.2951361e-02f, 7.3610602e-02f,
+ 7.4275756e-02f, 7.4946797e-02f, 7.5623818e-02f, 7.6306873e-02f, 7.6996016e-02f, 7.7691302e-02f, 7.8392787e-02f, 7.9100526e-02f,
+ 7.9814576e-02f, 8.0534993e-02f, 8.1261837e-02f, 8.1995163e-02f, 8.2735032e-02f, 8.3481501e-02f, 8.4234632e-02f, 8.4994483e-02f,
+ 8.5761116e-02f, 8.6534592e-02f, 8.7314974e-02f, 8.8102323e-02f, 8.8896702e-02f, 8.9698176e-02f, 9.0506809e-02f, 9.1322665e-02f,
+ 9.2145810e-02f, 9.2976310e-02f, 9.3814232e-02f, 9.4659643e-02f, 9.5512612e-02f, 9.6373206e-02f, 9.7241496e-02f, 9.8117550e-02f,
+ 9.9001441e-02f, 9.9893238e-02f, 1.0079301e-01f, 1.0170084e-01f, 1.0261679e-01f, 1.0354094e-01f, 1.0447337e-01f, 1.0541414e-01f,
+ 1.0636334e-01f, 1.0732104e-01f, 1.0828731e-01f, 1.0926225e-01f, 1.1024592e-01f, 1.1123841e-01f, 1.1223979e-01f, 1.1325016e-01f,
+ 1.1426958e-01f, 1.1529814e-01f, 1.1633594e-01f, 1.1738304e-01f, 1.1843954e-01f, 1.1950552e-01f, 1.2058107e-01f, 1.2166627e-01f,
+ 1.2276122e-01f, 1.2386601e-01f, 1.2498072e-01f, 1.2610544e-01f, 1.2724027e-01f, 1.2838531e-01f, 1.2954063e-01f, 1.3070635e-01f,
+ 1.3188262e-01f, 1.3306940e-01f, 1.3426686e-01f, 1.3547509e-01f, 1.3669420e-01f, 1.3792428e-01f, 1.3916544e-01f, 1.4041778e-01f,
+ 1.4168140e-01f, 1.4295640e-01f, 1.4424289e-01f, 1.4554098e-01f, 1.4685078e-01f, 1.4817238e-01f, 1.4950591e-01f, 1.5085147e-01f,
+ 1.5220916e-01f, 1.5357912e-01f, 1.5496144e-01f, 1.5635624e-01f, 1.5776364e-01f, 1.5918375e-01f, 1.6061670e-01f, 1.6206260e-01f,
+ 1.6352156e-01f, 1.6499372e-01f, 1.6647920e-01f, 1.6797811e-01f, 1.6949059e-01f, 1.7101676e-01f, 1.7255674e-01f, 1.7411067e-01f,
+ 1.7567867e-01f, 1.7726087e-01f, 1.7885742e-01f, 1.8046844e-01f, 1.8209406e-01f, 1.8373443e-01f, 1.8538967e-01f, 1.8705994e-01f,
+ 1.8874536e-01f, 1.9044608e-01f, 1.9216225e-01f, 1.9389401e-01f, 1.9564150e-01f, 1.9740486e-01f, 1.9918426e-01f, 2.0097984e-01f,
+ 2.0279175e-01f, 2.0462014e-01f, 2.0646517e-01f, 2.0832699e-01f, 2.1020577e-01f, 2.1210165e-01f, 2.1401481e-01f, 2.1594540e-01f,
+ 2.1789359e-01f, 2.1985954e-01f, 2.2184342e-01f, 2.2384540e-01f, 2.2586565e-01f, 2.2790434e-01f, 2.2996165e-01f, 2.3203774e-01f,
+ 2.3413293e-01f, 2.3624714e-01f, 2.3838068e-01f, 2.4053372e-01f, 2.4270646e-01f, 2.4489908e-01f, 2.4711177e-01f, 2.4934471e-01f,
+ 2.5159811e-01f, 2.5387214e-01f, 2.5616702e-01f, 2.5848293e-01f, 2.6082007e-01f, 2.6317866e-01f, 2.6555888e-01f, 2.6796095e-01f,
+ 2.7038507e-01f, 2.7283145e-01f, 2.7530031e-01f, 2.7779186e-01f, 2.8030631e-01f, 2.8284388e-01f, 2.8540479e-01f, 2.8798927e-01f,
+ 2.9059754e-01f, 2.9322983e-01f, 2.9588635e-01f, 2.9856736e-01f, 3.0127308e-01f, 3.0400374e-01f, 3.0675959e-01f, 3.0954086e-01f,
+ 3.1234780e-01f, 3.1518066e-01f, 3.1803969e-01f, 3.2092512e-01f, 3.2383723e-01f, 3.2677625e-01f, 3.2974246e-01f, 3.3273611e-01f,
+ 3.3575747e-01f, 3.3880680e-01f, 3.4188437e-01f, 3.4499045e-01f, 3.4812533e-01f, 3.5128926e-01f, 3.5448255e-01f, 3.5770546e-01f,
+ 3.6095828e-01f, 3.6424131e-01f, 3.6755483e-01f, 3.7089914e-01f, 3.7427454e-01f, 3.7768132e-01f, 3.8111979e-01f, 3.8459027e-01f,
+ 3.8809304e-01f, 3.9162844e-01f, 3.9519678e-01f, 3.9879837e-01f, 4.0243354e-01f, 4.0610261e-01f, 4.0980592e-01f, 4.1354380e-01f,
+ 4.1731681e-01f, 4.2112483e-01f, 4.2496844e-01f, 4.2884798e-01f, 4.3276381e-01f, 4.3671627e-01f, 4.4070572e-01f, 4.4473253e-01f,
+ 4.4879706e-01f, 4.5289968e-01f, 4.5704076e-01f, 4.6122068e-01f, 4.6543981e-01f, 4.6969854e-01f, 4.7399727e-01f, 4.7833637e-01f,
+ 4.8271625e-01f, 4.8713731e-01f, 4.9159995e-01f, 4.9610458e-01f, 5.0065162e-01f, 5.0524147e-01f, 5.0987457e-01f, 5.1455133e-01f,
+ 5.1927219e-01f, 5.2403759e-01f, 5.2884795e-01f, 5.3370373e-01f, 5.3860537e-01f, 5.4355333e-01f, 5.4854807e-01f, 5.5359004e-01f,
+ 5.5867972e-01f, 5.6381757e-01f, 5.6900408e-01f, 5.7423972e-01f, 5.7952499e-01f, 5.8486037e-01f, 5.9024637e-01f, 5.9568349e-01f,
+ 6.0117223e-01f, 6.0671311e-01f, 6.1230664e-01f, 6.1795336e-01f, 6.2365379e-01f, 6.2940847e-01f, 6.3521793e-01f, 6.4108273e-01f,
+ 6.4700342e-01f, 6.5298056e-01f, 6.5901471e-01f, 6.6510643e-01f, 6.7125632e-01f, 6.7746495e-01f, 6.8373290e-01f, 6.9006078e-01f,
+ 6.9644918e-01f, 7.0289872e-01f, 7.0941001e-01f, 7.1598366e-01f, 7.2262031e-01f, 7.2932059e-01f, 7.3608513e-01f, 7.4291460e-01f,
+ 7.4981006e-01f, 7.5677134e-01f, 7.6379952e-01f, 7.7089527e-01f, 7.7805929e-01f, 7.8529226e-01f, 7.9259489e-01f, 7.9996786e-01f,
+ 8.0741191e-01f, 8.1492774e-01f, 8.2251609e-01f, 8.3017769e-01f, 8.3791329e-01f, 8.4572364e-01f, 8.5360950e-01f, 8.6157163e-01f,
+ 8.6961082e-01f, 8.7772786e-01f, 8.8592352e-01f, 8.9419862e-01f, 9.0255397e-01f, 9.1099038e-01f, 9.1950869e-01f, 9.2810973e-01f,
+ 9.3679435e-01f, 9.4556340e-01f, 9.5441776e-01f, 9.6335829e-01f, 9.7238588e-01f, 9.8150143e-01f, 9.9070583e-01f, 1.0000000e+00f,
+ 1.0f, // extra padding to avoid out of bounds access
+};
+
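+// PQ EOTF, approximated by linear interpolation into the LUT above (input is
+// clamped to [0,1])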
+static inline float pq_eotf(float x)
+{
+ float idxf = fminf(fmaxf(x, 0.0f), 1.0f) * (PQ_LUT_SIZE - 1);
+ int ipart = floorf(idxf);
+ float fpart = idxf - ipart;
+ return PL_MIX(pq_eotf_lut[ipart], pq_eotf_lut[ipart + 1], fpart);
+}
+
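+// Exact (closed-form) inverse of the PQ EOTF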
+static inline float pq_oetf(float x)
+{
+ x = powf(fmaxf(x, 0.0f), PQ_M1);
+ x = (PQ_C1 + PQ_C2 * x) / (1.0f + PQ_C3 * x);
+ return powf(x, PQ_M2);
+}
+
+// Helper struct containing pre-computed cached values describing a gamut
+struct gamut {
+ pl_matrix3x3 lms2rgb;
+ pl_matrix3x3 rgb2lms;
+ float min_luma, max_luma; // pq
+ float min_rgb, max_rgb; // 10k normalized
+ struct ICh *peak_cache; // 1-item cache for computed peaks (per hue)
+};
+
+struct cache {
+ struct ICh src_cache;
+ struct ICh dst_cache;
+};
+
+static void get_gamuts(struct gamut *dst, struct gamut *src, struct cache *cache,
+ const struct pl_gamut_map_params *params)
+{
+ const float epsilon = 1e-6;
+ memset(cache, 0, sizeof(*cache));
+ struct gamut base = {
+ .min_luma = params->min_luma,
+ .max_luma = params->max_luma,
+ .min_rgb = pq_eotf(params->min_luma) - epsilon,
+ .max_rgb = pq_eotf(params->max_luma) + epsilon,
+ };
+
+ if (dst) {
+ *dst = base;
+ dst->lms2rgb = dst->rgb2lms = pl_ipt_rgb2lms(&params->output_gamut);
+ dst->peak_cache = &cache->dst_cache;
+ pl_matrix3x3_invert(&dst->lms2rgb);
+ }
+
+ if (src) {
+ *src = base;
+ src->lms2rgb = src->rgb2lms = pl_ipt_rgb2lms(&params->input_gamut);
+ src->peak_cache = &cache->src_cache;
+ pl_matrix3x3_invert(&src->lms2rgb);
+ }
+}
+
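+// Convert linear RGB to IPT: RGB -> LMS via the gamut's rgb2lms matrix,
+// PQ-encode each channel, then apply the fixed LMS' -> IPT matrix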
+static inline struct IPT rgb2ipt(struct RGB c, struct gamut gamut)
+{
+ const float L = gamut.rgb2lms.m[0][0] * c.R +
+ gamut.rgb2lms.m[0][1] * c.G +
+ gamut.rgb2lms.m[0][2] * c.B;
+ const float M = gamut.rgb2lms.m[1][0] * c.R +
+ gamut.rgb2lms.m[1][1] * c.G +
+ gamut.rgb2lms.m[1][2] * c.B;
+ const float S = gamut.rgb2lms.m[2][0] * c.R +
+ gamut.rgb2lms.m[2][1] * c.G +
+ gamut.rgb2lms.m[2][2] * c.B;
+ const float Lp = pq_oetf(L);
+ const float Mp = pq_oetf(M);
+ const float Sp = pq_oetf(S);
+ return (struct IPT) {
+ .I = 0.4000f * Lp + 0.4000f * Mp + 0.2000f * Sp,
+ .P = 4.4550f * Lp - 4.8510f * Mp + 0.3960f * Sp,
+ .T = 0.8056f * Lp + 0.3572f * Mp - 1.1628f * Sp,
+ };
+}
+
+static inline struct RGB ipt2rgb(struct IPT c, struct gamut gamut)
+{
+ const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T;
+ const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T;
+ const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T;
+ const float L = pq_eotf(Lp);
+ const float M = pq_eotf(Mp);
+ const float S = pq_eotf(Sp);
+ return (struct RGB) {
+ .R = gamut.lms2rgb.m[0][0] * L +
+ gamut.lms2rgb.m[0][1] * M +
+ gamut.lms2rgb.m[0][2] * S,
+ .G = gamut.lms2rgb.m[1][0] * L +
+ gamut.lms2rgb.m[1][1] * M +
+ gamut.lms2rgb.m[1][2] * S,
+ .B = gamut.lms2rgb.m[2][0] * L +
+ gamut.lms2rgb.m[2][1] * M +
+ gamut.lms2rgb.m[2][2] * S,
+ };
+}
+
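+// Test whether an IPT color lies inside the gamut: reject PQ LMS values
+// outside the legal luma range early, otherwise convert to RGB and check it
+// against the (slightly padded) [min_rgb, max_rgb] cube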
+static inline bool ingamut(struct IPT c, struct gamut gamut)
+{
+ const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T;
+ const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T;
+ const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T;
+ if (Lp < gamut.min_luma || Lp > gamut.max_luma ||
+ Mp < gamut.min_luma || Mp > gamut.max_luma ||
+ Sp < gamut.min_luma || Sp > gamut.max_luma)
+ {
+ // Early exit for values outside legal LMS range
+ return false;
+ }
+
+ const float L = pq_eotf(Lp);
+ const float M = pq_eotf(Mp);
+ const float S = pq_eotf(Sp);
+ struct RGB rgb = {
+ .R = gamut.lms2rgb.m[0][0] * L +
+ gamut.lms2rgb.m[0][1] * M +
+ gamut.lms2rgb.m[0][2] * S,
+ .G = gamut.lms2rgb.m[1][0] * L +
+ gamut.lms2rgb.m[1][1] * M +
+ gamut.lms2rgb.m[1][2] * S,
+ .B = gamut.lms2rgb.m[2][0] * L +
+ gamut.lms2rgb.m[2][1] * M +
+ gamut.lms2rgb.m[2][2] * S,
+ };
+ return rgb.R >= gamut.min_rgb && rgb.R <= gamut.max_rgb &&
+ rgb.G >= gamut.min_rgb && rgb.G <= gamut.max_rgb &&
+ rgb.B >= gamut.min_rgb && rgb.B <= gamut.max_rgb;
+}
+
+struct generate_args {
+ const struct pl_gamut_map_params *params;
+ float *out;
+ int start;
+ int count;
+};
+
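+// Worker thread: fills its slice of hue planes with the IPT coordinates of a
+// regular I/C/h grid spanning [min_luma, max_luma] x [0, 0.5] x [-pi, pi],
+// then runs the configured mapping function over that slice in place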
+static PL_THREAD_VOID generate(void *priv)
+{
+ const struct generate_args *args = priv;
+ const struct pl_gamut_map_params *params = args->params;
+
+ float *in = args->out;
+ const int end = args->start + args->count;
+ for (int h = args->start; h < end; h++) {
+ for (int C = 0; C < params->lut_size_C; C++) {
+ for (int I = 0; I < params->lut_size_I; I++) {
+ float Ix = (float) I / (params->lut_size_I - 1);
+ float Cx = (float) C / (params->lut_size_C - 1);
+ float hx = (float) h / (params->lut_size_h - 1);
+ struct IPT ipt = ich2ipt((struct ICh) {
+ .I = PL_MIX(params->min_luma, params->max_luma, Ix),
+ .C = PL_MIX(0.0f, 0.5f, Cx),
+ .h = PL_MIX(-M_PI, M_PI, hx),
+ });
+ in[0] = ipt.I;
+ in[1] = ipt.P;
+ in[2] = ipt.T;
+ in += params->lut_stride;
+ }
+ }
+ }
+
+ struct pl_gamut_map_params fixed = *params;
+ fix_constants(&fixed.constants);
+ fixed.lut_size_h = args->count;
+ FUN(params).map(args->out, &fixed);
+ PL_THREAD_RETURN();
+}
+
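+// Split LUT generation across up to MAX_WORKERS threads (one contiguous range
+// of hue planes per worker), falling back to running the work synchronously
+// if thread creation or joining fails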
+void pl_gamut_map_generate(float *out, const struct pl_gamut_map_params *params)
+{
+ enum { MAX_WORKERS = 32 };
+ struct generate_args args[MAX_WORKERS];
+
+ const int num_per_worker = PL_DIV_UP(params->lut_size_h, MAX_WORKERS);
+ const int num_workers = PL_DIV_UP(params->lut_size_h, num_per_worker);
+ for (int i = 0; i < num_workers; i++) {
+ const int start = i * num_per_worker;
+ const int count = PL_MIN(num_per_worker, params->lut_size_h - start);
+ args[i] = (struct generate_args) {
+ .params = params,
+ .out = out,
+ .start = start,
+ .count = count,
+ };
+ out += count * params->lut_size_C * params->lut_size_I * params->lut_stride;
+ }
+
+ pl_thread workers[MAX_WORKERS] = {0};
+ for (int i = 0; i < num_workers; i++) {
+ if (pl_thread_create(&workers[i], generate, &args[i]) != 0)
+ generate(&args[i]); // fallback
+ }
+
+ for (int i = 0; i < num_workers; i++) {
+ if (!workers[i])
+ continue;
+ if (pl_thread_join(workers[i]) != 0)
+ generate(&args[i]); // fallback
+ }
+}
+
+void pl_gamut_map_sample(float x[3], const struct pl_gamut_map_params *params)
+{
+ struct pl_gamut_map_params fixed = *params;
+ fix_constants(&fixed.constants);
+ fixed.lut_size_I = fixed.lut_size_C = fixed.lut_size_h = 1;
+ fixed.lut_stride = 3;
+
+ FUN(params).map(x, &fixed);
+}
+
+#define LUT_SIZE(p) (p->lut_size_I * p->lut_size_C * p->lut_size_h * p->lut_stride)
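+// Iterates in-place over every IPT entry in the LUT: each entry is loaded
+// into the variable `C`, the loop body may modify it, and the result is
+// written back before advancing by `lut_stride` floats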
+#define FOREACH_LUT(lut, C) \
+ for (struct IPT *_i = (struct IPT *) lut, \
+ *_end = (struct IPT *) (lut + LUT_SIZE(params)), \
+ C; \
+ _i < _end && ( C = *_i, 1 ); \
+ *_i = C, _i = (struct IPT *) ((float *) _i + params->lut_stride))
+
+// Like PL_MIX(base, c, x), but following an exponential curve. Note that
+// this can also be used to extend 'c' outwards for x > 1
+static inline struct ICh mix_exp(struct ICh c, float x, float gamma, float base)
+{
+ return (struct ICh) {
+ .I = base + (c.I - base) * powf(x, gamma),
+ .C = c.C * x,
+ .h = c.h,
+ };
+}
+
+// Drop gamma for colors approaching black or the achromatic axis, to avoid
+// numerical instabilities and excessive brightness boosting of grain, while
+// strongly boosting gamma for values exceeding the target peak
+static inline float scale_gamma(float gamma, struct ICh ich, struct ICh peak,
+ struct gamut gamut)
+{
+ const float Imin = gamut.min_luma;
+ const float Irel = fmaxf((ich.I - Imin) / (peak.I - Imin), 0.0f);
+ return gamma * powf(Irel, 3) * fminf(ich.C / peak.C, 1.0f);
+}
+
+static const float maxDelta = 5e-5f;
+
+// Find gamut intersection using specified bounds
+static inline struct ICh
+desat_bounded(float I, float h, float Cmin, float Cmax, struct gamut gamut)
+{
+ if (I <= gamut.min_luma)
+ return (struct ICh) { .I = gamut.min_luma, .C = 0, .h = h };
+ if (I >= gamut.max_luma)
+ return (struct ICh) { .I = gamut.max_luma, .C = 0, .h = h };
+
+ const float maxDI = I * maxDelta;
+ struct ICh res = { .I = I, .C = (Cmin + Cmax) / 2, .h = h };
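+ // Bisect the chroma interval [Cmin, Cmax] onto the gamut boundary, until
+ // the bracket is narrower than maxDI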
+ do {
+ if (ingamut(ich2ipt(res), gamut)) {
+ Cmin = res.C;
+ } else {
+ Cmax = res.C;
+ }
+ res.C = (Cmin + Cmax) / 2;
+ } while (Cmax - Cmin > maxDI);
+
+ return res;
+}
+
+// Finds maximally saturated in-gamut color (for given hue)
+static inline struct ICh saturate(float hue, struct gamut gamut)
+{
+ if (gamut.peak_cache->I && fabsf(gamut.peak_cache->h - hue) < 1e-3)
+ return *gamut.peak_cache;
+
+ static const float invphi = 0.6180339887498948f;
+ static const float invphi2 = 0.38196601125010515f;
+
+ struct ICh lo = { .I = gamut.min_luma, .h = hue };
+ struct ICh hi = { .I = gamut.max_luma, .h = hue };
+ float de = hi.I - lo.I;
+ struct ICh a = { .I = lo.I + invphi2 * de };
+ struct ICh b = { .I = lo.I + invphi * de };
+ a = desat_bounded(a.I, hue, 0.0f, 0.5f, gamut);
+ b = desat_bounded(b.I, hue, 0.0f, 0.5f, gamut);
+
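+ // Golden-section search over luminance for the boundary point of maximum
+ // chroma at this hue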
+ while (de > maxDelta) {
+ de *= invphi;
+ if (a.C > b.C) {
+ hi = b;
+ b = a;
+ a.I = lo.I + invphi2 * de;
+ a = desat_bounded(a.I, hue, lo.C - maxDelta, 0.5f, gamut);
+ } else {
+ lo = a;
+ a = b;
+ b.I = lo.I + invphi * de;
+ b = desat_bounded(b.I, hue, hi.C - maxDelta, 0.5f, gamut);
+ }
+ }
+
+ struct ICh peak = a.C > b.C ? a : b;
+ *gamut.peak_cache = peak;
+ return peak;
+}
+
+// Clip a color along the exponential curve given by `gamma`
+static inline struct IPT
+clip_gamma(struct IPT ipt, float gamma, struct gamut gamut)
+{
+ if (ipt.I <= gamut.min_luma)
+ return (struct IPT) { .I = gamut.min_luma };
+ if (ingamut(ipt, gamut))
+ return ipt;
+
+ struct ICh ich = ipt2ich(ipt);
+ if (!gamma)
+ return ich2ipt(desat_bounded(ich.I, ich.h, 0.0f, ich.C, gamut));
+
+ const float maxDI = fmaxf(ich.I * maxDelta, 1e-7f);
+ struct ICh peak = saturate(ich.h, gamut);
+ gamma = scale_gamma(gamma, ich, peak, gamut);
+ float lo = 0.0f, hi = 1.0f, x = 0.5f;
+ do {
+ struct ICh test = mix_exp(ich, x, gamma, peak.I);
+ if (ingamut(ich2ipt(test), gamut)) {
+ lo = x;
+ } else {
+ hi = x;
+ }
+ x = (lo + hi) / 2.0f;
+ } while (hi - lo > maxDI);
+
+ return ich2ipt(mix_exp(ich, x, gamma, peak.I));
+}
+
+static float softclip(float value, float source, float target,
+ const struct pl_gamut_map_constants *c)
+{
+ if (!target)
+ return 0.0f;
+ const float peak = source / target;
+ const float x = fminf(value / target, peak);
+ const float j = c->softclip_knee;
+ if (x <= j || peak <= 1.0)
+ return value;
+ // Apply simple mobius function
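+ // The coefficients a and b are chosen so that the curve matches the input
+ // in value and slope at the knee j, and maps `peak` to 1.0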
+ const float a = -j*j * (peak - 1.0f) / (j*j - 2.0f * j + peak);
+ const float b = (j*j - 2.0f * j * peak + peak) /
+ fmaxf(1e-6f, peak - 1.0f);
+ const float scale = (b*b + 2.0f * b*j + j*j) / (b - a);
+ return scale * (x + a) / (x + b) * target;
+}
+
+static int cmp_float(const void *a, const void *b)
+{
+ float fa = *(const float*) a;
+ float fb = *(const float*) b;
+ return PL_CMP(fa, fb);
+}
+
+static float wrap(float h)
+{
+ if (h > M_PI) {
+ return h - 2 * M_PI;
+ } else if (h < -M_PI) {
+ return h + 2 * M_PI;
+ } else {
+ return h;
+ }
+}
+
+enum {
+ S = 12, // number of hue shift vertices
+ N = S + 2, // +2 for the endpoints
+};
+
+// Hue-shift helper struct
+struct hueshift {
+ float dh[N];
+ float dddh[N];
+ float K[N];
+ float prev_hue;
+ float prev_shift;
+ struct { float hue, delta; } hueshift[N];
+};
+
+static void hueshift_prepare(struct hueshift *s, struct gamut src, struct gamut dst)
+{
+ const float O = pq_eotf(src.min_luma), X = pq_eotf(src.max_luma);
+ const float M = (O + X) / 2.0f;
+ const struct RGB refpoints[S] = {
+ {X, O, O}, {O, X, O}, {O, O, X},
+ {O, X, X}, {X, O, X}, {X, X, O},
+ {O, X, M}, {X, O, M}, {X, M, O},
+ {O, M, X}, {M, O, X}, {M, X, O},
+ };
+
+ memset(s, 0, sizeof(*s));
+ for (int i = 0; i < S; i++) {
+ struct ICh ich_src = ipt2ich(rgb2ipt(refpoints[i], src));
+ struct ICh ich_dst = ipt2ich(rgb2ipt(refpoints[i], dst));
+ const float delta = wrap(ich_dst.h - ich_src.h);
+ s->hueshift[i+1].hue = ich_src.h;
+ s->hueshift[i+1].delta = delta;
+ }
+
+ // Sort and wrap endpoints
+ qsort(s->hueshift + 1, S, sizeof(*s->hueshift), cmp_float);
+ s->hueshift[0] = s->hueshift[S];
+ s->hueshift[S+1] = s->hueshift[1];
+ s->hueshift[0].hue -= 2 * M_PI;
+ s->hueshift[S+1].hue += 2 * M_PI;
+
+ // Construction of cubic spline coefficients
+ float tmp[N][N] = {0};
+ for (int i = N - 1; i > 0; i--) {
+ s->dh[i-1] = s->hueshift[i].hue - s->hueshift[i-1].hue;
+ s->dddh[i] = (s->hueshift[i].delta - s->hueshift[i-1].delta) / s->dh[i-1];
+ }
+ for (int i = 1; i < N - 1; i++) {
+ tmp[i][i] = 2 * (s->dh[i-1] + s->dh[i]);
+ if (i != 1)
+ tmp[i][i-1] = tmp[i-1][i] = s->dh[i-1];
+ tmp[i][N-1] = 6 * (s->dddh[i+1] - s->dddh[i]);
+ }
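+ // Forward elimination on the tridiagonal system (augmented column N-1
+ // holds the right-hand side)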
+ for (int i = 1; i < N - 2; i++) {
+ const float q = (tmp[i+1][i] / tmp[i][i]);
+ for (int j = 1; j <= N - 1; j++)
+ tmp[i+1][j] -= q * tmp[i][j];
+ }
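+ // Back-substitution to solve for the spline's second-derivative
+ // coefficients K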
+ for (int i = N - 2; i > 0; i--) {
+ float sum = 0.0f;
+ for (int j = i; j <= N - 2; j++)
+ sum += tmp[i][j] * s->K[j];
+ s->K[i] = (tmp[i][N-1] - sum) / tmp[i][i];
+ }
+
+ s->prev_hue = -10.0f;
+}
+
+static struct ICh hueshift_apply(struct hueshift *s, struct ICh ich)
+{
+ if (fabsf(ich.h - s->prev_hue) < 1e-6f)
+ goto done;
+
+ // Determine perceptual hue shift delta by interpolation of refpoints
+ for (int i = 0; i < N - 1; i++) {
+ if (s->hueshift[i+1].hue > ich.h) {
+ pl_assert(s->hueshift[i].hue <= ich.h);
+ float a = (s->K[i+1] - s->K[i]) / (6 * s->dh[i]);
+ float b = s->K[i] / 2;
+ float c = s->dddh[i+1] - (2 * s->dh[i] * s->K[i] + s->K[i+1] * s->dh[i]) / 6;
+ float d = s->hueshift[i].delta;
+ float x = ich.h - s->hueshift[i].hue;
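+ // Evaluate the cubic spline segment in Horner form to obtain the
+ // hue shift delta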
+ float delta = ((a * x + b) * x + c) * x + d;
+ s->prev_shift = ich.h + delta;
+ s->prev_hue = ich.h;
+ break;
+ }
+ }
+
+done:
+ return (struct ICh) {
+ .I = ich.I,
+ .C = ich.C,
+ .h = s->prev_shift,
+ };
+}
+
+static void perceptual(float *lut, const struct pl_gamut_map_params *params)
+{
+ const struct pl_gamut_map_constants *c = &params->constants;
+ struct cache cache;
+ struct gamut dst, src;
+ get_gamuts(&dst, &src, &cache, params);
+
+ FOREACH_LUT(lut, ipt) {
+ struct ICh ich = ipt2ich(ipt);
+ struct ICh src_peak = saturate(ich.h, src);
+ struct ICh dst_peak = saturate(ich.h, dst);
+ struct IPT mapped = rgb2ipt(ipt2rgb(ipt, src), dst);
+
+ // Protect in gamut region
+ const float maxC = fmaxf(src_peak.C, dst_peak.C);
+ float k = pl_smoothstep(c->perceptual_deadzone, 1.0f, ich.C / maxC);
+ k *= c->perceptual_strength;
+ ipt.I = PL_MIX(ipt.I, mapped.I, k);
+ ipt.P = PL_MIX(ipt.P, mapped.P, k);
+ ipt.T = PL_MIX(ipt.T, mapped.T, k);
+
+ struct RGB rgb = ipt2rgb(ipt, dst);
+ const float maxRGB = fmaxf(rgb.R, fmaxf(rgb.G, rgb.B));
+ rgb.R = fmaxf(softclip(rgb.R, maxRGB, dst.max_rgb, c), dst.min_rgb);
+ rgb.G = fmaxf(softclip(rgb.G, maxRGB, dst.max_rgb, c), dst.min_rgb);
+ rgb.B = fmaxf(softclip(rgb.B, maxRGB, dst.max_rgb, c), dst.min_rgb);
+ ipt = rgb2ipt(rgb, dst);
+ }
+}
+
+const struct pl_gamut_map_function pl_gamut_map_perceptual = {
+ .name = "perceptual",
+ .description = "Perceptual mapping",
+ .bidirectional = true,
+ .map = perceptual,
+};
+
+static void softclip_map(float *lut, const struct pl_gamut_map_params *params)
+{
+ const struct pl_gamut_map_constants *c = &params->constants;
+
+ // Use a separate gamut cache after the hue shift, since shifting the hue
+ // invalidates the previously cached peak
+ struct cache cache_pre, cache_post;
+ struct gamut dst_pre, src_pre, src_post, dst_post;
+ struct hueshift hueshift;
+ get_gamuts(&dst_pre, &src_pre, &cache_pre, params);
+ get_gamuts(&dst_post, &src_post, &cache_post, params);
+ hueshift_prepare(&hueshift, src_pre, dst_pre);
+
+ FOREACH_LUT(lut, ipt) {
+ struct gamut src = src_pre;
+ struct gamut dst = dst_pre;
+
+ if (ipt.I <= dst.min_luma) {
+ ipt.P = ipt.T = 0.0f;
+ continue;
+ }
+
+ struct ICh ich = ipt2ich(ipt);
+ if (ich.C <= 1e-2f)
+ continue; // Fast path for achromatic colors
+
+ float margin = 1.0f;
+ struct ICh shifted = hueshift_apply(&hueshift, ich);
+ if (fabsf(shifted.h - ich.h) >= 1e-3f) {
+ struct ICh src_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, src);
+ struct ICh dst_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, dst);
+ const float k = pl_smoothstep(dst_border.C * c->softclip_knee,
+ src_border.C, ich.C);
+ ich.h = PL_MIX(ich.h, shifted.h, k);
+ src = src_post;
+ dst = dst_post;
+
+ // Expand/contract chromaticity margin to correspond to the altered
+ // size of the hue leaf after applying the hue delta
+ struct ICh shift_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, src);
+ margin *= fmaxf(1.0f, src_border.C / shift_border.C);
+ }
+
+ // Determine intersections with source and target gamuts, and
+ // apply softclip to the chromaticity
+ struct ICh source = saturate(ich.h, src);
+ struct ICh target = saturate(ich.h, dst);
+ struct ICh border = desat_bounded(ich.I, ich.h, 0.0f, target.C, dst);
+ const float chromaticity = PL_MIX(target.C, border.C, c->softclip_desat);
+ ich.C = softclip(ich.C, margin * source.C, chromaticity, c);
+
+ // Soft-clip the resulting RGB color. This will generally distort
+ // hues slightly, but hopefully in an aesthetically pleasing way.
+ struct ICh saturated = { ich.I, chromaticity, ich.h };
+ struct RGB peak = ipt2rgb(ich2ipt(saturated), dst);
+ struct RGB rgb = ipt2rgb(ich2ipt(ich), dst);
+ rgb.R = fmaxf(softclip(rgb.R, peak.R, dst.max_rgb, c), dst.min_rgb);
+ rgb.G = fmaxf(softclip(rgb.G, peak.G, dst.max_rgb, c), dst.min_rgb);
+ rgb.B = fmaxf(softclip(rgb.B, peak.B, dst.max_rgb, c), dst.min_rgb);
+ ipt = rgb2ipt(rgb, dst);
+ }
+}
+
+const struct pl_gamut_map_function pl_gamut_map_softclip = {
+ .name = "softclip",
+ .description = "Soft clipping",
+ .map = softclip_map,
+};
+
+static void relative(float *lut, const struct pl_gamut_map_params *params)
+{
+ const struct pl_gamut_map_constants *c = &params->constants;
+ struct cache cache;
+ struct gamut dst;
+ get_gamuts(&dst, NULL, &cache, params);
+
+ FOREACH_LUT(lut, ipt)
+ ipt = clip_gamma(ipt, c->colorimetric_gamma, dst);
+}
+
+const struct pl_gamut_map_function pl_gamut_map_relative = {
+ .name = "relative",
+ .description = "Colorimetric clip",
+ .map = relative,
+};
+
+static void desaturate(float *lut, const struct pl_gamut_map_params *params)
+{
+ struct cache cache;
+ struct gamut dst;
+ get_gamuts(&dst, NULL, &cache, params);
+
+ FOREACH_LUT(lut, ipt)
+ ipt = clip_gamma(ipt, 0.0f, dst);
+}
+
+const struct pl_gamut_map_function pl_gamut_map_desaturate = {
+ .name = "desaturate",
+ .description = "Desaturating clip",
+ .map = desaturate,
+};
+
+static void saturation(float *lut, const struct pl_gamut_map_params *params)
+{
+ struct cache cache;
+ struct gamut dst, src;
+ get_gamuts(&dst, &src, &cache, params);
+
+ FOREACH_LUT(lut, ipt)
+ ipt = rgb2ipt(ipt2rgb(ipt, src), dst);
+}
+
+const struct pl_gamut_map_function pl_gamut_map_saturation = {
+ .name = "saturation",
+ .description = "Saturation mapping",
+ .bidirectional = true,
+ .map = saturation,
+};
+
+static void absolute(float *lut, const struct pl_gamut_map_params *params)
+{
+ const struct pl_gamut_map_constants *c = &params->constants;
+ struct cache cache;
+ struct gamut dst;
+ get_gamuts(&dst, NULL, &cache, params);
+ pl_matrix3x3 m = pl_get_adaptation_matrix(params->output_gamut.white,
+ params->input_gamut.white);
+
+ FOREACH_LUT(lut, ipt) {
+ struct RGB rgb = ipt2rgb(ipt, dst);
+ pl_matrix3x3_apply(&m, (float *) &rgb);
+ ipt = rgb2ipt(rgb, dst);
+ ipt = clip_gamma(ipt, c->colorimetric_gamma, dst);
+ }
+}
+
+const struct pl_gamut_map_function pl_gamut_map_absolute = {
+ .name = "absolute",
+ .description = "Absolute colorimetric clip",
+ .map = absolute,
+};
+
+static void highlight(float *lut, const struct pl_gamut_map_params *params)
+{
+ struct cache cache;
+ struct gamut dst;
+ get_gamuts(&dst, NULL, &cache, params);
+
+ FOREACH_LUT(lut, ipt) {
+ if (!ingamut(ipt, dst)) {
+ ipt.I = fminf(ipt.I + 0.1f, 1.0f);
+ ipt.P = fclampf(-1.2f * ipt.P, -0.5f, 0.5f);
+ ipt.T = fclampf(-1.2f * ipt.T, -0.5f, 0.5f);
+ }
+ }
+}
+
+const struct pl_gamut_map_function pl_gamut_map_highlight = {
+ .name = "highlight",
+ .description = "Highlight out-of-gamut pixels",
+ .map = highlight,
+};
+
+static void linear(float *lut, const struct pl_gamut_map_params *params)
+{
+ struct cache cache;
+ struct gamut dst, src;
+ get_gamuts(&dst, &src, &cache, params);
+
+ float gain = 1.0f;
+ for (float hue = -M_PI; hue < M_PI; hue += 0.1f)
+ gain = fminf(gain, saturate(hue, dst).C / saturate(hue, src).C);
+
+ FOREACH_LUT(lut, ipt) {
+ struct ICh ich = ipt2ich(ipt);
+ ich.C *= gain;
+ ipt = ich2ipt(ich);
+ }
+}
+
+const struct pl_gamut_map_function pl_gamut_map_linear = {
+ .name = "linear",
+ .description = "Linear desaturate",
+ .map = linear,
+};
+
+static void darken(float *lut, const struct pl_gamut_map_params *params)
+{
+ const struct pl_gamut_map_constants *c = &params->constants;
+ struct cache cache;
+ struct gamut dst, src;
+ get_gamuts(&dst, &src, &cache, params);
+
+ static const struct RGB points[6] = {
+ {1, 0, 0}, {0, 1, 0}, {0, 0, 1},
+ {0, 1, 1}, {1, 0, 1}, {1, 1, 0},
+ };
+
+ float gain = 1.0f;
+ for (int i = 0; i < PL_ARRAY_SIZE(points); i++) {
+ const struct RGB p = ipt2rgb(rgb2ipt(points[i], src), dst);
+ const float maxRGB = PL_MAX3(p.R, p.G, p.B);
+ gain = fminf(gain, 1.0 / maxRGB);
+ }
+
+ FOREACH_LUT(lut, ipt) {
+ struct RGB rgb = ipt2rgb(ipt, dst);
+ rgb.R *= gain;
+ rgb.G *= gain;
+ rgb.B *= gain;
+ ipt = rgb2ipt(rgb, dst);
+ ipt = clip_gamma(ipt, c->colorimetric_gamma, dst);
+ }
+}
+
+const struct pl_gamut_map_function pl_gamut_map_darken = {
+ .name = "darken",
+ .description = "Darken and clip",
+ .map = darken,
+};
+
+static void noop(float *lut, const struct pl_gamut_map_params *params)
+{
+ return;
+}
+
+const struct pl_gamut_map_function pl_gamut_map_clip = {
+ .name = "clip",
+ .description = "No gamut mapping (hard clip)",
+ .map = noop,
+};
+
+const struct pl_gamut_map_function * const pl_gamut_map_functions[] = {
+ &pl_gamut_map_clip,
+ &pl_gamut_map_perceptual,
+ &pl_gamut_map_softclip,
+ &pl_gamut_map_relative,
+ &pl_gamut_map_saturation,
+ &pl_gamut_map_absolute,
+ &pl_gamut_map_desaturate,
+ &pl_gamut_map_darken,
+ &pl_gamut_map_highlight,
+ &pl_gamut_map_linear,
+ NULL
+};
+
+const int pl_num_gamut_map_functions = PL_ARRAY_SIZE(pl_gamut_map_functions) - 1;
+
+const struct pl_gamut_map_function *pl_find_gamut_map_function(const char *name)
+{
+ for (int i = 0; i < pl_num_gamut_map_functions; i++) {
+ if (strcmp(name, pl_gamut_map_functions[i]->name) == 0)
+ return pl_gamut_map_functions[i];
+ }
+
+ return NULL;
+}
diff --git a/src/glsl/glslang.cc b/src/glsl/glslang.cc
new file mode 100644
index 0000000..2bc923c
--- /dev/null
+++ b/src/glsl/glslang.cc
@@ -0,0 +1,121 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config_internal.h"
+
+#include <assert.h>
+
+extern "C" {
+#include "pl_alloc.h"
+#include "pl_thread.h"
+}
+
+#include <glslang/Public/ShaderLang.h>
+#include <glslang/SPIRV/GlslangToSpv.h>
+#include <glslang/build_info.h>
+
+#include "glslang.h"
+
+#if (GLSLANG_VERSION_MAJOR * 1000 + GLSLANG_VERSION_MINOR) >= 11013
+#include <glslang/Public/ResourceLimits.h>
+#define DefaultTBuiltInResource *GetDefaultResources()
+#endif
+
+using namespace glslang;
+
+static pl_static_mutex pl_glslang_mutex = PL_STATIC_MUTEX_INITIALIZER;
+static int pl_glslang_refcount;
+
+bool pl_glslang_init(void)
+{
+ bool ret = true;
+
+ pl_static_mutex_lock(&pl_glslang_mutex);
+ if (pl_glslang_refcount++ == 0)
+ ret = InitializeProcess();
+ pl_static_mutex_unlock(&pl_glslang_mutex);
+
+ return ret;
+}
+
+void pl_glslang_uninit(void)
+{
+ pl_static_mutex_lock(&pl_glslang_mutex);
+ if (--pl_glslang_refcount == 0)
+ FinalizeProcess();
+ pl_static_mutex_unlock(&pl_glslang_mutex);
+}
+
+struct pl_glslang_res *pl_glslang_compile(struct pl_glsl_version glsl_ver,
+ struct pl_spirv_version spirv_ver,
+ enum glsl_shader_stage stage,
+ const char *text)
+{
+ assert(pl_glslang_refcount);
+ struct pl_glslang_res *res = pl_zalloc_ptr(NULL, res);
+
+ EShLanguage lang;
+ switch (stage) {
+ case GLSL_SHADER_VERTEX: lang = EShLangVertex; break;
+ case GLSL_SHADER_FRAGMENT: lang = EShLangFragment; break;
+ case GLSL_SHADER_COMPUTE: lang = EShLangCompute; break;
+ default: abort();
+ }
+
+ TShader *shader = new TShader(lang);
+
+ shader->setEnvClient(EShClientVulkan, (EShTargetClientVersion) spirv_ver.env_version);
+ shader->setEnvTarget(EShTargetSpv, (EShTargetLanguageVersion) spirv_ver.spv_version);
+ shader->setStrings(&text, 1);
+
+ TBuiltInResource limits = DefaultTBuiltInResource;
+ limits.maxComputeWorkGroupSizeX = glsl_ver.max_group_size[0];
+ limits.maxComputeWorkGroupSizeY = glsl_ver.max_group_size[1];
+ limits.maxComputeWorkGroupSizeZ = glsl_ver.max_group_size[2];
+ limits.minProgramTexelOffset = glsl_ver.min_gather_offset;
+ limits.maxProgramTexelOffset = glsl_ver.max_gather_offset;
+
+ if (!shader->parse(&limits, 0, true, EShMsgDefault)) {
+ res->error_msg = pl_str0dup0(res, shader->getInfoLog());
+ delete shader;
+ return res;
+ }
+
+ TProgram *prog = new TProgram();
+ prog->addShader(shader);
+ if (!prog->link(EShMsgDefault)) {
+ res->error_msg = pl_str0dup0(res, prog->getInfoLog());
+ delete shader;
+ delete prog;
+ return res;
+ }
+
+ SpvOptions options;
+ options.disableOptimizer = false;
+ options.stripDebugInfo = true;
+ options.optimizeSize = true;
+ options.validate = true;
+ std::vector<unsigned int> spirv;
+ GlslangToSpv(*prog->getIntermediate(lang), spirv, &options);
+
+ res->success = true;
+ res->size = spirv.size() * sizeof(unsigned int);
+ res->data = pl_memdup(res, spirv.data(), res->size);
+ delete shader;
+ delete prog;
+ return res;
+}
diff --git a/src/glsl/glslang.h b/src/glsl/glslang.h
new file mode 100644
index 0000000..a5965a5
--- /dev/null
+++ b/src/glsl/glslang.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+typedef struct TLimits TLimits;
+typedef struct TBuiltInResource TBuiltInResource;
+#include <glslang/Include/ResourceLimits.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "utils.h"
+
+bool pl_glslang_init(void);
+void pl_glslang_uninit(void);
+
+struct pl_glslang_res {
+ // Compilation status
+ bool success;
+ const char *error_msg;
+
+ // Compiled shader memory, or NULL
+ void *data;
+ size_t size;
+};
+
+// Compile GLSL into a SPIRV stream, if possible. The resulting
+// pl_glslang_res can simply be freed with pl_free() when done.
+struct pl_glslang_res *pl_glslang_compile(struct pl_glsl_version glsl_ver,
+ struct pl_spirv_version spirv_ver,
+ enum glsl_shader_stage stage,
+ const char *shader);
+
+extern const TBuiltInResource DefaultTBuiltInResource;
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/glsl/glslang_resources.c b/src/glsl/glslang_resources.c
new file mode 100644
index 0000000..a111c15
--- /dev/null
+++ b/src/glsl/glslang_resources.c
@@ -0,0 +1,132 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "glslang.h"
+
+// Taken from glslang's examples, which apparently base the choices on
+// OpenGL specification limits
+//
+// Note: This lives in a separate file so we can compile this struct using C99
+// designated initializers instead of using C++ struct initializers, because
+// the latter will break on every upstream struct extension.
+const TBuiltInResource DefaultTBuiltInResource = {
+ .maxLights = 32,
+ .maxClipPlanes = 6,
+ .maxTextureUnits = 32,
+ .maxTextureCoords = 32,
+ .maxVertexAttribs = 64,
+ .maxVertexUniformComponents = 4096,
+ .maxVaryingFloats = 64,
+ .maxVertexTextureImageUnits = 32,
+ .maxCombinedTextureImageUnits = 80,
+ .maxTextureImageUnits = 32,
+ .maxFragmentUniformComponents = 4096,
+ .maxDrawBuffers = 32,
+ .maxVertexUniformVectors = 128,
+ .maxVaryingVectors = 8,
+ .maxFragmentUniformVectors = 16,
+ .maxVertexOutputVectors = 16,
+ .maxFragmentInputVectors = 15,
+ .minProgramTexelOffset = -8,
+ .maxProgramTexelOffset = 7,
+ .maxClipDistances = 8,
+ .maxComputeWorkGroupCountX = 65535,
+ .maxComputeWorkGroupCountY = 65535,
+ .maxComputeWorkGroupCountZ = 65535,
+ .maxComputeWorkGroupSizeX = 1024,
+ .maxComputeWorkGroupSizeY = 1024,
+ .maxComputeWorkGroupSizeZ = 64,
+ .maxComputeUniformComponents = 1024,
+ .maxComputeTextureImageUnits = 16,
+ .maxComputeImageUniforms = 8,
+ .maxComputeAtomicCounters = 8,
+ .maxComputeAtomicCounterBuffers = 1,
+ .maxVaryingComponents = 60,
+ .maxVertexOutputComponents = 64,
+ .maxGeometryInputComponents = 64,
+ .maxGeometryOutputComponents = 128,
+ .maxFragmentInputComponents = 128,
+ .maxImageUnits = 8,
+ .maxCombinedImageUnitsAndFragmentOutputs = 8,
+ .maxCombinedShaderOutputResources = 8,
+ .maxImageSamples = 0,
+ .maxVertexImageUniforms = 0,
+ .maxTessControlImageUniforms = 0,
+ .maxTessEvaluationImageUniforms = 0,
+ .maxGeometryImageUniforms = 0,
+ .maxFragmentImageUniforms = 8,
+ .maxCombinedImageUniforms = 8,
+ .maxGeometryTextureImageUnits = 16,
+ .maxGeometryOutputVertices = 256,
+ .maxGeometryTotalOutputComponents = 1024,
+ .maxGeometryUniformComponents = 1024,
+ .maxGeometryVaryingComponents = 64,
+ .maxTessControlInputComponents = 128,
+ .maxTessControlOutputComponents = 128,
+ .maxTessControlTextureImageUnits = 16,
+ .maxTessControlUniformComponents = 1024,
+ .maxTessControlTotalOutputComponents = 4096,
+ .maxTessEvaluationInputComponents = 128,
+ .maxTessEvaluationOutputComponents = 128,
+ .maxTessEvaluationTextureImageUnits = 16,
+ .maxTessEvaluationUniformComponents = 1024,
+ .maxTessPatchComponents = 120,
+ .maxPatchVertices = 32,
+ .maxTessGenLevel = 64,
+ .maxViewports = 16,
+ .maxVertexAtomicCounters = 0,
+ .maxTessControlAtomicCounters = 0,
+ .maxTessEvaluationAtomicCounters = 0,
+ .maxGeometryAtomicCounters = 0,
+ .maxFragmentAtomicCounters = 8,
+ .maxCombinedAtomicCounters = 8,
+ .maxAtomicCounterBindings = 1,
+ .maxVertexAtomicCounterBuffers = 0,
+ .maxTessControlAtomicCounterBuffers = 0,
+ .maxTessEvaluationAtomicCounterBuffers = 0,
+ .maxGeometryAtomicCounterBuffers = 0,
+ .maxFragmentAtomicCounterBuffers = 1,
+ .maxCombinedAtomicCounterBuffers = 1,
+ .maxAtomicCounterBufferSize = 16384,
+ .maxTransformFeedbackBuffers = 4,
+ .maxTransformFeedbackInterleavedComponents = 64,
+ .maxCullDistances = 8,
+ .maxCombinedClipAndCullDistances = 8,
+ .maxSamples = 4,
+ .maxMeshOutputVerticesNV = 256,
+ .maxMeshOutputPrimitivesNV = 512,
+ .maxMeshWorkGroupSizeX_NV = 32,
+ .maxMeshWorkGroupSizeY_NV = 1,
+ .maxMeshWorkGroupSizeZ_NV = 1,
+ .maxTaskWorkGroupSizeX_NV = 32,
+ .maxTaskWorkGroupSizeY_NV = 1,
+ .maxTaskWorkGroupSizeZ_NV = 1,
+ .maxMeshViewCountNV = 4,
+ .maxDualSourceDrawBuffersEXT = 1,
+
+ .limits = {
+ .nonInductiveForLoops = 1,
+ .whileLoops = 1,
+ .doWhileLoops = 1,
+ .generalUniformIndexing = 1,
+ .generalAttributeMatrixVectorIndexing = 1,
+ .generalVaryingIndexing = 1,
+ .generalSamplerIndexing = 1,
+ .generalVariableIndexing = 1,
+ .generalConstantMatrixVectorIndexing = 1,
+ },
+};
diff --git a/src/glsl/meson.build b/src/glsl/meson.build
new file mode 100644
index 0000000..5cebfb8
--- /dev/null
+++ b/src/glsl/meson.build
@@ -0,0 +1,73 @@
+# shaderc
+shaderc = dependency('shaderc', version: '>=2019.1', required: get_option('shaderc'))
+components.set('shaderc', shaderc.found())
+if shaderc.found()
+ build_deps += shaderc
+ sources += 'glsl/spirv_shaderc.c'
+endif
+
+# glslang
+glslang = disabler()
+glslang_req = get_option('glslang')
+if glslang_req.auto() and shaderc.found()
+
+ # we only need one or the other, and shaderc is preferred
+ message('Skipping `glslang` because `shaderc` is available')
+
+elif not glslang_req.disabled()
+
+ glslang_deps = [
+ cxx.find_library('glslang-default-resource-limits', required: false)
+ ]
+
+ # meson doesn't respect generator expressions in INTERFACE_LINK_LIBRARIES
+ # https://github.com/mesonbuild/meson/issues/8232
+ # TODO: Use the following once it's fixed
+ # glslang = dependency('glslang', method: 'cmake', modules: ['glslang::SPIRV'])
+
+ prefer_static = get_option('prefer_static')
+ found_lib = false
+ foreach arg : [[prefer_static, false], [not prefer_static, glslang_req]]
+ static = arg[0]
+ required = arg[1]
+
+ spirv = cxx.find_library('SPIRV', required: required, static: static)
+
+ if not spirv.found()
+ continue
+ endif
+
+ glslang_deps += spirv
+
+ if static
+ glslang_deps += [
+ # Always required for static linking
+ cxx.find_library('MachineIndependent', required: true, static: true),
+ cxx.find_library('OSDependent', required: true, static: true),
+ cxx.find_library('OGLCompiler', required: true, static: true),
+ cxx.find_library('GenericCodeGen', required: true, static: true),
+ # SPIRV-Tools are required only if optimizer is enabled in glslang build
+ cxx.find_library('SPIRV-Tools', required: false, static: true),
+ cxx.find_library('SPIRV-Tools-opt', required: false, static: true),
+ ]
+ endif
+
+ found_lib = true
+ break
+ endforeach
+
+ if found_lib and cc.has_header('glslang/build_info.h')
+ glslang = declare_dependency(dependencies: glslang_deps)
+ endif
+
+endif
+
+components.set('glslang', glslang.found())
+if glslang.found()
+ build_deps += glslang
+ sources += [
+ 'glsl/glslang.cc',
+ 'glsl/glslang_resources.c',
+ 'glsl/spirv_glslang.c',
+ ]
+endif
diff --git a/src/glsl/spirv.c b/src/glsl/spirv.c
new file mode 100644
index 0000000..8317ed7
--- /dev/null
+++ b/src/glsl/spirv.c
@@ -0,0 +1,64 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "spirv.h"
+
+extern const struct spirv_compiler pl_spirv_shaderc;
+extern const struct spirv_compiler pl_spirv_glslang;
+
+static const struct spirv_compiler *compilers[] = {
+#ifdef PL_HAVE_SHADERC
+ &pl_spirv_shaderc,
+#endif
+#ifdef PL_HAVE_GLSLANG
+ &pl_spirv_glslang,
+#endif
+};
+
+pl_spirv pl_spirv_create(pl_log log, struct pl_spirv_version spirv_ver)
+{
+ for (int i = 0; i < PL_ARRAY_SIZE(compilers); i++) {
+ pl_spirv spirv = compilers[i]->create(log, spirv_ver);
+ if (!spirv)
+ continue;
+
+ pl_info(log, "Initialized SPIR-V compiler '%s'", compilers[i]->name);
+ return spirv;
+ }
+
+ pl_fatal(log, "Failed initializing any SPIR-V compiler! Maybe libplacebo "
+ "was built without support for either libshaderc or glslang?");
+ return NULL;
+}
+
+void pl_spirv_destroy(pl_spirv *pspirv)
+{
+ pl_spirv spirv = *pspirv;
+ if (!spirv)
+ return;
+
+ spirv->impl->destroy(spirv);
+ *pspirv = NULL;
+}
+
+pl_str pl_spirv_compile_glsl(pl_spirv spirv, void *alloc,
+ struct pl_glsl_version glsl,
+ enum glsl_shader_stage stage,
+ const char *shader)
+{
+ return spirv->impl->compile(spirv, alloc, glsl, stage, shader);
+}
diff --git a/src/glsl/spirv.h b/src/glsl/spirv.h
new file mode 100644
index 0000000..fa4494a
--- /dev/null
+++ b/src/glsl/spirv.h
@@ -0,0 +1,50 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "log.h"
+#include "utils.h"
+
+typedef const struct pl_spirv_t {
+ const struct spirv_compiler *impl;
+ pl_log log;
+
+ // SPIR-V version specified at creation time.
+ struct pl_spirv_version version;
+
+ // For cache invalidation, should uniquely identify everything about this
+ // spirv compiler and its configuration.
+ uint64_t signature;
+} *pl_spirv;
+
+// Initializes a SPIR-V compiler instance, or returns NULL on failure.
+pl_spirv pl_spirv_create(pl_log log, struct pl_spirv_version spirv_ver);
+void pl_spirv_destroy(pl_spirv *spirv);
+
+// Compile GLSL to SPIR-V. Returns {0} on failure.
+pl_str pl_spirv_compile_glsl(pl_spirv spirv, void *alloc,
+ struct pl_glsl_version glsl_ver,
+ enum glsl_shader_stage stage,
+ const char *shader);
+
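+// Per-backend implementation table; `create` and `compile` reuse the public
+// function prototypes via __typeof__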
+struct spirv_compiler {
+ const char *name;
+ void (*destroy)(pl_spirv spirv);
+ __typeof__(pl_spirv_create) *create;
+ __typeof__(pl_spirv_compile_glsl) *compile;
+};
diff --git a/src/glsl/spirv_glslang.c b/src/glsl/spirv_glslang.c
new file mode 100644
index 0000000..ffb8f55
--- /dev/null
+++ b/src/glsl/spirv_glslang.c
@@ -0,0 +1,112 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hash.h"
+#include "spirv.h"
+#include "utils.h"
+#include "glsl/glslang.h"
+
+// This header contains only preprocessor definitions
+#include <glslang/build_info.h>
+
+// This is awkward, but we cannot use the upstream macro, as it was only fixed
+// in 11.11.0
+#define PL_GLSLANG_VERSION_GREATER_THAN(major, minor, patch) \
+ ((GLSLANG_VERSION_MAJOR) > (major) || ((major) == GLSLANG_VERSION_MAJOR && \
+ ((GLSLANG_VERSION_MINOR) > (minor) || ((minor) == GLSLANG_VERSION_MINOR && \
+ (GLSLANG_VERSION_PATCH) > (patch)))))
+
+#if PL_GLSLANG_VERSION_GREATER_THAN(11, 8, 0)
+#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 6)
+#elif PL_GLSLANG_VERSION_GREATER_THAN(7, 13, 3496)
+#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 5)
+#elif PL_GLSLANG_VERSION_GREATER_THAN(6, 2, 2596)
+#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 3)
+#else
+#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 0)
+#endif
+
+const struct spirv_compiler pl_spirv_glslang;
+
+static void glslang_destroy(pl_spirv spirv)
+{
+ pl_glslang_uninit();
+ pl_free((void *) spirv);
+}
+
+static pl_spirv glslang_create(pl_log log, struct pl_spirv_version spirv_ver)
+{
+ if (!pl_glslang_init()) {
+ pl_fatal(log, "Failed initializing glslang SPIR-V compiler!");
+ return NULL;
+ }
+
+ struct pl_spirv_t *spirv = pl_alloc_ptr(NULL, spirv);
+ *spirv = (struct pl_spirv_t) {
+ .signature = pl_str0_hash(pl_spirv_glslang.name),
+ .impl = &pl_spirv_glslang,
+ .version = spirv_ver,
+ .log = log,
+ };
+
+ PL_INFO(spirv, "glslang version: %d.%d.%d",
+ GLSLANG_VERSION_MAJOR,
+ GLSLANG_VERSION_MINOR,
+ GLSLANG_VERSION_PATCH);
+
+ // Clamp to supported version by glslang
+ if (GLSLANG_SPV_MAX < spirv->version.spv_version) {
+ spirv->version.spv_version = GLSLANG_SPV_MAX;
+ spirv->version.env_version = pl_spirv_version_to_vulkan(GLSLANG_SPV_MAX);
+ }
+
+ pl_hash_merge(&spirv->signature, (uint64_t) spirv->version.spv_version << 32 |
+ spirv->version.env_version);
+ pl_hash_merge(&spirv->signature, (GLSLANG_VERSION_MAJOR & 0xFF) << 24 |
+ (GLSLANG_VERSION_MINOR & 0xFF) << 16 |
+ (GLSLANG_VERSION_PATCH & 0xFFFF));
+ return spirv;
+}
+
+static pl_str glslang_compile(pl_spirv spirv, void *alloc,
+ struct pl_glsl_version glsl_ver,
+ enum glsl_shader_stage stage,
+ const char *shader)
+{
+ struct pl_glslang_res *res;
+
+ res = pl_glslang_compile(glsl_ver, spirv->version, stage, shader);
+ if (!res || !res->success) {
+ PL_ERR(spirv, "glslang failed: %s", res ? res->error_msg : "(null)");
+ pl_free(res);
+ return (struct pl_str) {0};
+ }
+
+ struct pl_str ret = {
+ .buf = pl_steal(alloc, res->data),
+ .len = res->size,
+ };
+
+ pl_free(res);
+ return ret;
+}
+
+const struct spirv_compiler pl_spirv_glslang = {
+ .name = "glslang",
+ .destroy = glslang_destroy,
+ .create = glslang_create,
+ .compile = glslang_compile,
+};
diff --git a/src/glsl/spirv_shaderc.c b/src/glsl/spirv_shaderc.c
new file mode 100644
index 0000000..e384382
--- /dev/null
+++ b/src/glsl/spirv_shaderc.c
@@ -0,0 +1,174 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <shaderc/shaderc.h>
+
+#include "hash.h"
+#include "spirv.h"
+#include "utils.h"
+
+const struct spirv_compiler pl_spirv_shaderc;
+
+struct priv {
+ shaderc_compiler_t compiler;
+};
+
+static void shaderc_destroy(pl_spirv spirv)
+{
+ struct priv *p = PL_PRIV(spirv);
+ shaderc_compiler_release(p->compiler);
+ pl_free((void *) spirv);
+}
+
+static pl_spirv shaderc_create(pl_log log, struct pl_spirv_version spirv_ver)
+{
+ struct pl_spirv_t *spirv = pl_alloc_obj(NULL, spirv, struct priv);
+ *spirv = (struct pl_spirv_t) {
+ .signature = pl_str0_hash(pl_spirv_shaderc.name),
+ .impl = &pl_spirv_shaderc,
+ .version = spirv_ver,
+ .log = log,
+ };
+
+ struct priv *p = PL_PRIV(spirv);
+ p->compiler = shaderc_compiler_initialize();
+ if (!p->compiler)
+ goto error;
+
+ unsigned int ver = 0, rev = 0;
+ shaderc_get_spv_version(&ver, &rev);
+ PL_INFO(spirv, "shaderc SPIR-V version %u.%u rev %u",
+ ver >> 16, (ver >> 8) & 0xff, rev);
+
+ // Clamp to supported version by shaderc
+ if (ver < spirv->version.spv_version) {
+ spirv->version.spv_version = ver;
+ spirv->version.env_version = pl_spirv_version_to_vulkan(ver);
+ }
+
+ pl_hash_merge(&spirv->signature, (uint64_t) spirv->version.spv_version << 32 |
+ spirv->version.env_version);
+ pl_hash_merge(&spirv->signature, (uint64_t) ver << 32 | rev);
+ return spirv;
+
+error:
+ shaderc_destroy(spirv);
+ return NULL;
+}
+
+static pl_str shaderc_compile(pl_spirv spirv, void *alloc,
+ struct pl_glsl_version glsl_ver,
+ enum glsl_shader_stage stage,
+ const char *shader)
+{
+ struct priv *p = PL_PRIV(spirv);
+ const size_t len = strlen(shader);
+
+ shaderc_compile_options_t opts = shaderc_compile_options_initialize();
+ if (!opts)
+ return (pl_str) {0};
+
+ shaderc_compile_options_set_optimization_level(opts,
+ shaderc_optimization_level_performance);
+ shaderc_compile_options_set_target_spirv(opts, spirv->version.spv_version);
+ shaderc_compile_options_set_target_env(opts, shaderc_target_env_vulkan,
+ spirv->version.env_version);
+
+ for (int i = 0; i < 3; i++) {
+ shaderc_compile_options_set_limit(opts,
+ shaderc_limit_max_compute_work_group_size_x + i,
+ glsl_ver.max_group_size[i]);
+ }
+
+ shaderc_compile_options_set_limit(opts,
+ shaderc_limit_min_program_texel_offset,
+ glsl_ver.min_gather_offset);
+ shaderc_compile_options_set_limit(opts,
+ shaderc_limit_max_program_texel_offset,
+ glsl_ver.max_gather_offset);
+
+ static const shaderc_shader_kind kinds[] = {
+ [GLSL_SHADER_VERTEX] = shaderc_glsl_vertex_shader,
+ [GLSL_SHADER_FRAGMENT] = shaderc_glsl_fragment_shader,
+ [GLSL_SHADER_COMPUTE] = shaderc_glsl_compute_shader,
+ };
+
+ static const char * const file_name = "input";
+ static const char * const entry_point = "main";
+
+ shaderc_compilation_result_t res;
+ res = shaderc_compile_into_spv(p->compiler, shader, len, kinds[stage],
+ file_name, entry_point, opts);
+
+ int errs = shaderc_result_get_num_errors(res),
+ warn = shaderc_result_get_num_warnings(res);
+
+ enum pl_log_level lev = errs ? PL_LOG_ERR : warn ? PL_LOG_INFO : PL_LOG_DEBUG;
+
+ int s = shaderc_result_get_compilation_status(res);
+ bool success = s == shaderc_compilation_status_success;
+ if (!success)
+ lev = PL_LOG_ERR;
+
+ const char *msg = shaderc_result_get_error_message(res);
+ if (msg[0])
+ PL_MSG(spirv, lev, "shaderc output:\n%s", msg);
+
+ static const char *results[] = {
+ [shaderc_compilation_status_success] = "success",
+ [shaderc_compilation_status_invalid_stage] = "invalid stage",
+ [shaderc_compilation_status_compilation_error] = "error",
+ [shaderc_compilation_status_internal_error] = "internal error",
+ [shaderc_compilation_status_null_result_object] = "no result",
+ [shaderc_compilation_status_invalid_assembly] = "invalid assembly",
+ };
+
+ const char *status = s < PL_ARRAY_SIZE(results) ? results[s] : "unknown";
+ PL_MSG(spirv, lev, "shaderc compile status '%s' (%d errors, %d warnings)",
+ status, errs, warn);
+
+ pl_str ret = {0};
+ if (success) {
+ void *bytes = (void *) shaderc_result_get_bytes(res);
+ pl_assert(bytes);
+ ret.len = shaderc_result_get_length(res);
+ ret.buf = pl_memdup(alloc, bytes, ret.len);
+
+ if (pl_msg_test(spirv->log, PL_LOG_TRACE)) {
+ shaderc_compilation_result_t dis;
+ dis = shaderc_compile_into_spv_assembly(p->compiler, shader, len,
+ kinds[stage], file_name,
+ entry_point, opts);
+ PL_TRACE(spirv, "Generated SPIR-V:\n%.*s",
+ (int) shaderc_result_get_length(dis),
+ shaderc_result_get_bytes(dis));
+ shaderc_result_release(dis);
+ }
+ }
+
+ shaderc_result_release(res);
+ shaderc_compile_options_release(opts);
+ return ret;
+}
+
+const struct spirv_compiler pl_spirv_shaderc = {
+ .name = "shaderc",
+ .destroy = shaderc_destroy,
+ .create = shaderc_create,
+ .compile = shaderc_compile,
+};
diff --git a/src/glsl/utils.h b/src/glsl/utils.h
new file mode 100644
index 0000000..965ea9e
--- /dev/null
+++ b/src/glsl/utils.h
@@ -0,0 +1,52 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <libplacebo/gpu.h>
+
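+// These encodings match the SPIR-V header word layout (major << 16 | minor << 8)
+// and the Vulkan API version encoding (major << 22 | minor << 12), with the
+// patch level omitted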
+#define PL_SPV_VERSION(major, minor) ((major) << 16 | (minor) << 8)
+#define PL_VLK_VERSION(major, minor) ((major) << 22 | (minor) << 12)
+
+// Max version that can be used
+#define PL_MAX_SPIRV_VER PL_SPV_VERSION(1, 6)
+
+struct pl_spirv_version {
+ uint32_t env_version;
+ uint32_t spv_version;
+};
+
+// Returns minimum Vulkan version for given SPIR-V version
+static inline uint32_t pl_spirv_version_to_vulkan(uint32_t spirv_ver)
+{
+ if (spirv_ver >= PL_SPV_VERSION(1, 6))
+ return PL_VLK_VERSION(1, 3);
+ if (spirv_ver >= PL_SPV_VERSION(1, 4))
+ return PL_VLK_VERSION(1, 2);
+ if (spirv_ver >= PL_SPV_VERSION(1, 1))
+ return PL_VLK_VERSION(1, 1);
+ return PL_VLK_VERSION(1, 0);
+}
+
+enum glsl_shader_stage {
+ GLSL_SHADER_VERTEX = 0,
+ GLSL_SHADER_FRAGMENT,
+ GLSL_SHADER_COMPUTE,
+};
diff --git a/src/gpu.c b/src/gpu.c
new file mode 100644
index 0000000..b639ec2
--- /dev/null
+++ b/src/gpu.c
@@ -0,0 +1,1338 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "gpu.h"
+
+#define require(expr) pl_require(gpu, expr)
+
+void pl_gpu_destroy(pl_gpu gpu)
+{
+ if (!gpu)
+ return;
+
+ struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ pl_dispatch_destroy(&impl->dp);
+ impl->destroy(gpu);
+}
+
+pl_dispatch pl_gpu_dispatch(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->dp;
+}
+
+pl_cache pl_gpu_cache(pl_gpu gpu)
+{
+ if (!gpu)
+ return NULL;
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return atomic_load(&impl->cache);
+}
+
+void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache)
+{
+ struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ atomic_store(&impl->cache, cache);
+}
+
+bool pl_fmt_is_ordered(pl_fmt fmt)
+{
+ bool ret = !fmt->opaque;
+ for (int i = 0; i < fmt->num_components; i++)
+ ret &= fmt->sample_order[i] == i;
+ return ret;
+}
+
+bool pl_fmt_is_float(pl_fmt fmt)
+{
+ switch (fmt->type) {
+ case PL_FMT_UNKNOWN: // more likely than not
+ case PL_FMT_FLOAT:
+ case PL_FMT_UNORM:
+ case PL_FMT_SNORM:
+ return true;
+
+ case PL_FMT_UINT:
+ case PL_FMT_SINT:
+ return false;
+
+ case PL_FMT_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier)
+{
+ if (!fmt)
+ return false;
+
+ for (int i = 0; i < fmt->num_modifiers; i++) {
+ if (fmt->modifiers[i] == modifier)
+ return true;
+ }
+
+ return false;
+}
+
+pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components,
+ int min_depth, int host_bits, enum pl_fmt_caps caps)
+{
+ for (int n = 0; n < gpu->num_formats; n++) {
+ pl_fmt fmt = gpu->formats[n];
+ if (fmt->type != type || fmt->num_components != num_components)
+ continue;
+ if ((fmt->caps & caps) != caps)
+ continue;
+
+ // When specifying some particular host representation, ensure the
+ // format is non-opaque, ordered and unpadded
+ if (host_bits && fmt->opaque)
+ continue;
+ if (host_bits && fmt->texel_size * 8 != host_bits * num_components)
+ continue;
+ if (host_bits && !pl_fmt_is_ordered(fmt))
+ continue;
+
+ for (int i = 0; i < fmt->num_components; i++) {
+ if (fmt->component_depth[i] < min_depth)
+ goto next_fmt;
+ if (host_bits && fmt->host_bits[i] != host_bits)
+ goto next_fmt;
+ }
+
+ return fmt;
+
+next_fmt: ; // equivalent to `continue`
+ }
+
+ // ran out of formats
+ PL_TRACE(gpu, "No matching format found");
+ return NULL;
+}
+
+pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int comps)
+{
+ static const size_t sizes[] = {
+ [PL_FMT_FLOAT] = sizeof(float),
+ [PL_FMT_UNORM] = sizeof(unsigned),
+ [PL_FMT_UINT] = sizeof(unsigned),
+ [PL_FMT_SNORM] = sizeof(int),
+ [PL_FMT_SINT] = sizeof(int),
+ };
+
+ return pl_find_fmt(gpu, type, comps, 0, 8 * sizes[type], PL_FMT_CAP_VERTEX);
+}
+
+pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name)
+{
+ if (!name)
+ return NULL;
+
+ for (int i = 0; i < gpu->num_formats; i++) {
+ pl_fmt fmt = gpu->formats[i];
+ if (strcmp(name, fmt->name) == 0)
+ return fmt;
+ }
+
+ // ran out of formats
+ return NULL;
+}
+
+pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc)
+{
+ if (!fourcc)
+ return NULL;
+
+ for (int i = 0; i < gpu->num_formats; i++) {
+ pl_fmt fmt = gpu->formats[i];
+ if (fourcc == fmt->fourcc)
+ return fmt;
+ }
+
+ // ran out of formats
+ return NULL;
+}
+
+static inline bool check_mod(pl_gpu gpu, pl_fmt fmt, uint64_t mod)
+{
+ for (int i = 0; i < fmt->num_modifiers; i++) {
+ if (fmt->modifiers[i] == mod)
+ return true;
+ }
+
+ PL_ERR(gpu, "DRM modifier %s not available for format %s. Available modifiers:",
+ PRINT_DRM_MOD(mod), fmt->name);
+ for (int i = 0; i < fmt->num_modifiers; i++)
+ PL_ERR(gpu, " %s", PRINT_DRM_MOD(fmt->modifiers[i]));
+
+ return false;
+}
+
+pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params)
+{
+ require(params->format);
+ require(!params->import_handle || !params->export_handle);
+ require(!params->import_handle || !params->initial_data);
+ if (params->export_handle) {
+ require(params->export_handle & gpu->export_caps.tex);
+ require(PL_ISPOT(params->export_handle));
+ }
+ if (params->import_handle) {
+ require(params->import_handle & gpu->import_caps.tex);
+ require(PL_ISPOT(params->import_handle));
+ if (params->import_handle == PL_HANDLE_DMA_BUF) {
+ if (!check_mod(gpu, params->format, params->shared_mem.drm_format_mod))
+ goto error;
+ if (params->shared_mem.stride_w)
+ require(params->w && params->shared_mem.stride_w >= params->w);
+ if (params->shared_mem.stride_h)
+ require(params->h && params->shared_mem.stride_h >= params->h);
+ } else if (params->import_handle == PL_HANDLE_MTL_TEX) {
+ require(params->shared_mem.plane <= 2);
+ }
+ }
+
+ switch (pl_tex_params_dimension(*params)) {
+ case 1:
+ require(params->w > 0);
+ require(params->w <= gpu->limits.max_tex_1d_dim);
+ require(!params->renderable);
+ require(!params->blit_src || gpu->limits.blittable_1d_3d);
+ require(!params->blit_dst || gpu->limits.blittable_1d_3d);
+ require(!params->format->num_planes);
+ break;
+ case 2:
+ require(params->w > 0 && params->h > 0);
+ require(params->w <= gpu->limits.max_tex_2d_dim);
+ require(params->h <= gpu->limits.max_tex_2d_dim);
+ break;
+ case 3:
+ require(params->w > 0 && params->h > 0 && params->d > 0);
+ require(params->w <= gpu->limits.max_tex_3d_dim);
+ require(params->h <= gpu->limits.max_tex_3d_dim);
+ require(params->d <= gpu->limits.max_tex_3d_dim);
+ require(!params->renderable);
+ require(!params->blit_src || gpu->limits.blittable_1d_3d);
+ require(!params->blit_dst || gpu->limits.blittable_1d_3d);
+ require(!params->format->num_planes);
+ break;
+ }
+
+ enum pl_fmt_caps fmt_caps = params->format->caps;
+ bool fmt_opaque = params->format->opaque;
+ for (int i = 0; i < params->format->num_planes; i++) {
+ pl_fmt pfmt = params->format->planes[i].format;
+ fmt_caps |= pfmt->caps;
+ fmt_opaque &= pfmt->opaque;
+ }
+
+ require(!params->host_readable || fmt_caps & PL_FMT_CAP_HOST_READABLE);
+ require(!params->host_writable || !fmt_opaque);
+ require(!params->sampleable || fmt_caps & PL_FMT_CAP_SAMPLEABLE);
+ require(!params->renderable || fmt_caps & PL_FMT_CAP_RENDERABLE);
+ require(!params->storable || fmt_caps & PL_FMT_CAP_STORABLE);
+ require(!params->blit_src || fmt_caps & PL_FMT_CAP_BLITTABLE);
+ require(!params->blit_dst || fmt_caps & PL_FMT_CAP_BLITTABLE);
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->tex_create(gpu, params);
+
+error:
+ if (params->debug_tag)
+ PL_ERR(gpu, " for texture: %s", params->debug_tag);
+ return NULL;
+}
+
+void pl_tex_destroy(pl_gpu gpu, pl_tex *tex)
+{
+ if (!*tex)
+ return;
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->tex_destroy(gpu, *tex);
+ *tex = NULL;
+}
+
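+// Returns true if a texture created with params `a` can substitute for one
+// with params `b`: same dimensions and format, and at least all of the
+// requested capabilities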
+static bool pl_tex_params_superset(struct pl_tex_params a, struct pl_tex_params b)
+{
+ return a.w == b.w && a.h == b.h && a.d == b.d &&
+ a.format == b.format &&
+ (a.sampleable || !b.sampleable) &&
+ (a.renderable || !b.renderable) &&
+ (a.storable || !b.storable) &&
+ (a.blit_src || !b.blit_src) &&
+ (a.blit_dst || !b.blit_dst) &&
+ (a.host_writable || !b.host_writable) &&
+ (a.host_readable || !b.host_readable);
+}
+
+bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params)
+{
+ if (params->initial_data) {
+ PL_ERR(gpu, "pl_tex_recreate may not be used with `initial_data`!");
+ return false;
+ }
+
+ if (params->import_handle) {
+ PL_ERR(gpu, "pl_tex_recreate may not be used with `import_handle`!");
+ return false;
+ }
+
+ if (*tex && pl_tex_params_superset((*tex)->params, *params)) {
+ pl_tex_invalidate(gpu, *tex);
+ return true;
+ }
+
+ PL_DEBUG(gpu, "(Re)creating %dx%dx%d texture with format %s: %s",
+ params->w, params->h, params->d, params->format->name,
+ PL_DEF(params->debug_tag, "unknown"));
+
+ pl_tex_destroy(gpu, tex);
+ *tex = pl_tex_create(gpu, params);
+
+ return !!*tex;
+}
+
+void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color)
+{
+ require(dst->params.blit_dst);
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ if (impl->tex_invalidate)
+ impl->tex_invalidate(gpu, dst);
+ impl->tex_clear_ex(gpu, dst, color);
+ return;
+
+error:
+ if (dst->params.debug_tag)
+ PL_ERR(gpu, " for texture: %s", dst->params.debug_tag);
+}
+
+void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4])
+{
+ if (!pl_fmt_is_float(dst->params.format)) {
+ PL_ERR(gpu, "Cannot call `pl_tex_clear` on integer textures, please "
+ "use `pl_tex_clear_ex` instead.");
+ return;
+ }
+
+ const union pl_clear_color col = {
+ .f = { color[0], color[1], color[2], color[3] },
+ };
+
+ pl_tex_clear_ex(gpu, dst, col);
+}
+
+void pl_tex_invalidate(pl_gpu gpu, pl_tex tex)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ if (impl->tex_invalidate)
+ impl->tex_invalidate(gpu, tex);
+}
+
+static void strip_coords(pl_tex tex, pl_rect3d *rc)
+{
+ if (!tex->params.d) {
+ rc->z0 = 0;
+ rc->z1 = 1;
+ }
+
+ if (!tex->params.h) {
+ rc->y0 = 0;
+ rc->y1 = 1;
+ }
+}
+
+static void infer_rc(pl_tex tex, pl_rect3d *rc)
+{
+ if (!rc->x0 && !rc->x1)
+ rc->x1 = tex->params.w;
+ if (!rc->y0 && !rc->y1)
+ rc->y1 = tex->params.h;
+ if (!rc->z0 && !rc->z1)
+ rc->z1 = tex->params.d;
+}
+
+void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params)
+{
+ pl_tex src = params->src, dst = params->dst;
+ require(src && dst);
+ pl_fmt src_fmt = src->params.format;
+ pl_fmt dst_fmt = dst->params.format;
+ require(src_fmt->internal_size == dst_fmt->internal_size);
+ require((src_fmt->type == PL_FMT_UINT) == (dst_fmt->type == PL_FMT_UINT));
+ require((src_fmt->type == PL_FMT_SINT) == (dst_fmt->type == PL_FMT_SINT));
+ require(src->params.blit_src);
+ require(dst->params.blit_dst);
+ require(params->sample_mode != PL_TEX_SAMPLE_LINEAR || (src_fmt->caps & PL_FMT_CAP_LINEAR));
+
+ struct pl_tex_blit_params fixed = *params;
+ infer_rc(src, &fixed.src_rc);
+ infer_rc(dst, &fixed.dst_rc);
+ strip_coords(src, &fixed.src_rc);
+ strip_coords(dst, &fixed.dst_rc);
+
+ require(fixed.src_rc.x0 >= 0 && fixed.src_rc.x0 < src->params.w);
+ require(fixed.src_rc.x1 > 0 && fixed.src_rc.x1 <= src->params.w);
+ require(fixed.dst_rc.x0 >= 0 && fixed.dst_rc.x0 < dst->params.w);
+ require(fixed.dst_rc.x1 > 0 && fixed.dst_rc.x1 <= dst->params.w);
+
+ if (src->params.h) {
+ require(fixed.src_rc.y0 >= 0 && fixed.src_rc.y0 < src->params.h);
+ require(fixed.src_rc.y1 > 0 && fixed.src_rc.y1 <= src->params.h);
+ }
+
+ if (dst->params.h) {
+ require(fixed.dst_rc.y0 >= 0 && fixed.dst_rc.y0 < dst->params.h);
+ require(fixed.dst_rc.y1 > 0 && fixed.dst_rc.y1 <= dst->params.h);
+ }
+
+ if (src->params.d) {
+ require(fixed.src_rc.z0 >= 0 && fixed.src_rc.z0 < src->params.d);
+ require(fixed.src_rc.z1 > 0 && fixed.src_rc.z1 <= src->params.d);
+ }
+
+ if (dst->params.d) {
+ require(fixed.dst_rc.z0 >= 0 && fixed.dst_rc.z0 < dst->params.d);
+ require(fixed.dst_rc.z1 > 0 && fixed.dst_rc.z1 <= dst->params.d);
+ }
+
+ pl_rect3d full = {0, 0, 0, dst->params.w, dst->params.h, dst->params.d};
+ strip_coords(dst, &full);
+
+ pl_rect3d rcnorm = fixed.dst_rc;
+ pl_rect3d_normalize(&rcnorm);
+ if (pl_rect3d_eq(rcnorm, full))
+ pl_tex_invalidate(gpu, dst);
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->tex_blit(gpu, &fixed);
+ return;
+
+error:
+ if (src->params.debug_tag || dst->params.debug_tag) {
+ PL_ERR(gpu, " for textures: src %s, dst %s",
+ PL_DEF(src->params.debug_tag, "(unknown)"),
+ PL_DEF(dst->params.debug_tag, "(unknown)"));
+ }
+}
+
+static bool fix_tex_transfer(pl_gpu gpu, struct pl_tex_transfer_params *params)
+{
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ pl_rect3d rc = params->rc;
+
+ // Infer the default values
+ infer_rc(tex, &rc);
+ strip_coords(tex, &rc);
+
+ if (!params->row_pitch || !tex->params.w)
+ params->row_pitch = pl_rect_w(rc) * fmt->texel_size;
+ if (!params->depth_pitch || !tex->params.d)
+ params->depth_pitch = pl_rect_h(rc) * params->row_pitch;
+
+ require(params->row_pitch);
+ require(params->depth_pitch);
+ params->rc = rc;
+
+ // Check the parameters for sanity
+ switch (pl_tex_params_dimension(tex->params))
+ {
+ case 3:
+ require(rc.z1 > rc.z0);
+ require(rc.z0 >= 0 && rc.z0 < tex->params.d);
+ require(rc.z1 > 0 && rc.z1 <= tex->params.d);
+ require(params->depth_pitch >= pl_rect_h(rc) * params->row_pitch);
+ require(params->depth_pitch % params->row_pitch == 0);
+ // fall through
+ case 2:
+ require(rc.y1 > rc.y0);
+ require(rc.y0 >= 0 && rc.y0 < tex->params.h);
+ require(rc.y1 > 0 && rc.y1 <= tex->params.h);
+ require(params->row_pitch >= pl_rect_w(rc) * fmt->texel_size);
+ require(params->row_pitch % fmt->texel_align == 0);
+ // fall through
+ case 1:
+ require(rc.x1 > rc.x0);
+ require(rc.x0 >= 0 && rc.x0 < tex->params.w);
+ require(rc.x1 > 0 && rc.x1 <= tex->params.w);
+ break;
+ }
+
+ require(!params->buf ^ !params->ptr); // exactly one
+ if (params->buf) {
+ pl_buf buf = params->buf;
+ size_t size = pl_tex_transfer_size(params);
+ require(params->buf_offset + size >= params->buf_offset); // overflow check
+ require(params->buf_offset + size <= buf->params.size);
+ require(gpu->limits.buf_transfer);
+ }
+
+ require(!params->callback || gpu->limits.callbacks);
+ return true;
+
+error:
+ if (tex->params.debug_tag)
+ PL_ERR(gpu, " for texture: %s", tex->params.debug_tag);
+ return false;
+}
+
+bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ pl_tex tex = params->tex;
+ require(tex->params.host_writable);
+
+ struct pl_tex_transfer_params fixed = *params;
+ if (!fix_tex_transfer(gpu, &fixed))
+ goto error;
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->tex_upload(gpu, &fixed);
+
+error:
+ if (tex->params.debug_tag)
+ PL_ERR(gpu, " for texture: %s", tex->params.debug_tag);
+ return false;
+}
+
+bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ pl_tex tex = params->tex;
+ require(tex->params.host_readable);
+
+ struct pl_tex_transfer_params fixed = *params;
+ if (!fix_tex_transfer(gpu, &fixed))
+ goto error;
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->tex_download(gpu, &fixed);
+
+error:
+ if (tex->params.debug_tag)
+ PL_ERR(gpu, " for texture: %s", tex->params.debug_tag);
+ return false;
+}
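+
+// Illustrative usage sketch (hypothetical `tex` created with `host_readable`,
+// `dst` a host allocation of at least pl_tex_transfer_size() bytes):
+// downloading the full texture into tightly packed host memory can rely on
+// the inference done in fix_tex_transfer() above:
+//
+//     bool ok = pl_tex_download(gpu, &(struct pl_tex_transfer_params) {
+//         .tex = tex,
+//         .ptr = dst,
+//     });
+//
+// `rc`, `row_pitch` and `depth_pitch` are left at zero so the defaults
+// (full texture, tight packing) are inferred.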
+
+bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t t)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->tex_poll ? impl->tex_poll(gpu, tex, t) : false;
+}
+
+pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+ struct pl_buf_params params_rounded;
+
+ require(!params->import_handle || !params->export_handle);
+ if (params->export_handle) {
+ require(PL_ISPOT(params->export_handle));
+ require(params->export_handle & gpu->export_caps.buf);
+ }
+ if (params->import_handle) {
+ require(PL_ISPOT(params->import_handle));
+ require(params->import_handle & gpu->import_caps.buf);
+ const struct pl_shared_mem *shmem = &params->shared_mem;
+ require(shmem->offset + params->size <= shmem->size);
+ require(params->import_handle != PL_HANDLE_DMA_BUF || !shmem->drm_format_mod);
+
+ // Fix misalignment on host pointer imports
+ if (params->import_handle == PL_HANDLE_HOST_PTR) {
+ uintptr_t page_mask = ~(gpu->limits.align_host_ptr - 1);
+ uintptr_t ptr_base = (uintptr_t) shmem->handle.ptr & page_mask;
+ size_t ptr_offset = (uintptr_t) shmem->handle.ptr - ptr_base;
+ size_t buf_offset = ptr_offset + shmem->offset;
+ size_t ptr_size = PL_ALIGN2(ptr_offset + shmem->size,
+ gpu->limits.align_host_ptr);
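+ // Worked example (illustrative, assuming 4 KiB pages): importing a
+ // pointer 0x10 bytes past a page boundary with shmem->offset == 0 and
+ // shmem->size == 0x100 yields ptr_base = page start, ptr_offset = 0x10,
+ // buf_offset = 0x10 and ptr_size = 0x1000 (rounded up to a whole page).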
+
+ if (ptr_base != (uintptr_t) shmem->handle.ptr || ptr_size > shmem->size) {
+ static bool warned_rounding = false;
+ if (!warned_rounding) {
+ warned_rounding = true;
+ PL_WARN(gpu, "Imported host pointer is not page-aligned. "
+ "This should normally be fine on most platforms, "
+ "but may cause issues in some rare circumstances.");
+ }
+
+ PL_TRACE(gpu, "Rounding imported host pointer %p + %zu -> %zu to "
+ "nearest page boundaries: %p + %zu -> %zu",
+ shmem->handle.ptr, shmem->offset, shmem->size,
+ (void *) ptr_base, buf_offset, ptr_size);
+ }
+
+ params_rounded = *params;
+ params_rounded.shared_mem.handle.ptr = (void *) ptr_base;
+ params_rounded.shared_mem.offset = buf_offset;
+ params_rounded.shared_mem.size = ptr_size;
+ params = &params_rounded;
+ }
+ }
+
+ require(params->size > 0 && params->size <= gpu->limits.max_buf_size);
+ require(!params->uniform || params->size <= gpu->limits.max_ubo_size);
+ require(!params->storable || params->size <= gpu->limits.max_ssbo_size);
+ require(!params->drawable || params->size <= gpu->limits.max_vbo_size);
+ require(!params->host_mapped || params->size <= gpu->limits.max_mapped_size);
+
+ if (params->format) {
+ pl_fmt fmt = params->format;
+ require(params->size <= gpu->limits.max_buffer_texels * fmt->texel_size);
+ require(!params->uniform || (fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM));
+ require(!params->storable || (fmt->caps & PL_FMT_CAP_TEXEL_STORAGE));
+ }
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ pl_buf buf = impl->buf_create(gpu, params);
+ if (buf)
+ require(!params->host_mapped || buf->data);
+
+ return buf;
+
+error:
+ if (params->debug_tag)
+ PL_ERR(gpu, " for buffer: %s", params->debug_tag);
+ return NULL;
+}
+
+void pl_buf_destroy(pl_gpu gpu, pl_buf *buf)
+{
+ if (!*buf)
+ return;
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->buf_destroy(gpu, *buf);
+ *buf = NULL;
+}
+
+static bool pl_buf_params_superset(struct pl_buf_params a, struct pl_buf_params b)
+{
+ return a.size >= b.size &&
+ a.memory_type == b.memory_type &&
+ a.format == b.format &&
+ (a.host_writable || !b.host_writable) &&
+ (a.host_readable || !b.host_readable) &&
+ (a.host_mapped || !b.host_mapped) &&
+ (a.uniform || !b.uniform) &&
+ (a.storable || !b.storable) &&
+ (a.drawable || !b.drawable);
+}
+
+bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params)
+{
+
+ if (params->initial_data) {
+ PL_ERR(gpu, "pl_buf_recreate may not be used with `initial_data`!");
+ return false;
+ }
+
+ if (*buf && pl_buf_params_superset((*buf)->params, *params))
+ return true;
+
+ PL_INFO(gpu, "(Re)creating %zu byte buffer", params->size);
+ pl_buf_destroy(gpu, buf);
+ *buf = pl_buf_create(gpu, params);
+
+ return !!*buf;
+}
+
+void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset,
+ const void *data, size_t size)
+{
+ require(buf->params.host_writable);
+ require(buf_offset + size <= buf->params.size);
+ require(buf_offset == PL_ALIGN2(buf_offset, 4));
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->buf_write(gpu, buf, buf_offset, data, size);
+ return;
+
+error:
+ if (buf->params.debug_tag)
+ PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag);
+}
+
+bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset,
+ void *dest, size_t size)
+{
+ require(buf->params.host_readable);
+ require(buf_offset + size <= buf->params.size);
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->buf_read(gpu, buf, buf_offset, dest, size);
+
+error:
+ if (buf->params.debug_tag)
+ PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag);
+ return false;
+}
+
+void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size)
+{
+ require(src_offset + size <= src->params.size);
+ require(dst_offset + size <= dst->params.size);
+ require(src != dst);
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->buf_copy(gpu, dst, dst_offset, src, src_offset, size);
+ return;
+
+error:
+ if (src->params.debug_tag || dst->params.debug_tag) {
+ PL_ERR(gpu, " for buffers: src %s, dst %s",
+ src->params.debug_tag, dst->params.debug_tag);
+ }
+}
+
+bool pl_buf_export(pl_gpu gpu, pl_buf buf)
+{
+ require(buf->params.export_handle || buf->params.import_handle);
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->buf_export(gpu, buf);
+
+error:
+ if (buf->params.debug_tag)
+ PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag);
+ return false;
+}
+
+bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t t)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->buf_poll ? impl->buf_poll(gpu, buf, t) : false;
+}
+
+size_t pl_var_type_size(enum pl_var_type type)
+{
+ switch (type) {
+ case PL_VAR_SINT: return sizeof(int);
+ case PL_VAR_UINT: return sizeof(unsigned int);
+ case PL_VAR_FLOAT: return sizeof(float);
+ case PL_VAR_INVALID: // fall through
+ case PL_VAR_TYPE_COUNT: break;
+ }
+
+ pl_unreachable();
+}
+
+#define PL_VAR(TYPE, NAME, M, V) \
+ struct pl_var pl_var_##NAME(const char *name) { \
+ return (struct pl_var) { \
+ .name = name, \
+ .type = PL_VAR_##TYPE, \
+ .dim_m = M, \
+ .dim_v = V, \
+ .dim_a = 1, \
+ }; \
+ }
+
+PL_VAR(FLOAT, float, 1, 1)
+PL_VAR(FLOAT, vec2, 1, 2)
+PL_VAR(FLOAT, vec3, 1, 3)
+PL_VAR(FLOAT, vec4, 1, 4)
+PL_VAR(FLOAT, mat2, 2, 2)
+PL_VAR(FLOAT, mat2x3, 2, 3)
+PL_VAR(FLOAT, mat2x4, 2, 4)
+PL_VAR(FLOAT, mat3, 3, 3)
+PL_VAR(FLOAT, mat3x4, 3, 4)
+PL_VAR(FLOAT, mat4x2, 4, 2)
+PL_VAR(FLOAT, mat4x3, 4, 3)
+PL_VAR(FLOAT, mat4, 4, 4)
+PL_VAR(SINT, int, 1, 1)
+PL_VAR(SINT, ivec2, 1, 2)
+PL_VAR(SINT, ivec3, 1, 3)
+PL_VAR(SINT, ivec4, 1, 4)
+PL_VAR(UINT, uint, 1, 1)
+PL_VAR(UINT, uvec2, 1, 2)
+PL_VAR(UINT, uvec3, 1, 3)
+PL_VAR(UINT, uvec4, 1, 4)
+
+#undef PL_VAR
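+
+// For example (illustrative): pl_var_vec3("color") expands via PL_VAR to
+// (struct pl_var) { .name = "color", .type = PL_VAR_FLOAT,
+// .dim_m = 1, .dim_v = 3, .dim_a = 1 }.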
+
+const struct pl_named_var pl_var_glsl_types[] = {
+ // float vectors
+ { "float", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }},
+ { "vec2", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }},
+ { "vec3", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }},
+ { "vec4", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }},
+ // float matrices
+ { "mat2", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 2, .dim_a = 1, }},
+ { "mat2x3", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 3, .dim_a = 1, }},
+ { "mat2x4", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 4, .dim_a = 1, }},
+ { "mat3", { .type = PL_VAR_FLOAT, .dim_m = 3, .dim_v = 3, .dim_a = 1, }},
+ { "mat3x4", { .type = PL_VAR_FLOAT, .dim_m = 3, .dim_v = 4, .dim_a = 1, }},
+ { "mat4x2", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 2, .dim_a = 1, }},
+ { "mat4x3", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 3, .dim_a = 1, }},
+ { "mat4", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 4, .dim_a = 1, }},
+ // integer vectors
+ { "int", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }},
+ { "ivec2", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }},
+ { "ivec3", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }},
+ { "ivec4", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }},
+ // unsigned integer vectors
+ { "uint", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }},
+ { "uvec2", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }},
+ { "uvec3", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }},
+ { "uvec4", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }},
+
+ {0},
+};
+
+#define MAX_DIM 4
+
+const char *pl_var_glsl_type_name(struct pl_var var)
+{
+ static const char *types[PL_VAR_TYPE_COUNT][MAX_DIM+1][MAX_DIM+1] = {
+ // float vectors
+ [PL_VAR_FLOAT][1][1] = "float",
+ [PL_VAR_FLOAT][1][2] = "vec2",
+ [PL_VAR_FLOAT][1][3] = "vec3",
+ [PL_VAR_FLOAT][1][4] = "vec4",
+ // float matrices
+ [PL_VAR_FLOAT][2][2] = "mat2",
+ [PL_VAR_FLOAT][2][3] = "mat2x3",
+ [PL_VAR_FLOAT][2][4] = "mat2x4",
+ [PL_VAR_FLOAT][3][2] = "mat3x2",
+ [PL_VAR_FLOAT][3][3] = "mat3",
+ [PL_VAR_FLOAT][3][4] = "mat3x4",
+ [PL_VAR_FLOAT][4][2] = "mat4x2",
+ [PL_VAR_FLOAT][4][3] = "mat4x3",
+ [PL_VAR_FLOAT][4][4] = "mat4",
+ // integer vectors
+ [PL_VAR_SINT][1][1] = "int",
+ [PL_VAR_SINT][1][2] = "ivec2",
+ [PL_VAR_SINT][1][3] = "ivec3",
+ [PL_VAR_SINT][1][4] = "ivec4",
+ // unsigned integer vectors
+ [PL_VAR_UINT][1][1] = "uint",
+ [PL_VAR_UINT][1][2] = "uvec2",
+ [PL_VAR_UINT][1][3] = "uvec3",
+ [PL_VAR_UINT][1][4] = "uvec4",
+ };
+
+ if (var.dim_v > MAX_DIM || var.dim_m > MAX_DIM)
+ return NULL;
+
+ return types[var.type][var.dim_m][var.dim_v];
+}
+
+struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name)
+{
+ static const enum pl_var_type vartypes[] = {
+ [PL_FMT_FLOAT] = PL_VAR_FLOAT,
+ [PL_FMT_UNORM] = PL_VAR_FLOAT,
+ [PL_FMT_SNORM] = PL_VAR_FLOAT,
+ [PL_FMT_UINT] = PL_VAR_UINT,
+ [PL_FMT_SINT] = PL_VAR_SINT,
+ };
+
+ pl_assert(fmt->type < PL_ARRAY_SIZE(vartypes));
+ return (struct pl_var) {
+ .type = vartypes[fmt->type],
+ .name = name,
+ .dim_v = fmt->num_components,
+ .dim_m = 1,
+ .dim_a = 1,
+ };
+}
+
+struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var)
+{
+ size_t col_size = pl_var_type_size(var->type) * var->dim_v;
+ return (struct pl_var_layout) {
+ .offset = offset,
+ .stride = col_size,
+ .size = col_size * var->dim_m * var->dim_a,
+ };
+}
+
+struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var)
+{
+ size_t el_size = pl_var_type_size(var->type);
+
+ // std140 packing rules:
+ // 1. The size of scalar values is their size in bytes
+ // 2. The size of vectors is the vector length times the size of the base type
+ // 3. Matrices are treated like arrays of column vectors
+ // 4. The stride of array elements is the element size rounded up to the
+ // nearest multiple of vec4 (16 bytes)
+ // 5. All values are aligned to a multiple of their size (stride for arrays),
+ // with the exception of vec3, which is aligned like vec4
+ size_t stride = el_size * var->dim_v;
+ size_t align = stride;
+ if (var->dim_v == 3)
+ align += el_size;
+ if (var->dim_m * var->dim_a > 1)
+ stride = align = PL_ALIGN2(align, sizeof(float[4]));
+
+ return (struct pl_var_layout) {
+ .offset = PL_ALIGN2(offset, align),
+ .stride = stride,
+ .size = stride * var->dim_m * var->dim_a,
+ };
+}
+
+struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var)
+{
+ size_t el_size = pl_var_type_size(var->type);
+
+ // std430 packing rules: like std140, except the stride of arrays/matrices
+ // is not rounded up to a multiple of vec4, so e.g. float arrays are packed
+ // tightly (vec3 elements still get vec4 alignment)
+ size_t stride = el_size * var->dim_v;
+ size_t align = stride;
+ if (var->dim_v == 3)
+ align += el_size;
+ if (var->dim_m * var->dim_a > 1)
+ stride = align;
+
+ return (struct pl_var_layout) {
+ .offset = PL_ALIGN2(offset, align),
+ .stride = stride,
+ .size = stride * var->dim_m * var->dim_a,
+ };
+}
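+
+// Worked example (illustrative): a hypothetical `float foo[4]`, i.e. a
+// pl_var with dim_v = 1, dim_m = 1, dim_a = 4, at offset 0 gives:
+//
+//   pl_var_host_layout: stride = 4, size = 16 (tight C layout)
+//   pl_std430_layout: stride = 4, size = 16
+//   pl_std140_layout: stride = 16, size = 64 (array stride padded to vec4)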
+
+void memcpy_layout(void *dst_p, struct pl_var_layout dst_layout,
+ const void *src_p, struct pl_var_layout src_layout)
+{
+ uintptr_t src = (uintptr_t) src_p + src_layout.offset;
+ uintptr_t dst = (uintptr_t) dst_p + dst_layout.offset;
+
+ if (src_layout.stride == dst_layout.stride) {
+ pl_assert(dst_layout.size == src_layout.size);
+ memcpy((void *) dst, (const void *) src, src_layout.size);
+ return;
+ }
+
+ size_t stride = PL_MIN(src_layout.stride, dst_layout.stride);
+ uintptr_t end = src + src_layout.size;
+ uintptr_t dst_end = dst + dst_layout.size;
+ while (src < end) {
+ pl_assert(dst < dst_end); // never write past the end of the destination
+ memcpy((void *) dst, (const void *) src, stride);
+ src += src_layout.stride;
+ dst += dst_layout.stride;
+ }
+}
+
+int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ int ret = impl->desc_namespace(gpu, type);
+ pl_assert(ret >= 0 && ret < PL_DESC_TYPE_COUNT);
+ return ret;
+}
+
+const char *pl_desc_access_glsl_name(enum pl_desc_access mode)
+{
+ switch (mode) {
+ case PL_DESC_ACCESS_READWRITE: return "";
+ case PL_DESC_ACCESS_READONLY: return "readonly";
+ case PL_DESC_ACCESS_WRITEONLY: return "writeonly";
+ case PL_DESC_ACCESS_COUNT: break;
+ }
+
+ pl_unreachable();
+}
+
+const struct pl_blend_params pl_alpha_overlay = {
+ .src_rgb = PL_BLEND_SRC_ALPHA,
+ .dst_rgb = PL_BLEND_ONE_MINUS_SRC_ALPHA,
+ .src_alpha = PL_BLEND_ONE,
+ .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA,
+};
+
+static inline void log_shader_sources(pl_log log, enum pl_log_level level,
+ const struct pl_pass_params *params)
+{
+ if (!pl_msg_test(log, level) || !params->glsl_shader)
+ return;
+
+ switch (params->type) {
+ case PL_PASS_RASTER:
+ if (!params->vertex_shader)
+ return;
+ pl_msg(log, level, "vertex shader source:");
+ pl_msg_source(log, level, params->vertex_shader);
+ pl_msg(log, level, "fragment shader source:");
+ pl_msg_source(log, level, params->glsl_shader);
+ return;
+
+ case PL_PASS_COMPUTE:
+ pl_msg(log, level, "compute shader source:");
+ pl_msg_source(log, level, params->glsl_shader);
+ return;
+
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static void log_spec_constants(pl_log log, enum pl_log_level lev,
+ const struct pl_pass_params *params,
+ const void *constant_data)
+{
+ if (!constant_data || !params->num_constants || !pl_msg_test(log, lev))
+ return;
+
+ pl_msg(log, lev, "Specialization constant values:");
+
+ uintptr_t data_base = (uintptr_t) constant_data;
+ for (int i = 0; i < params->num_constants; i++) {
+ union {
+ int i;
+ unsigned u;
+ float f;
+ } *data = (void *) (data_base + params->constants[i].offset);
+ int id = params->constants[i].id;
+
+ switch (params->constants[i].type) {
+ case PL_VAR_SINT: pl_msg(log, lev, " constant_id=%d: %d", id, data->i); break;
+ case PL_VAR_UINT: pl_msg(log, lev, " constant_id=%d: %u", id, data->u); break;
+ case PL_VAR_FLOAT: pl_msg(log, lev, " constant_id=%d: %f", id, data->f); break;
+ default: pl_unreachable();
+ }
+ }
+}
+
+pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params)
+{
+ require(params->glsl_shader);
+ switch(params->type) {
+ case PL_PASS_RASTER:
+ require(params->vertex_shader);
+ require(params->vertex_stride % gpu->limits.align_vertex_stride == 0);
+ for (int i = 0; i < params->num_vertex_attribs; i++) {
+ struct pl_vertex_attrib va = params->vertex_attribs[i];
+ require(va.name);
+ require(va.fmt);
+ require(va.fmt->caps & PL_FMT_CAP_VERTEX);
+ require(va.offset + va.fmt->texel_size <= params->vertex_stride);
+ }
+
+ require(params->target_format);
+ require(params->target_format->caps & PL_FMT_CAP_RENDERABLE);
+ require(!params->blend_params || params->target_format->caps & PL_FMT_CAP_BLENDABLE);
+ require(!params->blend_params || params->load_target);
+ break;
+ case PL_PASS_COMPUTE:
+ require(gpu->glsl.compute);
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ size_t num_var_comps = 0;
+ for (int i = 0; i < params->num_variables; i++) {
+ struct pl_var var = params->variables[i];
+ num_var_comps += var.dim_v * var.dim_m * var.dim_a;
+ require(var.name);
+ require(pl_var_glsl_type_name(var));
+ }
+ require(num_var_comps <= gpu->limits.max_variable_comps);
+
+ require(params->num_constants <= gpu->limits.max_constants);
+ for (int i = 0; i < params->num_constants; i++)
+ require(params->constants[i].type);
+
+ for (int i = 0; i < params->num_descriptors; i++) {
+ struct pl_desc desc = params->descriptors[i];
+ require(desc.name);
+
+ // enforce disjoint descriptor bindings for each namespace
+ int namespace = pl_desc_namespace(gpu, desc.type);
+ for (int j = i+1; j < params->num_descriptors; j++) {
+ struct pl_desc other = params->descriptors[j];
+ require(desc.binding != other.binding ||
+ namespace != pl_desc_namespace(gpu, other.type));
+ }
+ }
+
+ require(params->push_constants_size <= gpu->limits.max_pushc_size);
+ require(params->push_constants_size == PL_ALIGN2(params->push_constants_size, 4));
+
+ log_shader_sources(gpu->log, PL_LOG_DEBUG, params);
+ log_spec_constants(gpu->log, PL_LOG_DEBUG, params, params->constant_data);
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ pl_pass pass = impl->pass_create(gpu, params);
+ if (!pass)
+ goto error;
+
+ return pass;
+
+error:
+ log_shader_sources(gpu->log, PL_LOG_ERR, params);
+ pl_log_stack_trace(gpu->log, PL_LOG_ERR);
+ pl_debug_abort();
+ return NULL;
+}
+
+void pl_pass_destroy(pl_gpu gpu, pl_pass *pass)
+{
+ if (!*pass)
+ return;
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->pass_destroy(gpu, *pass);
+ *pass = NULL;
+}
+
+void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+ pl_pass pass = params->pass;
+ struct pl_pass_run_params new = *params;
+
+ for (int i = 0; i < pass->params.num_descriptors; i++) {
+ struct pl_desc desc = pass->params.descriptors[i];
+ struct pl_desc_binding db = params->desc_bindings[i];
+ require(db.object);
+ switch (desc.type) {
+ case PL_DESC_SAMPLED_TEX: {
+ pl_tex tex = db.object;
+ pl_fmt fmt = tex->params.format;
+ require(tex->params.sampleable);
+ require(db.sample_mode != PL_TEX_SAMPLE_LINEAR || (fmt->caps & PL_FMT_CAP_LINEAR));
+ break;
+ }
+ case PL_DESC_STORAGE_IMG: {
+ pl_tex tex = db.object;
+ pl_fmt fmt = tex->params.format;
+ require(tex->params.storable);
+ require(desc.access != PL_DESC_ACCESS_READWRITE || (fmt->caps & PL_FMT_CAP_READWRITE));
+ break;
+ }
+ case PL_DESC_BUF_UNIFORM: {
+ pl_buf buf = db.object;
+ require(buf->params.uniform);
+ break;
+ }
+ case PL_DESC_BUF_STORAGE: {
+ pl_buf buf = db.object;
+ require(buf->params.storable);
+ break;
+ }
+ case PL_DESC_BUF_TEXEL_UNIFORM: {
+ pl_buf buf = db.object;
+ require(buf->params.uniform && buf->params.format);
+ break;
+ }
+ case PL_DESC_BUF_TEXEL_STORAGE: {
+ pl_buf buf = db.object;
+ pl_fmt fmt = buf->params.format;
+ require(buf->params.storable && buf->params.format);
+ require(desc.access != PL_DESC_ACCESS_READWRITE || (fmt->caps & PL_FMT_CAP_READWRITE));
+ break;
+ }
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+ }
+
+ for (int i = 0; i < params->num_var_updates; i++) {
+ struct pl_var_update vu = params->var_updates[i];
+ require(vu.index >= 0 && vu.index < pass->params.num_variables);
+ require(vu.data);
+ }
+
+ require(params->push_constants || !pass->params.push_constants_size);
+
+ switch (pass->params.type) {
+ case PL_PASS_RASTER: {
+ switch (pass->params.vertex_type) {
+ case PL_PRIM_TRIANGLE_LIST:
+ require(params->vertex_count % 3 == 0);
+ // fall through
+ case PL_PRIM_TRIANGLE_STRIP:
+ require(params->vertex_count >= 3);
+ break;
+ case PL_PRIM_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ require(!params->vertex_data ^ !params->vertex_buf);
+ if (params->vertex_buf) {
+ pl_buf vertex_buf = params->vertex_buf;
+ require(vertex_buf->params.drawable);
+ if (!params->index_data && !params->index_buf) {
+ // Cannot bounds check indexed draws
+ size_t vert_size = params->vertex_count * pass->params.vertex_stride;
+ require(params->buf_offset + vert_size <= vertex_buf->params.size);
+ }
+ }
+
+ require(!params->index_data || !params->index_buf);
+ if (params->index_buf) {
+ pl_buf index_buf = params->index_buf;
+ require(!params->vertex_data);
+ require(index_buf->params.drawable);
+ size_t index_size = pl_index_buf_size(params);
+ require(params->index_offset + index_size <= index_buf->params.size);
+ }
+
+ pl_tex target = params->target;
+ require(target);
+ require(pl_tex_params_dimension(target->params) == 2);
+ require(target->params.format->signature == pass->params.target_format->signature);
+ require(target->params.renderable);
+ pl_rect2d *vp = &new.viewport;
+ pl_rect2d *sc = &new.scissors;
+
+ // Sanitize viewport/scissors
+ if (!vp->x0 && !vp->x1)
+ vp->x1 = target->params.w;
+ if (!vp->y0 && !vp->y1)
+ vp->y1 = target->params.h;
+
+ if (!sc->x0 && !sc->x1)
+ sc->x1 = target->params.w;
+ if (!sc->y0 && !sc->y1)
+ sc->y1 = target->params.h;
+
+ // Constrain the scissors to the target dimension (to sanitize the
+ // underlying graphics API calls)
+ sc->x0 = PL_CLAMP(sc->x0, 0, target->params.w);
+ sc->y0 = PL_CLAMP(sc->y0, 0, target->params.h);
+ sc->x1 = PL_CLAMP(sc->x1, 0, target->params.w);
+ sc->y1 = PL_CLAMP(sc->y1, 0, target->params.h);
+
+ // Scissors wholly outside target -> silently drop pass (also needed
+ // to ensure we don't cause UB by specifying invalid scissors)
+ if (!pl_rect_w(*sc) || !pl_rect_h(*sc))
+ return;
+
+ require(pl_rect_w(*vp) > 0);
+ require(pl_rect_h(*vp) > 0);
+ require(pl_rect_w(*sc) > 0);
+ require(pl_rect_h(*sc) > 0);
+
+ if (!pass->params.load_target)
+ pl_tex_invalidate(gpu, target);
+ break;
+ }
+ case PL_PASS_COMPUTE:
+ for (int i = 0; i < PL_ARRAY_SIZE(params->compute_groups); i++) {
+ require(params->compute_groups[i] >= 0);
+ require(params->compute_groups[i] <= gpu->limits.max_dispatch[i]);
+ }
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->pass_run(gpu, &new);
+
+error:
+ return;
+}
+
+void pl_gpu_flush(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ if (impl->gpu_flush)
+ impl->gpu_flush(gpu);
+}
+
+void pl_gpu_finish(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->gpu_finish(gpu);
+}
+
+bool pl_gpu_is_failed(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ if (!impl->gpu_is_failed)
+ return false;
+
+ return impl->gpu_is_failed(gpu);
+}
+
+pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type)
+{
+ require(handle_type);
+ require(handle_type & gpu->export_caps.sync);
+ require(PL_ISPOT(handle_type));
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->sync_create(gpu, handle_type);
+
+error:
+ return NULL;
+}
+
+void pl_sync_destroy(pl_gpu gpu, pl_sync *sync)
+{
+ if (!*sync)
+ return;
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->sync_destroy(gpu, *sync);
+ *sync = NULL;
+}
+
+bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync)
+{
+ require(tex->params.import_handle || tex->params.export_handle);
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->tex_export(gpu, tex, sync);
+
+error:
+ if (tex->params.debug_tag)
+ PL_ERR(gpu, " for texture: %s", tex->params.debug_tag);
+ return false;
+}
+
+pl_timer pl_timer_create(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ if (!impl->timer_create)
+ return NULL;
+
+ return impl->timer_create(gpu);
+}
+
+void pl_timer_destroy(pl_gpu gpu, pl_timer *timer)
+{
+ if (!*timer)
+ return;
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ impl->timer_destroy(gpu, *timer);
+ *timer = NULL;
+}
+
+uint64_t pl_timer_query(pl_gpu gpu, pl_timer timer)
+{
+ if (!timer)
+ return 0;
+
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ return impl->timer_query(gpu, timer);
+}
diff --git a/src/gpu.h b/src/gpu.h
new file mode 100644
index 0000000..e915a50
--- /dev/null
+++ b/src/gpu.h
@@ -0,0 +1,207 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "log.h"
+
+#include <libplacebo/gpu.h>
+#include <libplacebo/dispatch.h>
+
+// To avoid having to include drm_fourcc.h
+#ifndef DRM_FORMAT_MOD_LINEAR
+#define DRM_FORMAT_MOD_LINEAR UINT64_C(0x0)
+#define DRM_FORMAT_MOD_INVALID ((UINT64_C(1) << 56) - 1)
+#endif
+
+// This struct must be the first member of the gpu's priv struct. The `pl_gpu`
+// helpers will cast the priv struct to this struct!
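+//
+// For example (illustrative only, not a real backend), an implementation's
+// private struct might look like:
+//
+//     struct priv {
+//         struct pl_gpu_fns impl; // must come first
+//         /* backend-specific state... */
+//     };
+//
+// so that PL_PRIV(gpu) can be cast to `struct pl_gpu_fns *`.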
+
+#define GPU_PFN(name) __typeof__(pl_##name) *name
+struct pl_gpu_fns {
+ // This is a pl_dispatch used (on the pl_gpu itself!) for the purposes of
+ // dispatching compute shaders for performing various emulation tasks (e.g.
+ // partial clears, blits or emulated texture transfers, see below).
+ //
+ // Warning: Care must be taken to avoid recursive calls.
+ pl_dispatch dp;
+
+ // Internal cache, or NULL. Set by the user (via pl_gpu_set_cache).
+ _Atomic(pl_cache) cache;
+
+ // Destructors: These also free the corresponding objects, but they
+ // must not be called on NULL. (The NULL checks are done by the pl_*_destroy
+ // wrappers)
+ void (*destroy)(pl_gpu gpu);
+ void (*tex_destroy)(pl_gpu, pl_tex);
+ void (*buf_destroy)(pl_gpu, pl_buf);
+ void (*pass_destroy)(pl_gpu, pl_pass);
+ void (*sync_destroy)(pl_gpu, pl_sync);
+ void (*timer_destroy)(pl_gpu, pl_timer);
+
+ GPU_PFN(tex_create);
+ GPU_PFN(tex_invalidate); // optional
+ GPU_PFN(tex_clear_ex); // optional if no blittable formats
+ GPU_PFN(tex_blit); // optional if no blittable formats
+ GPU_PFN(tex_upload);
+ GPU_PFN(tex_download);
+ GPU_PFN(tex_poll); // optional: if NULL, textures are always free to use
+ GPU_PFN(buf_create);
+ GPU_PFN(buf_write);
+ GPU_PFN(buf_read);
+ GPU_PFN(buf_copy);
+ GPU_PFN(buf_export); // optional if !gpu->export_caps.buf
+ GPU_PFN(buf_poll); // optional: if NULL, buffers are always free to use
+ GPU_PFN(desc_namespace);
+ GPU_PFN(pass_create);
+ GPU_PFN(pass_run);
+ GPU_PFN(sync_create); // optional if !gpu->export_caps.sync
+ GPU_PFN(tex_export); // optional if !gpu->export_caps.sync
+ GPU_PFN(timer_create); // optional
+ GPU_PFN(timer_query); // optional
+ GPU_PFN(gpu_flush); // optional
+ GPU_PFN(gpu_finish);
+ GPU_PFN(gpu_is_failed); // optional
+};
+#undef GPU_PFN
+
+// All resources such as textures and buffers allocated from the GPU must be
+// destroyed before calling pl_destroy.
+void pl_gpu_destroy(pl_gpu gpu);
+
+// Returns true if the device supports interop. This is considered to be
+// the case if at least one of `gpu->export/import_caps` is nonzero.
+static inline bool pl_gpu_supports_interop(pl_gpu gpu)
+{
+ return gpu->export_caps.tex ||
+ gpu->import_caps.tex ||
+ gpu->export_caps.buf ||
+ gpu->import_caps.buf ||
+ gpu->export_caps.sync ||
+ gpu->import_caps.sync;
+}
+
+// Returns the GPU-internal `pl_dispatch` and `pl_cache` objects.
+pl_dispatch pl_gpu_dispatch(pl_gpu gpu);
+pl_cache pl_gpu_cache(pl_gpu gpu);
+
+// GPU-internal helpers: these should not be used outside of GPU implementations
+
+// This performs several tasks. It sorts the format list, logs GPU metadata,
+// performs verification and fixes up backwards compatibility fields. This
+// should be returned as the last step when creating a `pl_gpu`.
+pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu);
+
+// Look up the right GLSL image format qualifier from a partially filled-in
+// pl_fmt, or NULL if the format does not have a legal matching GLSL name.
+//
+// `components` may differ from fmt->num_components (for emulated formats)
+const char *pl_fmt_glsl_format(pl_fmt fmt, int components);
+
+// Look up the right fourcc from a partially filled-in pl_fmt, or 0 if the
+// format does not have a legal matching fourcc format.
+uint32_t pl_fmt_fourcc(pl_fmt fmt);
+
+// Compute the total size (in bytes) of a texture transfer operation
+size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par);
+
+// Split a tex transfer into slices. For emulated formats, `texel_fmt` gives
+// the format of the underlying texel buffer.
+//
+// Returns the number of slices, or 0 on error (e.g. no SSBOs available).
+// `out_slices` must be freed by caller (on success).
+int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt,
+ const struct pl_tex_transfer_params *params,
+ struct pl_tex_transfer_params **out_slices);
+
+// Helper that wraps pl_tex_upload/download using texture upload buffers to
+// ensure that params->buf is always set.
+bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+
+// This requires that params.buf has been set and is of type PL_BUF_TEXEL_*
+bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+
+// Both `src` and `dst` must be storable. `src` must also be sampleable if the
+// blit requires linear sampling. Returns false if these conditions are unmet.
+bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params);
+
+// Helper to do a 2D blit with stretch and scale using a raster pass
+void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params);
+
+// Helper for GPU-accelerated endian swapping
+//
+// Note: `src` and `dst` can be the same buffer, for an in-place operation. In
+// this case, `src_offset` and `dst_offset` must be the same.
+struct pl_buf_copy_swap_params {
+ // Source of the copy operation. Must be `storable`.
+ pl_buf src;
+ size_t src_offset;
+
+ // Destination of the copy operation. Must be `storable`.
+ pl_buf dst;
+ size_t dst_offset;
+
+ // Number of bytes to copy. Must be a multiple of 4.
+ size_t size;
+
+ // Underlying word size. Must be 2 (for 16-bit swap) or 4 (for 32-bit swap)
+ int wordsize;
+};
+
+bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params);
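+
+// Illustrative usage sketch (hypothetical `buf`, whose size is assumed to be
+// a multiple of 4): swapping the byte order of every 16-bit word in place:
+//
+//     bool ok = pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) {
+//         .src = buf,
+//         .dst = buf, // in-place; both offsets left at 0
+//         .size = buf->params.size,
+//         .wordsize = 2,
+//     });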
+
+void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params);
+
+// Make a deep-copy of the pass params. Note: cached_program etc. are not
+// copied, but cleared explicitly.
+struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params);
+
+// Helper to compute the size of an index buffer
+static inline size_t pl_index_buf_size(const struct pl_pass_run_params *params)
+{
+ switch (params->index_fmt) {
+ case PL_INDEX_UINT16: return params->vertex_count * sizeof(uint16_t);
+ case PL_INDEX_UINT32: return params->vertex_count * sizeof(uint32_t);
+ case PL_INDEX_FORMAT_COUNT: break;
+ }
+
+ pl_unreachable();
+}
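+
+// e.g. (illustrative): a draw with vertex_count = 100 and index_fmt =
+// PL_INDEX_UINT16 requires 100 * sizeof(uint16_t) = 200 bytes of index data.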
+
+// Helper to compute the size of a vertex buffer required to fit all indices
+size_t pl_vertex_buf_size(const struct pl_pass_run_params *params);
+
+// Utility function for pretty-printing UUIDs
+#define UUID_SIZE 16
+#define PRINT_UUID(uuid) (print_uuid((char[3 * UUID_SIZE]){0}, (uuid)))
+const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE]);
+
+// Helper to pretty-print fourcc codes
+#define PRINT_FOURCC(fcc) \
+ (!(fcc) ? "" : (char[5]) { \
+ (fcc) & 0xFF, \
+ ((fcc) >> 8) & 0xFF, \
+ ((fcc) >> 16) & 0xFF, \
+ ((fcc) >> 24) & 0xFF \
+ })
+
+#define DRM_MOD_SIZE 26
+#define PRINT_DRM_MOD(mod) (print_drm_mod((char[DRM_MOD_SIZE]){0}, (mod)))
+const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod);
diff --git a/src/gpu/utils.c b/src/gpu/utils.c
new file mode 100644
index 0000000..40ca84d
--- /dev/null
+++ b/src/gpu/utils.c
@@ -0,0 +1,1288 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "common.h"
+#include "shaders.h"
+#include "gpu.h"
+
+// GPU-internal helpers
+
+static int cmp_fmt(const void *pa, const void *pb)
+{
+ pl_fmt a = *(pl_fmt *)pa;
+ pl_fmt b = *(pl_fmt *)pb;
+
+ // Always prefer non-opaque formats
+ if (a->opaque != b->opaque)
+ return PL_CMP(a->opaque, b->opaque);
+
+ // Always prefer non-emulated formats
+ if (a->emulated != b->emulated)
+ return PL_CMP(a->emulated, b->emulated);
+
+ int ca = __builtin_popcount(a->caps),
+ cb = __builtin_popcount(b->caps);
+ if (ca != cb)
+ return -PL_CMP(ca, cb); // invert to sort higher values first
+
+ // If the population count is the same but the caps are different, prefer
+ // the caps with a "lower" value (which tend to be more fundamental caps)
+ if (a->caps != b->caps)
+ return PL_CMP(a->caps, b->caps);
+
+ // If the capabilities are equal, sort based on the component attributes
+ for (int i = 0; i < PL_ARRAY_SIZE(a->component_depth); i++) {
+ int da = a->component_depth[i],
+ db = b->component_depth[i];
+ if (da != db)
+ return PL_CMP(da, db);
+
+ int ha = a->host_bits[i],
+ hb = b->host_bits[i];
+ if (ha != hb)
+ return PL_CMP(ha, hb);
+
+ int oa = a->sample_order[i],
+ ob = b->sample_order[i];
+ if (oa != ob)
+ return PL_CMP(oa, ob);
+ }
+
+ // Fall back to sorting by the name (for stability)
+ return strcmp(a->name, b->name);
+}
+
+#define FMT_BOOL(letter, cap) ((cap) ? (letter) : '-')
+#define FMT_IDX4(f) (f)[0], (f)[1], (f)[2], (f)[3]
+
+static void print_formats(pl_gpu gpu)
+{
+ if (!pl_msg_test(gpu->log, PL_LOG_DEBUG))
+ return;
+
+#define CAP_HEADER "%-12s"
+#define CAP_FIELDS "%c%c%c%c%c%c%c%c%c%c%c%c"
+#define CAP_VALUES \
+ FMT_BOOL('S', fmt->caps & PL_FMT_CAP_SAMPLEABLE), \
+ FMT_BOOL('s', fmt->caps & PL_FMT_CAP_STORABLE), \
+ FMT_BOOL('L', fmt->caps & PL_FMT_CAP_LINEAR), \
+ FMT_BOOL('R', fmt->caps & PL_FMT_CAP_RENDERABLE), \
+ FMT_BOOL('b', fmt->caps & PL_FMT_CAP_BLENDABLE), \
+ FMT_BOOL('B', fmt->caps & PL_FMT_CAP_BLITTABLE), \
+ FMT_BOOL('V', fmt->caps & PL_FMT_CAP_VERTEX), \
+ FMT_BOOL('u', fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM), \
+ FMT_BOOL('t', fmt->caps & PL_FMT_CAP_TEXEL_STORAGE), \
+ FMT_BOOL('H', fmt->caps & PL_FMT_CAP_HOST_READABLE), \
+ FMT_BOOL('W', fmt->caps & PL_FMT_CAP_READWRITE), \
+ FMT_BOOL('G', fmt->gatherable)
+
+ PL_DEBUG(gpu, "GPU texture formats:");
+ PL_DEBUG(gpu, " %-20s %-6s %-4s %-4s " CAP_HEADER " %-3s %-13s %-13s %-10s %-10s %-6s",
+ "NAME", "TYPE", "SIZE", "COMP", "CAPS", "EMU", "DEPTH", "HOST_BITS",
+ "GLSL_TYPE", "GLSL_FMT", "FOURCC");
+ for (int n = 0; n < gpu->num_formats; n++) {
+ pl_fmt fmt = gpu->formats[n];
+
+ static const char *types[] = {
+ [PL_FMT_UNKNOWN] = "UNKNOWN",
+ [PL_FMT_UNORM] = "UNORM",
+ [PL_FMT_SNORM] = "SNORM",
+ [PL_FMT_UINT] = "UINT",
+ [PL_FMT_SINT] = "SINT",
+ [PL_FMT_FLOAT] = "FLOAT",
+ };
+
+ static const char idx_map[4] = {'R', 'G', 'B', 'A'};
+ char indices[4] = {' ', ' ', ' ', ' '};
+ if (!fmt->opaque) {
+ for (int i = 0; i < fmt->num_components; i++)
+ indices[i] = idx_map[fmt->sample_order[i]];
+ }
+
+ PL_DEBUG(gpu, " %-20s %-6s %-4zu %c%c%c%c " CAP_FIELDS " %-3s "
+ "{%-2d %-2d %-2d %-2d} {%-2d %-2d %-2d %-2d} %-10s %-10s %-6s",
+ fmt->name, types[fmt->type], fmt->texel_size,
+ FMT_IDX4(indices), CAP_VALUES, fmt->emulated ? "y" : "n",
+ FMT_IDX4(fmt->component_depth), FMT_IDX4(fmt->host_bits),
+ PL_DEF(fmt->glsl_type, ""), PL_DEF(fmt->glsl_format, ""),
+ PRINT_FOURCC(fmt->fourcc));
+
+#undef CAP_HEADER
+#undef CAP_FIELDS
+#undef CAP_VALUES
+
+ for (int i = 0; i < fmt->num_modifiers; i++) {
+ PL_TRACE(gpu, " modifiers[%d]: %s",
+ i, PRINT_DRM_MOD(fmt->modifiers[i]));
+ }
+ }
+}
+
+pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu)
+{
+ // Sort formats
+ qsort(gpu->formats, gpu->num_formats, sizeof(pl_fmt), cmp_fmt);
+
+ // Verification
+ pl_assert(gpu->limits.max_tex_2d_dim);
+ pl_assert(gpu->limits.max_variable_comps || gpu->limits.max_ubo_size);
+ pl_assert(gpu->limits.max_ubo_size <= gpu->limits.max_buf_size);
+ pl_assert(gpu->limits.max_ssbo_size <= gpu->limits.max_buf_size);
+ pl_assert(gpu->limits.max_vbo_size <= gpu->limits.max_buf_size);
+ pl_assert(gpu->limits.max_mapped_size <= gpu->limits.max_buf_size);
+
+ for (int n = 0; n < gpu->num_formats; n++) {
+ pl_fmt fmt = gpu->formats[n];
+ pl_assert(fmt->name);
+ pl_assert(fmt->type);
+ pl_assert(fmt->num_components);
+ pl_assert(fmt->internal_size);
+ pl_assert(fmt->opaque ? !fmt->texel_size : fmt->texel_size);
+ pl_assert(!fmt->gatherable || (fmt->caps & PL_FMT_CAP_SAMPLEABLE));
+ for (int i = 0; i < fmt->num_components; i++) {
+ pl_assert(fmt->component_depth[i]);
+ pl_assert(fmt->opaque ? !fmt->host_bits[i] : fmt->host_bits[i]);
+ }
+ for (int i = 0; i < fmt->num_planes; i++)
+ pl_assert(fmt->planes[i].format);
+
+ enum pl_fmt_caps texel_caps = PL_FMT_CAP_VERTEX |
+ PL_FMT_CAP_TEXEL_UNIFORM |
+ PL_FMT_CAP_TEXEL_STORAGE;
+
+ if (fmt->caps & texel_caps) {
+ pl_assert(fmt->glsl_type);
+ pl_assert(!fmt->opaque);
+ }
+ if (!fmt->opaque) {
+ pl_assert(fmt->texel_size && fmt->texel_align);
+ pl_assert((fmt->texel_size % fmt->texel_align) == 0);
+ pl_assert(fmt->internal_size == fmt->texel_size || fmt->emulated);
+ } else {
+ pl_assert(!fmt->texel_size && !fmt->texel_align);
+ pl_assert(!(fmt->caps & PL_FMT_CAP_HOST_READABLE));
+ }
+
+ // Assert uniqueness of name
+ for (int o = n + 1; o < gpu->num_formats; o++)
+ pl_assert(strcmp(fmt->name, gpu->formats[o]->name) != 0);
+ }
+
+ // Print info
+ PL_INFO(gpu, "GPU information:");
+
+#define LOG(fmt, field) \
+ PL_INFO(gpu, " %-26s %" fmt, #field ":", gpu->LOG_STRUCT.field)
+
+#define LOG_STRUCT glsl
+ PL_INFO(gpu, " GLSL version: %d%s", gpu->glsl.version,
+ gpu->glsl.vulkan ? " (vulkan)" : gpu->glsl.gles ? " es" : "");
+ if (gpu->glsl.compute) {
+ LOG("zu", max_shmem_size);
+ LOG(PRIu32, max_group_threads);
+ LOG(PRIu32, max_group_size[0]);
+ LOG(PRIu32, max_group_size[1]);
+ LOG(PRIu32, max_group_size[2]);
+ }
+ LOG(PRIu32, subgroup_size);
+ LOG(PRIi16, min_gather_offset);
+ LOG(PRIi16, max_gather_offset);
+#undef LOG_STRUCT
+
+#define LOG_STRUCT limits
+ PL_INFO(gpu, " Limits:");
+ // pl_gpu
+ LOG("d", thread_safe);
+ LOG("d", callbacks);
+ // pl_buf
+ LOG("zu", max_buf_size);
+ LOG("zu", max_ubo_size);
+ LOG("zu", max_ssbo_size);
+ LOG("zu", max_vbo_size);
+ LOG("zu", max_mapped_size);
+ LOG(PRIu64, max_buffer_texels);
+ LOG("zu", align_host_ptr);
+ LOG("d", host_cached);
+ // pl_tex
+ LOG(PRIu32, max_tex_1d_dim);
+ LOG(PRIu32, max_tex_2d_dim);
+ LOG(PRIu32, max_tex_3d_dim);
+ LOG("d", blittable_1d_3d);
+ LOG("d", buf_transfer);
+ LOG("zu", align_tex_xfer_pitch);
+ LOG("zu", align_tex_xfer_offset);
+ // pl_pass
+ LOG("zu", max_variable_comps);
+ LOG("zu", max_constants);
+ LOG("zu", max_pushc_size);
+ LOG("zu", align_vertex_stride);
+ if (gpu->glsl.compute) {
+ LOG(PRIu32, max_dispatch[0]);
+ LOG(PRIu32, max_dispatch[1]);
+ LOG(PRIu32, max_dispatch[2]);
+ }
+ LOG(PRIu32, fragment_queues);
+ LOG(PRIu32, compute_queues);
+#undef LOG_STRUCT
+#undef LOG
+
+ if (pl_gpu_supports_interop(gpu)) {
+ PL_INFO(gpu, " External API interop:");
+
+ PL_INFO(gpu, " UUID: %s", PRINT_UUID(gpu->uuid));
+ PL_INFO(gpu, " PCI: %04x:%02x:%02x:%x",
+ gpu->pci.domain, gpu->pci.bus, gpu->pci.device, gpu->pci.function);
+ PL_INFO(gpu, " buf export caps: 0x%x",
+ (unsigned int) gpu->export_caps.buf);
+ PL_INFO(gpu, " buf import caps: 0x%x",
+ (unsigned int) gpu->import_caps.buf);
+ PL_INFO(gpu, " tex export caps: 0x%x",
+ (unsigned int) gpu->export_caps.tex);
+ PL_INFO(gpu, " tex import caps: 0x%x",
+ (unsigned int) gpu->import_caps.tex);
+ PL_INFO(gpu, " sync export caps: 0x%x",
+ (unsigned int) gpu->export_caps.sync);
+ PL_INFO(gpu, " sync import caps: 0x%x",
+ (unsigned int) gpu->import_caps.sync);
+ }
+
+ print_formats(gpu);
+
+ // Finally, create a `pl_dispatch` object for internal operations
+ struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ atomic_init(&impl->cache, NULL);
+ impl->dp = pl_dispatch_create(gpu->log, gpu);
+ return gpu;
+}
+
+struct glsl_fmt {
+ enum pl_fmt_type type;
+ int num_components;
+ int depth[4];
+ const char *glsl_format;
+};
+
+// List taken from the GLSL specification. (Yes, GLSL supports only exactly
+// these formats with exactly these names)
+static const struct glsl_fmt pl_glsl_fmts[] = {
+ {PL_FMT_FLOAT, 1, {16}, "r16f"},
+ {PL_FMT_FLOAT, 1, {32}, "r32f"},
+ {PL_FMT_FLOAT, 2, {16, 16}, "rg16f"},
+ {PL_FMT_FLOAT, 2, {32, 32}, "rg32f"},
+ {PL_FMT_FLOAT, 4, {16, 16, 16, 16}, "rgba16f"},
+ {PL_FMT_FLOAT, 4, {32, 32, 32, 32}, "rgba32f"},
+ {PL_FMT_FLOAT, 3, {11, 11, 10}, "r11f_g11f_b10f"},
+
+ {PL_FMT_UNORM, 1, {8}, "r8"},
+ {PL_FMT_UNORM, 1, {16}, "r16"},
+ {PL_FMT_UNORM, 2, {8, 8}, "rg8"},
+ {PL_FMT_UNORM, 2, {16, 16}, "rg16"},
+ {PL_FMT_UNORM, 4, {8, 8, 8, 8}, "rgba8"},
+ {PL_FMT_UNORM, 4, {16, 16, 16, 16}, "rgba16"},
+ {PL_FMT_UNORM, 4, {10, 10, 10, 2}, "rgb10_a2"},
+
+ {PL_FMT_SNORM, 1, {8}, "r8_snorm"},
+ {PL_FMT_SNORM, 1, {16}, "r16_snorm"},
+ {PL_FMT_SNORM, 2, {8, 8}, "rg8_snorm"},
+ {PL_FMT_SNORM, 2, {16, 16}, "rg16_snorm"},
+ {PL_FMT_SNORM, 4, {8, 8, 8, 8}, "rgba8_snorm"},
+ {PL_FMT_SNORM, 4, {16, 16, 16, 16}, "rgba16_snorm"},
+
+ {PL_FMT_UINT, 1, {8}, "r8ui"},
+ {PL_FMT_UINT, 1, {16}, "r16ui"},
+ {PL_FMT_UINT, 1, {32}, "r32ui"},
+ {PL_FMT_UINT, 2, {8, 8}, "rg8ui"},
+ {PL_FMT_UINT, 2, {16, 16}, "rg16ui"},
+ {PL_FMT_UINT, 2, {32, 32}, "rg32ui"},
+ {PL_FMT_UINT, 4, {8, 8, 8, 8}, "rgba8ui"},
+ {PL_FMT_UINT, 4, {16, 16, 16, 16}, "rgba16ui"},
+ {PL_FMT_UINT, 4, {32, 32, 32, 32}, "rgba32ui"},
+ {PL_FMT_UINT, 4, {10, 10, 10, 2}, "rgb10_a2ui"},
+
+ {PL_FMT_SINT, 1, {8}, "r8i"},
+ {PL_FMT_SINT, 1, {16}, "r16i"},
+ {PL_FMT_SINT, 1, {32}, "r32i"},
+ {PL_FMT_SINT, 2, {8, 8}, "rg8i"},
+ {PL_FMT_SINT, 2, {16, 16}, "rg16i"},
+ {PL_FMT_SINT, 2, {32, 32}, "rg32i"},
+ {PL_FMT_SINT, 4, {8, 8, 8, 8}, "rgba8i"},
+ {PL_FMT_SINT, 4, {16, 16, 16, 16}, "rgba16i"},
+ {PL_FMT_SINT, 4, {32, 32, 32, 32}, "rgba32i"},
+};
+
+const char *pl_fmt_glsl_format(pl_fmt fmt, int components)
+{
+ if (fmt->opaque)
+ return NULL;
+
+ for (int n = 0; n < PL_ARRAY_SIZE(pl_glsl_fmts); n++) {
+ const struct glsl_fmt *gfmt = &pl_glsl_fmts[n];
+
+ if (fmt->type != gfmt->type)
+ continue;
+ if (components != gfmt->num_components)
+ continue;
+
+ // The component order is irrelevant, so reorder the depths into their
+ // canonical position (as given by sample_order) before comparing
+ int depth[4] = {0};
+ for (int i = 0; i < fmt->num_components; i++)
+ depth[fmt->sample_order[i]] = fmt->component_depth[i];
+
+ // Copy over any emulated components
+ for (int i = fmt->num_components; i < components; i++)
+ depth[i] = gfmt->depth[i];
+
+ for (int i = 0; i < PL_ARRAY_SIZE(depth); i++) {
+ if (depth[i] != gfmt->depth[i])
+ goto next_fmt;
+ }
+
+ return gfmt->glsl_format;
+
+next_fmt: ; // equivalent to `continue`
+ }
+
+ return NULL;
+}
+
+#define FOURCC(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \
+ ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))
+
+struct pl_fmt_fourcc {
+ const char *name;
+ uint32_t fourcc;
+};
+
+static const struct pl_fmt_fourcc pl_fmt_fourccs[] = {
+ // 8 bpp red
+ {"r8", FOURCC('R','8',' ',' ')},
+ // 16 bpp red
+ {"r16", FOURCC('R','1','6',' ')},
+ // 16 bpp rg
+ {"rg8", FOURCC('G','R','8','8')},
+ {"gr8", FOURCC('R','G','8','8')},
+ // 32 bpp rg
+ {"rg16", FOURCC('G','R','3','2')},
+ {"gr16", FOURCC('R','G','3','2')},
+ // 8 bpp rgb: N/A
+ // 16 bpp rgb
+ {"argb4", FOURCC('B','A','1','2')},
+ {"abgr4", FOURCC('R','A','1','2')},
+ {"rgba4", FOURCC('A','B','1','2')},
+ {"bgra4", FOURCC('A','R','1','2')},
+
+ {"a1rgb5", FOURCC('B','A','1','5')},
+ {"a1bgr5", FOURCC('R','A','1','5')},
+ {"rgb5a1", FOURCC('A','B','1','5')},
+ {"bgr5a1", FOURCC('A','R','1','5')},
+
+ {"rgb565", FOURCC('B','G','1','6')},
+ {"bgr565", FOURCC('R','G','1','6')},
+ // 24 bpp rgb
+ {"rgb8", FOURCC('B','G','2','4')},
+ {"bgr8", FOURCC('R','G','2','4')},
+ // 32 bpp rgb
+ {"argb8", FOURCC('B','A','2','4')},
+ {"abgr8", FOURCC('R','A','2','4')},
+ {"rgba8", FOURCC('A','B','2','4')},
+ {"bgra8", FOURCC('A','R','2','4')},
+
+ {"a2rgb10", FOURCC('B','A','3','0')},
+ {"a2bgr10", FOURCC('R','A','3','0')},
+ {"rgb10a2", FOURCC('A','B','3','0')},
+ {"bgr10a2", FOURCC('A','R','3','0')},
+ // 64bpp rgb
+ {"rgba16hf", FOURCC('A','B','4','H')},
+ {"bgra16hf", FOURCC('A','R','4','H')},
+
+ // packed 16-bit formats
+ // rx10: N/A
+ // rxgx10: N/A
+ {"rxgxbxax10", FOURCC('A','B','1','0')},
+ // rx12: N/A
+ // rxgx12: N/A
+ // rxgxbxax12: N/A
+
+ // planar formats
+ {"g8_b8_r8_420", FOURCC('Y','U','1','2')},
+ {"g8_b8_r8_422", FOURCC('Y','U','1','6')},
+ {"g8_b8_r8_444", FOURCC('Y','U','2','4')},
+ // g16_b16_r16_*: N/A
+ // gx10_bx10_rx10_42*: N/A
+ {"gx10_bx10_rx10_444", FOURCC('Q','4','1','0')},
+ // gx12_bx12_rx12_*: N/A
+ {"g8_br8_420", FOURCC('N','V','1','2')},
+ {"g8_br8_422", FOURCC('N','V','1','6')},
+ {"g8_br8_444", FOURCC('N','V','2','4')},
+ {"g16_br16_420", FOURCC('P','0','1','6')},
+ // g16_br16_422: N/A
+ // g16_br16_444: N/A
+ {"gx10_bxrx10_420", FOURCC('P','0','1','0')},
+ {"gx10_bxrx10_422", FOURCC('P','2','1','0')},
+ // gx10_bxrx10_444: N/A
+ {"gx12_bxrx12_420", FOURCC('P','0','1','2')},
+ // gx12_bxrx12_422: N/A
+ // gx12_bxrx12_444: N/A
+};
+
+uint32_t pl_fmt_fourcc(pl_fmt fmt)
+{
+ for (int n = 0; n < PL_ARRAY_SIZE(pl_fmt_fourccs); n++) {
+ const struct pl_fmt_fourcc *fourcc = &pl_fmt_fourccs[n];
+ if (strcmp(fmt->name, fourcc->name) == 0)
+ return fourcc->fourcc;
+ }
+
+ return 0; // no matching format
+}
+
+size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par)
+{
+ int w = pl_rect_w(par->rc), h = pl_rect_h(par->rc), d = pl_rect_d(par->rc);
+ size_t pixel_pitch = par->tex->params.format->texel_size;
+
+ // This generates the absolute bare minimum size of a buffer required to
+ // hold the data of a texture upload/download, by including stride padding
+ // only where strictly necessary.
+ return (d - 1) * par->depth_pitch + (h - 1) * par->row_pitch + w * pixel_pitch;
+}
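+
+// Worked example (illustrative): a 16x16 2D transfer of a 4-byte-per-texel
+// format with row_pitch = 64 and depth_pitch = 1024 needs
+// (1-1)*1024 + (16-1)*64 + 16*4 = 1024 bytes; only the last row is counted
+// at its tight size (w * texel_size) rather than a full row_pitch.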
+
+int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt,
+ const struct pl_tex_transfer_params *params,
+ struct pl_tex_transfer_params **out_slices)
+{
+ PL_ARRAY(struct pl_tex_transfer_params) slices = {0};
+ size_t max_size = params->buf ? gpu->limits.max_buf_size : SIZE_MAX;
+
+ pl_fmt fmt = params->tex->params.format;
+ if (fmt->emulated && texel_fmt) {
+ size_t max_texel = gpu->limits.max_buffer_texels * texel_fmt->texel_size;
+ max_size = PL_MIN(gpu->limits.max_ssbo_size, max_texel);
+ }
+
+ int slice_w = pl_rect_w(params->rc);
+ int slice_h = pl_rect_h(params->rc);
+ int slice_d = pl_rect_d(params->rc);
+
+ slice_d = PL_MIN(slice_d, max_size / params->depth_pitch);
+ if (!slice_d) {
+ slice_d = 1;
+ slice_h = PL_MIN(slice_h, max_size / params->row_pitch);
+ if (!slice_h) {
+ slice_h = 1;
+ slice_w = PL_MIN(slice_w, max_size / fmt->texel_size);
+ pl_assert(slice_w);
+ }
+ }
+
+ for (int z = 0; z < pl_rect_d(params->rc); z += slice_d) {
+ for (int y = 0; y < pl_rect_h(params->rc); y += slice_h) {
+ for (int x = 0; x < pl_rect_w(params->rc); x += slice_w) {
+ struct pl_tex_transfer_params slice = *params;
+ slice.callback = NULL;
+ slice.rc.x0 = params->rc.x0 + x;
+ slice.rc.y0 = params->rc.y0 + y;
+ slice.rc.z0 = params->rc.z0 + z;
+ slice.rc.x1 = PL_MIN(slice.rc.x0 + slice_w, params->rc.x1);
+ slice.rc.y1 = PL_MIN(slice.rc.y0 + slice_h, params->rc.y1);
+ slice.rc.z1 = PL_MIN(slice.rc.z0 + slice_d, params->rc.z1);
+
+ const size_t offset = z * params->depth_pitch +
+ y * params->row_pitch +
+ x * fmt->texel_size;
+ if (slice.ptr) {
+ slice.ptr = (uint8_t *) slice.ptr + offset;
+ } else {
+ slice.buf_offset += offset;
+ }
+
+ PL_ARRAY_APPEND(NULL, slices, slice);
+ }
+ }
+ }
+
+ *out_slices = slices.elem;
+ return slices.num;
+}
+
+bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ if (params->buf)
+ return pl_tex_upload(gpu, params);
+
+ struct pl_buf_params bufparams = {
+ .size = pl_tex_transfer_size(params),
+ .debug_tag = PL_DEBUG_TAG,
+ };
+
+ struct pl_tex_transfer_params fixed = *params;
+ fixed.ptr = NULL;
+
+ // If we can import host pointers directly, and the function is being used
+ // asynchronously, then we can use host pointer import to skip a memcpy. In
+ // the synchronous case, we still force a host memcpy to avoid stalling the
+ // host until the GPU memcpy completes.
+ bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR;
+ can_import &= !params->no_import;
+ can_import &= params->callback != NULL;
+ can_import &= bufparams.size > (32 << 10); // 32 KiB
+ if (can_import) {
+ bufparams.import_handle = PL_HANDLE_HOST_PTR;
+ bufparams.shared_mem = (struct pl_shared_mem) {
+ .handle.ptr = params->ptr,
+ .size = bufparams.size,
+ .offset = 0,
+ };
+
+ // Suppress errors for this test because it may fail, in which case we
+ // want to silently fall back.
+ pl_log_level_cap(gpu->log, PL_LOG_DEBUG);
+ fixed.buf = pl_buf_create(gpu, &bufparams);
+ pl_log_level_cap(gpu->log, PL_LOG_NONE);
+ }
+
+ if (!fixed.buf) {
+ bufparams.import_handle = 0;
+ bufparams.host_writable = true;
+ fixed.buf = pl_buf_create(gpu, &bufparams);
+ if (!fixed.buf)
+ return false;
+ pl_buf_write(gpu, fixed.buf, 0, params->ptr, bufparams.size);
+ if (params->callback)
+ params->callback(params->priv);
+ fixed.callback = NULL;
+ }
+
+ bool ok = pl_tex_upload(gpu, &fixed);
+ pl_buf_destroy(gpu, &fixed.buf);
+ return ok;
+}
+
+struct pbo_cb_ctx {
+ pl_gpu gpu;
+ pl_buf buf;
+ void *ptr;
+ void (*callback)(void *priv);
+ void *priv;
+};
+
+static void pbo_download_cb(void *priv)
+{
+ struct pbo_cb_ctx *p = priv;
+ pl_buf_read(p->gpu, p->buf, 0, p->ptr, p->buf->params.size);
+ pl_buf_destroy(p->gpu, &p->buf);
+
+ // Run the original callback
+ p->callback(p->priv);
+ pl_free(priv);
+}
+
+bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ if (params->buf)
+ return pl_tex_download(gpu, params);
+
+ pl_buf buf = NULL;
+ struct pl_buf_params bufparams = {
+ .size = pl_tex_transfer_size(params),
+ .debug_tag = PL_DEBUG_TAG,
+ };
+
+ // If we can import host pointers directly, we can avoid an extra memcpy
+ // (sometimes). In the cases where it isn't avoidable, the extra memcpy
+ // will happen inside VRAM, which is typically faster anyway.
+ bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR;
+ can_import &= !params->no_import;
+ can_import &= bufparams.size > (32 << 10); // 32 KiB
+ if (can_import) {
+ bufparams.import_handle = PL_HANDLE_HOST_PTR;
+ bufparams.shared_mem = (struct pl_shared_mem) {
+ .handle.ptr = params->ptr,
+ .size = bufparams.size,
+ .offset = 0,
+ };
+
+ // Suppress errors for this test because it may fail, in which case we
+ // want to silently fall back.
+ pl_log_level_cap(gpu->log, PL_LOG_DEBUG);
+ buf = pl_buf_create(gpu, &bufparams);
+ pl_log_level_cap(gpu->log, PL_LOG_NONE);
+ }
+
+ if (!buf) {
+ // Fallback when host pointer import is not supported
+ bufparams.import_handle = 0;
+ bufparams.host_readable = true;
+ buf = pl_buf_create(gpu, &bufparams);
+ }
+
+ if (!buf)
+ return false;
+
+ struct pl_tex_transfer_params newparams = *params;
+ newparams.ptr = NULL;
+ newparams.buf = buf;
+
+ // If the transfer is asynchronous, propagate our host read asynchronously
+ if (params->callback && !bufparams.import_handle) {
+ newparams.callback = pbo_download_cb;
+ newparams.priv = pl_alloc_struct(NULL, struct pbo_cb_ctx, {
+ .gpu = gpu,
+ .buf = buf,
+ .ptr = params->ptr,
+ .callback = params->callback,
+ .priv = params->priv,
+ });
+ }
+
+ if (!pl_tex_download(gpu, &newparams)) {
+ pl_buf_destroy(gpu, &buf);
+ return false;
+ }
+
+ if (!params->callback) {
+ while (pl_buf_poll(gpu, buf, 10000000)) // 10 ms
+ PL_TRACE(gpu, "pl_tex_download: synchronous/blocking (slow path)");
+ }
+
+ bool ok;
+ if (bufparams.import_handle) {
+ // Buffer download completion already means the host pointer contains
+ // the valid data, no more need to copy. (Note: this applies even for
+ // asynchronous downloads)
+ ok = true;
+ pl_buf_destroy(gpu, &buf);
+ } else if (!params->callback) {
+ // Synchronous read back to the host pointer
+ ok = pl_buf_read(gpu, buf, 0, params->ptr, bufparams.size);
+ pl_buf_destroy(gpu, &buf);
+ } else {
+ // Nothing left to do here, the rest will be done by pbo_download_cb
+ ok = true;
+ }
+
+ return ok;
+}
+
+bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ const int threads = PL_MIN(256, pl_rect_w(params->rc));
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ pl_require(gpu, params->buf);
+
+ pl_dispatch dp = pl_gpu_dispatch(gpu);
+ pl_shader sh = pl_dispatch_begin(dp);
+ if (!sh_try_compute(sh, threads, 1, false, 0)) {
+ PL_ERR(gpu, "Failed emulating texture transfer!");
+ pl_dispatch_abort(dp, &sh);
+ return false;
+ }
+
+ ident_t buf = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->buf,
+ .desc = {
+ .name = "data",
+ .type = PL_DESC_BUF_TEXEL_STORAGE,
+ },
+ });
+
+ ident_t img = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->tex,
+ .desc = {
+ .name = "image",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = PL_DESC_ACCESS_WRITEONLY,
+ },
+ });
+
+ // If the transfer width is a natural multiple of the thread size, we
+ // can skip the bounds check. Otherwise, make sure we stay within the
+ // transfer rect, since going past it would access out-of-bounds data.
+ int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads);
+ if (groups_x * threads != pl_rect_w(params->rc)) {
+ GLSL("if (gl_GlobalInvocationID.x >= %d) \n"
+ " return; \n",
+ pl_rect_w(params->rc));
+ }
+
+ // fmt->texel_align contains the size of an individual color value
+ assert(fmt->texel_size == fmt->num_components * fmt->texel_align);
+ GLSL("vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \n"
+ "ivec3 pos = ivec3(gl_GlobalInvocationID); \n"
+ "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n"
+ "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n",
+ SH_INT_DYN(params->rc.x0), SH_INT_DYN(params->rc.y0), SH_INT_DYN(params->rc.z0),
+ SH_INT_DYN(params->buf_offset),
+ SH_INT(params->depth_pitch / fmt->texel_align),
+ SH_INT(params->row_pitch / fmt->texel_align),
+ SH_INT(fmt->texel_size / fmt->texel_align));
+
+ for (int i = 0; i < fmt->num_components; i++)
+ GLSL("color[%d] = imageLoad("$", base + %d).r; \n", i, buf, i);
+
+ int dims = pl_tex_params_dimension(tex->params);
+ static const char *coord_types[] = {
+ [1] = "int",
+ [2] = "ivec2",
+ [3] = "ivec3",
+ };
+
+ GLSL("imageStore("$", %s(tex_pos), color);\n", img, coord_types[dims]);
+ return pl_dispatch_compute(dp, pl_dispatch_compute_params(
+ .shader = &sh,
+ .dispatch_size = {
+ groups_x,
+ pl_rect_h(params->rc),
+ pl_rect_d(params->rc),
+ },
+ ));
+
+error:
+ return false;
+}
+
+bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ const int threads = PL_MIN(256, pl_rect_w(params->rc));
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ pl_require(gpu, params->buf);
+
+ pl_dispatch dp = pl_gpu_dispatch(gpu);
+ pl_shader sh = pl_dispatch_begin(dp);
+ if (!sh_try_compute(sh, threads, 1, false, 0)) {
+ PL_ERR(gpu, "Failed emulating texture transfer!");
+ pl_dispatch_abort(dp, &sh);
+ return false;
+ }
+
+ ident_t buf = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->buf,
+ .desc = {
+ .name = "data",
+ .type = PL_DESC_BUF_TEXEL_STORAGE,
+ },
+ });
+
+ ident_t img = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->tex,
+ .desc = {
+ .name = "image",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = PL_DESC_ACCESS_READONLY,
+ },
+ });
+
+ int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads);
+ if (groups_x * threads != pl_rect_w(params->rc)) {
+ GLSL("if (gl_GlobalInvocationID.x >= %d) \n"
+ " return; \n",
+ pl_rect_w(params->rc));
+ }
+
+ int dims = pl_tex_params_dimension(tex->params);
+ static const char *coord_types[] = {
+ [1] = "int",
+ [2] = "ivec2",
+ [3] = "ivec3",
+ };
+
+ assert(fmt->texel_size == fmt->num_components * fmt->texel_align);
+ GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n"
+ "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n"
+ "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n"
+ "vec4 color = imageLoad("$", %s(tex_pos)); \n",
+ SH_INT_DYN(params->rc.x0), SH_INT_DYN(params->rc.y0), SH_INT_DYN(params->rc.z0),
+ SH_INT_DYN(params->buf_offset),
+ SH_INT(params->depth_pitch / fmt->texel_align),
+ SH_INT(params->row_pitch / fmt->texel_align),
+ SH_INT(fmt->texel_size / fmt->texel_align),
+ img, coord_types[dims]);
+
+ for (int i = 0; i < fmt->num_components; i++)
+ GLSL("imageStore("$", base + %d, vec4(color[%d])); \n", buf, i, i);
+
+ return pl_dispatch_compute(dp, pl_dispatch_compute_params(
+ .shader = &sh,
+ .dispatch_size = {
+ groups_x,
+ pl_rect_h(params->rc),
+ pl_rect_d(params->rc),
+ },
+ ));
+
+error:
+ return false;
+}
+
+bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params)
+{
+ if (!params->dst->params.storable)
+ return false;
+
+ // Normalize `dst_rc`, moving all flipping to `src_rc` instead.
+ pl_rect3d src_rc = params->src_rc;
+ pl_rect3d dst_rc = params->dst_rc;
+ if (pl_rect_w(dst_rc) < 0) {
+ PL_SWAP(src_rc.x0, src_rc.x1);
+ PL_SWAP(dst_rc.x0, dst_rc.x1);
+ }
+ if (pl_rect_h(dst_rc) < 0) {
+ PL_SWAP(src_rc.y0, src_rc.y1);
+ PL_SWAP(dst_rc.y0, dst_rc.y1);
+ }
+ if (pl_rect_d(dst_rc) < 0) {
+ PL_SWAP(src_rc.z0, src_rc.z1);
+ PL_SWAP(dst_rc.z0, dst_rc.z1);
+ }
+
+ bool needs_scaling = false;
+ needs_scaling |= pl_rect_w(dst_rc) != abs(pl_rect_w(src_rc));
+ needs_scaling |= pl_rect_h(dst_rc) != abs(pl_rect_h(src_rc));
+ needs_scaling |= pl_rect_d(dst_rc) != abs(pl_rect_d(src_rc));
+
+ // Exception: fast path for 1-pixel blits, which don't require scaling
+ bool is_1pixel = abs(pl_rect_w(src_rc)) == 1 && abs(pl_rect_h(src_rc)) == 1;
+ needs_scaling &= !is_1pixel;
+
+ // Manual trilinear interpolation would be too slow to justify
+ bool needs_sampling = needs_scaling && params->sample_mode != PL_TEX_SAMPLE_NEAREST;
+ needs_sampling |= !params->src->params.storable;
+ if (needs_sampling && !params->src->params.sampleable)
+ return false;
+
+ const int threads = 256;
+ int bw = PL_MIN(32, pl_rect_w(dst_rc));
+ int bh = PL_MIN(threads / bw, pl_rect_h(dst_rc));
+ pl_dispatch dp = pl_gpu_dispatch(gpu);
+ pl_shader sh = pl_dispatch_begin(dp);
+ if (!sh_try_compute(sh, bw, bh, false, 0)) {
+ pl_dispatch_abort(dp, &sh);
+ return false;
+ }
+
+ // Avoid over-writing into `dst`
+ int groups_x = PL_DIV_UP(pl_rect_w(dst_rc), bw);
+ if (groups_x * bw != pl_rect_w(dst_rc)) {
+ GLSL("if (gl_GlobalInvocationID.x >= %d) \n"
+ " return; \n",
+ pl_rect_w(dst_rc));
+ }
+
+ int groups_y = PL_DIV_UP(pl_rect_h(dst_rc), bh);
+ if (groups_y * bh != pl_rect_h(dst_rc)) {
+ GLSL("if (gl_GlobalInvocationID.y >= %d) \n"
+ " return; \n",
+ pl_rect_h(dst_rc));
+ }
+
+ ident_t dst = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->dst,
+ .desc = {
+ .name = "dst",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = PL_DESC_ACCESS_WRITEONLY,
+ },
+ });
+
+ static const char *vecs[] = {
+ [1] = "float",
+ [2] = "vec2",
+ [3] = "vec3",
+ [4] = "vec4",
+ };
+
+ static const char *ivecs[] = {
+ [1] = "int",
+ [2] = "ivec2",
+ [3] = "ivec3",
+ [4] = "ivec4",
+ };
+
+ int src_dims = pl_tex_params_dimension(params->src->params);
+ int dst_dims = pl_tex_params_dimension(params->dst->params);
+ GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n"
+ "%s dst_pos = %s(pos + ivec3(%d, %d, %d)); \n",
+ ivecs[dst_dims], ivecs[dst_dims],
+ params->dst_rc.x0, params->dst_rc.y0, params->dst_rc.z0);
+
+ if (needs_sampling || (needs_scaling && params->src->params.sampleable)) {
+
+ ident_t src = sh_desc(sh, (struct pl_shader_desc) {
+ .desc = {
+ .name = "src",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ .binding = {
+ .object = params->src,
+ .address_mode = PL_TEX_ADDRESS_CLAMP,
+ .sample_mode = params->sample_mode,
+ }
+ });
+
+ if (is_1pixel) {
+ GLSL("%s fpos = %s(0.5); \n", vecs[src_dims], vecs[src_dims]);
+ } else {
+ GLSL("vec3 fpos = (vec3(pos) + vec3(0.5)) / vec3(%d.0, %d.0, %d.0); \n",
+ pl_rect_w(dst_rc), pl_rect_h(dst_rc), pl_rect_d(dst_rc));
+ }
+
+ GLSL("%s src_pos = %s(0.5); \n"
+ "src_pos.x = mix(%f, %f, fpos.x); \n",
+ vecs[src_dims], vecs[src_dims],
+ (float) src_rc.x0 / params->src->params.w,
+ (float) src_rc.x1 / params->src->params.w);
+
+ if (params->src->params.h) {
+ GLSL("src_pos.y = mix(%f, %f, fpos.y); \n",
+ (float) src_rc.y0 / params->src->params.h,
+ (float) src_rc.y1 / params->src->params.h);
+ }
+
+ if (params->src->params.d) {
+ GLSL("src_pos.z = mix(%f, %f, fpos.z); \n",
+ (float) src_rc.z0 / params->src->params.d,
+ (float) src_rc.z1 / params->src->params.d);
+ }
+
+ GLSL("imageStore("$", dst_pos, textureLod("$", src_pos, 0.0)); \n",
+ dst, src);
+
+ } else {
+
+ ident_t src = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->src,
+ .desc = {
+ .name = "src",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = PL_DESC_ACCESS_READONLY,
+ },
+ });
+
+ if (is_1pixel) {
+ GLSL("ivec3 src_pos = ivec3(0); \n");
+ } else if (needs_scaling) {
+ GLSL("ivec3 src_pos = ivec3(vec3(%f, %f, %f) * vec3(pos)); \n",
+ fabs((float) pl_rect_w(src_rc) / pl_rect_w(dst_rc)),
+ fabs((float) pl_rect_h(src_rc) / pl_rect_h(dst_rc)),
+ fabs((float) pl_rect_d(src_rc) / pl_rect_d(dst_rc)));
+ } else {
+ GLSL("ivec3 src_pos = pos; \n");
+ }
+
+ GLSL("src_pos = ivec3(%d, %d, %d) * src_pos + ivec3(%d, %d, %d); \n"
+ "imageStore("$", dst_pos, imageLoad("$", %s(src_pos))); \n",
+ src_rc.x1 < src_rc.x0 ? -1 : 1,
+ src_rc.y1 < src_rc.y0 ? -1 : 1,
+ src_rc.z1 < src_rc.z0 ? -1 : 1,
+ src_rc.x0, src_rc.y0, src_rc.z0,
+ dst, src, ivecs[src_dims]);
+
+ }
+
+ return pl_dispatch_compute(dp, pl_dispatch_compute_params(
+ .shader = &sh,
+ .dispatch_size = {
+ groups_x,
+ groups_y,
+ pl_rect_d(dst_rc),
+ },
+ ));
+}
+
+void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params)
+{
+ enum pl_fmt_type src_type = params->src->params.format->type;
+ enum pl_fmt_type dst_type = params->dst->params.format->type;
+
+ // Only for 2D textures
+ pl_assert(params->src->params.h && !params->src->params.d);
+ pl_assert(params->dst->params.h && !params->dst->params.d);
+
+ // Integer textures are not supported
+ pl_assert(src_type != PL_FMT_UINT && src_type != PL_FMT_SINT);
+ pl_assert(dst_type != PL_FMT_UINT && dst_type != PL_FMT_SINT);
+
+ pl_rect2df src_rc = {
+ .x0 = params->src_rc.x0, .x1 = params->src_rc.x1,
+ .y0 = params->src_rc.y0, .y1 = params->src_rc.y1,
+ };
+ pl_rect2d dst_rc = {
+ .x0 = params->dst_rc.x0, .x1 = params->dst_rc.x1,
+ .y0 = params->dst_rc.y0, .y1 = params->dst_rc.y1,
+ };
+
+ pl_dispatch dp = pl_gpu_dispatch(gpu);
+ pl_shader sh = pl_dispatch_begin(dp);
+ sh->output = PL_SHADER_SIG_COLOR;
+
+ ident_t pos, src = sh_bind(sh, params->src, PL_TEX_ADDRESS_CLAMP,
+ params->sample_mode, "src_tex", &src_rc, &pos, NULL);
+
+ GLSL("vec4 color = textureLod("$", "$", 0.0); \n", src, pos);
+
+ pl_dispatch_finish(dp, pl_dispatch_params(
+ .shader = &sh,
+ .target = params->dst,
+ .rect = dst_rc,
+ ));
+}
+
+bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params)
+{
+ pl_buf src = params->src, dst = params->dst;
+ pl_require(gpu, src->params.storable && dst->params.storable);
+ pl_require(gpu, params->src_offset % sizeof(unsigned) == 0);
+ pl_require(gpu, params->dst_offset % sizeof(unsigned) == 0);
+ pl_require(gpu, params->src_offset + params->size <= src->params.size);
+ pl_require(gpu, params->dst_offset + params->size <= dst->params.size);
+ pl_require(gpu, src != dst || params->src_offset == params->dst_offset);
+ pl_require(gpu, params->size % sizeof(unsigned) == 0);
+ pl_require(gpu, params->wordsize == sizeof(uint16_t) ||
+ params->wordsize == sizeof(uint32_t));
+
+ const size_t words = params->size / sizeof(unsigned);
+ const size_t src_off = params->src_offset / sizeof(unsigned);
+ const size_t dst_off = params->dst_offset / sizeof(unsigned);
+
+ const int threads = PL_MIN(256, words);
+ pl_dispatch dp = pl_gpu_dispatch(gpu);
+ pl_shader sh = pl_dispatch_begin(dp);
+ if (!sh_try_compute(sh, threads, 1, false, 0)) {
+ pl_dispatch_abort(dp, &sh);
+ return false;
+ }
+
+ const size_t groups = PL_DIV_UP(words, threads);
+ if (groups * threads > words) {
+ GLSL("if (gl_GlobalInvocationID.x >= %zu) \n"
+ " return; \n",
+ words);
+ }
+
+ sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = src,
+ .desc = {
+ .name = "SrcBuf",
+ .type = PL_DESC_BUF_STORAGE,
+ .access = src == dst ? PL_DESC_ACCESS_READWRITE : PL_DESC_ACCESS_READONLY,
+ },
+ .num_buffer_vars = 1,
+ .buffer_vars = &(struct pl_buffer_var) {
+ .var = {
+ .name = "src",
+ .type = PL_VAR_UINT,
+ .dim_v = 1,
+ .dim_m = 1,
+ .dim_a = src_off + words,
+ },
+ },
+ });
+
+ if (src != dst) {
+ sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = dst,
+ .desc = {
+ .name = "DstBuf",
+ .type = PL_DESC_BUF_STORAGE,
+ .access = PL_DESC_ACCESS_WRITEONLY,
+ },
+ .num_buffer_vars = 1,
+ .buffer_vars = &(struct pl_buffer_var) {
+ .var = {
+ .name = "dst",
+ .type = PL_VAR_UINT,
+ .dim_v = 1,
+ .dim_m = 1,
+ .dim_a = dst_off + words,
+ },
+ },
+ });
+ } else {
+ GLSL("#define dst src \n");
+ }
+
+ GLSL("// pl_buf_copy_swap \n"
+ "{ \n"
+ "uint word = src["$" + gl_GlobalInvocationID.x]; \n"
+ "word = (word & 0xFF00FF00u) >> 8 | \n"
+ " (word & 0x00FF00FFu) << 8; \n",
+ SH_UINT(src_off));
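+ // Swapping adjacent bytes is sufficient for 16-bit words; for 32-bit words,
+ // additionally exchange the 16-bit halves to complete the 4-byte swap.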
+ if (params->wordsize > 2) {
+ GLSL("word = (word & 0xFFFF0000u) >> 16 | \n"
+ " (word & 0x0000FFFFu) << 16; \n");
+ }
+ GLSL("dst["$" + gl_GlobalInvocationID.x] = word; \n"
+ "} \n",
+ SH_UINT(dst_off));
+
+ return pl_dispatch_compute(dp, pl_dispatch_compute_params(
+ .shader = &sh,
+ .dispatch_size = {groups, 1, 1},
+ ));
+
+error:
+ if (src->params.debug_tag || dst->params.debug_tag) {
+ PL_ERR(gpu, " for buffers: src %s, dst %s",
+ src->params.debug_tag, dst->params.debug_tag);
+ }
+ return false;
+}
+
+void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+ if (!params->vertex_data && !params->index_data)
+ return pl_pass_run(gpu, params);
+
+ struct pl_pass_run_params newparams = *params;
+ pl_buf vert = NULL, index = NULL;
+
+ if (params->vertex_data) {
+ vert = pl_buf_create(gpu, pl_buf_params(
+ .size = pl_vertex_buf_size(params),
+ .initial_data = params->vertex_data,
+ .drawable = true,
+ ));
+
+ if (!vert) {
+ PL_ERR(gpu, "Failed allocating vertex buffer!");
+ return;
+ }
+
+ newparams.vertex_buf = vert;
+ newparams.vertex_data = NULL;
+ }
+
+ if (params->index_data) {
+ index = pl_buf_create(gpu, pl_buf_params(
+ .size = pl_index_buf_size(params),
+ .initial_data = params->index_data,
+ .drawable = true,
+ ));
+
+ if (!index) {
+ PL_ERR(gpu, "Failed allocating index buffer!");
+ return;
+ }
+
+ newparams.index_buf = index;
+ newparams.index_data = NULL;
+ }
+
+ pl_pass_run(gpu, &newparams);
+ pl_buf_destroy(gpu, &vert);
+ pl_buf_destroy(gpu, &index);
+}
+
+struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params)
+{
+ struct pl_pass_params new = *params;
+
+ new.glsl_shader = pl_str0dup0(alloc, new.glsl_shader);
+ new.vertex_shader = pl_str0dup0(alloc, new.vertex_shader);
+ if (new.blend_params)
+ new.blend_params = pl_memdup_ptr(alloc, new.blend_params);
+
+#define DUPNAMES(field) \
+ do { \
+ size_t _size = new.num_##field * sizeof(new.field[0]); \
+ new.field = pl_memdup(alloc, new.field, _size); \
+ for (int j = 0; j < new.num_##field; j++) \
+ new.field[j].name = pl_str0dup0(alloc, new.field[j].name); \
+ } while (0)
+
+ DUPNAMES(variables);
+ DUPNAMES(descriptors);
+ DUPNAMES(vertex_attribs);
+
+#undef DUPNAMES
+
+ new.constant_data = NULL;
+ new.constants = pl_memdup(alloc, new.constants,
+ new.num_constants * sizeof(new.constants[0]));
+
+ return new;
+}
+
+size_t pl_vertex_buf_size(const struct pl_pass_run_params *params)
+{
+ if (!params->index_data)
+ return params->vertex_count * params->pass->params.vertex_stride;
+
+ int num_vertices = 0;
+ const void *idx = params->index_data;
+ switch (params->index_fmt) {
+ case PL_INDEX_UINT16:
+ for (int i = 0; i < params->vertex_count; i++)
+ num_vertices = PL_MAX(num_vertices, ((const uint16_t *) idx)[i]);
+ break;
+ case PL_INDEX_UINT32:
+ for (int i = 0; i < params->vertex_count; i++)
+ num_vertices = PL_MAX(num_vertices, ((const uint32_t *) idx)[i]);
+ break;
+ case PL_INDEX_FORMAT_COUNT: pl_unreachable();
+ }
+
+ return (num_vertices + 1) * params->pass->params.vertex_stride;
+}
+
+const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE])
+{
+ static const char *hexdigits = "0123456789ABCDEF";
+ for (int i = 0; i < UUID_SIZE; i++) {
+ uint8_t x = uuid[i];
+ buf[3 * i + 0] = hexdigits[x >> 4];
+ buf[3 * i + 1] = hexdigits[x & 0xF];
+ buf[3 * i + 2] = i == UUID_SIZE - 1 ? '\0' : ':';
+ }
+
+ return buf;
+}
+
+const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod)
+{
+ switch (mod) {
+ case DRM_FORMAT_MOD_LINEAR: return "LINEAR";
+ case DRM_FORMAT_MOD_INVALID: return "INVALID";
+ }
+
+ uint8_t vendor = mod >> 56;
+ uint64_t val = mod & ((1ULL << 56) - 1);
+
+ const char *name = NULL;
+ switch (vendor) {
+ case 0x00: name = "NONE"; break;
+ case 0x01: name = "INTEL"; break;
+ case 0x02: name = "AMD"; break;
+ case 0x03: name = "NVIDIA"; break;
+ case 0x04: name = "SAMSUNG"; break;
+ case 0x08: name = "ARM"; break;
+ }
+
+ if (name) {
+ snprintf(buf, DRM_MOD_SIZE, "%s 0x%"PRIx64, name, val);
+ } else {
+ snprintf(buf, DRM_MOD_SIZE, "0x%02x 0x%"PRIx64, vendor, val);
+ }
+
+ return buf;
+}
diff --git a/src/hash.h b/src/hash.h
new file mode 100644
index 0000000..2513919
--- /dev/null
+++ b/src/hash.h
@@ -0,0 +1,162 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#define GOLDEN_RATIO_64 UINT64_C(0x9e3779b97f4a7c15)
+
+static inline void pl_hash_merge(uint64_t *accum, uint64_t hash) {
+ *accum ^= hash + GOLDEN_RATIO_64 + (*accum << 6) + (*accum >> 2);
+}
+
+static inline uint64_t pl_mem_hash(const void *mem, size_t size);
+#define pl_var_hash(x) pl_mem_hash(&(x), sizeof(x))
+
+static inline uint64_t pl_str_hash(pl_str str)
+{
+ return pl_mem_hash(str.buf, str.len);
+}
+
+static inline uint64_t pl_str0_hash(const char *str)
+{
+ return pl_mem_hash(str, str ? strlen(str) : 0);
+}
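+
+// Illustrative usage sketch (the names `params` and `shader_text` are
+// hypothetical): several hashes can be folded into a single key, e.g. for
+// use as a `pl_cache_obj.key`:
+//
+//   uint64_t key = pl_var_hash(params);
+//   pl_hash_merge(&key, pl_str0_hash(shader_text));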
+
+#ifdef PL_HAVE_XXHASH
+
+#define XXH_NAMESPACE pl_
+#define XXH_INLINE_ALL
+#define XXH_NO_STREAM
+#include <xxhash.h>
+
+XXH_FORCE_INLINE uint64_t pl_mem_hash(const void *mem, size_t size)
+{
+ return XXH3_64bits(mem, size);
+}
+
+#else // !PL_HAVE_XXHASH
+
+/*
+ SipHash reference C implementation
+ Modified for use by libplacebo:
+ - Hard-coded a fixed key (k0 and k1)
+ - Hard-coded the output size to 64 bits
+ - Return the result vector directly
+
+ Copyright (c) 2012-2016 Jean-Philippe Aumasson
+ <jeanphilippe.aumasson@gmail.com>
+ Copyright (c) 2012-2014 Daniel J. Bernstein <djb@cr.yp.to>
+
+ To the extent possible under law, the author(s) have dedicated all copyright
+ and related and neighboring rights to this software to the public domain
+ worldwide. This software is distributed without any warranty.
+
+ <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+/* default: SipHash-2-4 */
+#define cROUNDS 2
+#define dROUNDS 4
+
+#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
+
+#define U8TO64_LE(p) \
+ (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \
+ ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \
+ ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \
+ ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))
+
+#define SIPROUND \
+ do { \
+ v0 += v1; \
+ v1 = ROTL(v1, 13); \
+ v1 ^= v0; \
+ v0 = ROTL(v0, 32); \
+ v2 += v3; \
+ v3 = ROTL(v3, 16); \
+ v3 ^= v2; \
+ v0 += v3; \
+ v3 = ROTL(v3, 21); \
+ v3 ^= v0; \
+ v2 += v1; \
+ v1 = ROTL(v1, 17); \
+ v1 ^= v2; \
+ v2 = ROTL(v2, 32); \
+ } while (0)
+
+static inline uint64_t pl_mem_hash(const void *mem, size_t size)
+{
+ if (!size)
+ return 0x8533321381b8254bULL;
+
+ uint64_t v0 = 0x736f6d6570736575ULL;
+ uint64_t v1 = 0x646f72616e646f6dULL;
+ uint64_t v2 = 0x6c7967656e657261ULL;
+ uint64_t v3 = 0x7465646279746573ULL;
+ uint64_t k0 = 0xfe9f075098ddb0faULL;
+ uint64_t k1 = 0x68f7f03510e5285cULL;
+ uint64_t m;
+ int i;
+ const uint8_t *buf = mem;
+ const uint8_t *end = buf + size - (size % sizeof(uint64_t));
+ const int left = size & 7;
+ uint64_t b = ((uint64_t) size) << 56;
+ v3 ^= k1;
+ v2 ^= k0;
+ v1 ^= k1;
+ v0 ^= k0;
+
+ for (; buf != end; buf += 8) {
+ m = U8TO64_LE(buf);
+ v3 ^= m;
+
+ for (i = 0; i < cROUNDS; ++i)
+ SIPROUND;
+
+ v0 ^= m;
+ }
+
+ switch (left) {
+ case 7: b |= ((uint64_t) buf[6]) << 48; // fall through
+ case 6: b |= ((uint64_t) buf[5]) << 40; // fall through
+ case 5: b |= ((uint64_t) buf[4]) << 32; // fall through
+ case 4: b |= ((uint64_t) buf[3]) << 24; // fall through
+ case 3: b |= ((uint64_t) buf[2]) << 16; // fall through
+ case 2: b |= ((uint64_t) buf[1]) << 8; // fall through
+ case 1: b |= ((uint64_t) buf[0]); break;
+ case 0: break;
+ }
+
+ v3 ^= b;
+
+ for (i = 0; i < cROUNDS; ++i)
+ SIPROUND;
+
+ v0 ^= b;
+
+ v2 ^= 0xff;
+
+ for (i = 0; i < dROUNDS; ++i)
+ SIPROUND;
+
+ b = v0 ^ v1 ^ v2 ^ v3;
+ return b;
+}
+
+#endif // PL_HAVE_XXHASH
diff --git a/src/include/libplacebo/cache.h b/src/include/libplacebo/cache.h
new file mode 100644
index 0000000..5897ac8
--- /dev/null
+++ b/src/include/libplacebo/cache.h
@@ -0,0 +1,200 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_CACHE_H_
+#define LIBPLACEBO_CACHE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <libplacebo/config.h>
+#include <libplacebo/common.h>
+#include <libplacebo/log.h>
+
+PL_API_BEGIN
+
+typedef struct pl_cache_obj {
+ // Cache object key. This will uniquely identify this cached object.
+ uint64_t key;
+
+ // Cache data pointer and length. 0-length cached objects are invalid
+ // and will be silently dropped. You can explicitly remove a cached
+ // object by overwriting it with a length 0 object.
+ void *data;
+ size_t size;
+
+ // Free callback, to free memory associated with `data`. (Optional)
+ // Will be called when the object is either explicitly deleted, culled
+ // due to hitting size limits, or on pl_cache_destroy().
+ void (*free)(void *data);
+} pl_cache_obj;
+
+struct pl_cache_params {
+ // Optional `pl_log` that is used for logging internal events related
+ // to the cache, such as insertions, saving and loading.
+ pl_log log;
+
+ // Size limits. If 0, no limit is imposed.
+ //
+ // Note: libplacebo will never detect or invalidate stale cache entries, so
+ // setting an upper size limit is strongly recommended
+ size_t max_object_size;
+ size_t max_total_size;
+
+ // Optional external callback to call after a cached object is modified
+ // (including deletion and (re-)insertion). Note that this is not called on
+ // objects which are merely pruned from the cache due to `max_total_size`,
+ // so users must rely on some external mechanism to prune stale entries or
+ // enforce size limits.
+ //
+ // Note: `pl_cache_load` does not trigger this callback.
+ // Note: Ownership of `obj` does *not* pass to the caller.
+ // Note: This function must be thread safe.
+ void (*set)(void *priv, pl_cache_obj obj);
+
+ // Optional external callback to call on a cache miss. Ownership of the
+ // returned object passes to the `pl_cache`. Objects returned by this
+ // callback *should* have a valid `free` callback, unless lifetime can be
+ // externally managed and guaranteed to outlive the `pl_cache`.
+ //
+ // Note: This function must be thread safe.
+ pl_cache_obj (*get)(void *priv, uint64_t key);
+
+ // External context passed to the `set`/`get` callbacks.
+ void *priv;
+};
+
+#define pl_cache_params(...) (&(struct pl_cache_params) { __VA_ARGS__ })
+PL_API extern const struct pl_cache_params pl_cache_default_params;
+
+// Thread-safety: Safe
+//
+// Note: In any context in which `pl_cache` is used, users may also pass NULL
+// to disable caching. In other words, NULL is a valid `pl_cache`.
+typedef const struct pl_cache_t {
+ struct pl_cache_params params;
+} *pl_cache;
+
+// Create a new cache. This function will never fail.
+PL_API pl_cache pl_cache_create(const struct pl_cache_params *params);
+
+// Destroy a `pl_cache` object, including all underlying objects.
+PL_API void pl_cache_destroy(pl_cache *cache);
+
+// Explicitly clear all objects in the cache without destroying it. This is
+// similar to `pl_cache_destroy`, but the cache remains valid afterwards.
+//
+// Note: Objects destroyed in this way are *not* propagated to the `set` callback.
+PL_API void pl_cache_reset(pl_cache cache);
+
+// Return the current internal number of objects and total size (bytes)
+PL_API int pl_cache_objects(pl_cache cache);
+PL_API size_t pl_cache_size(pl_cache cache);
+
+// --- Cache saving and loading APIs
+
+// Serialize the internal state of a `pl_cache` into an abstract cache
+// object that can be e.g. saved to disk and loaded again later. Returns the
+// number of objects saved.
+//
+// Note: Using `save/load` is largely redundant with using the `set`/`get`
+// callbacks, and the user should decide whether to use the explicit API or the
+// callback-based API.
+PL_API int pl_cache_save_ex(pl_cache cache,
+ void (*write)(void *priv, size_t size, const void *ptr),
+ void *priv);
+
+// Load the result of a previous `pl_cache_save` call. Any duplicate entries in
+// the `pl_cache` will be overwritten. Returns the number of objects loaded, or
+// a negative number on serious error (e.g. corrupt header)
+//
+// Note: This does not trigger the `set` callback.
+PL_API int pl_cache_load_ex(pl_cache cache,
+ bool (*read)(void *priv, size_t size, void *ptr),
+ void *priv);
+
+// --- Convenience wrappers around pl_cache_save/load_ex
+
+// Writes data directly to a pointer. Returns the number of bytes that *would*
+// have been written, so this can be used on a size 0 buffer to get the required
+// total size.
+PL_API size_t pl_cache_save(pl_cache cache, uint8_t *data, size_t size);
+
+// Reads data directly from a pointer. This still reads from `data`, so it does
+// not avoid a copy.
+PL_API int pl_cache_load(pl_cache cache, const uint8_t *data, size_t size);
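+
+// Illustrative sketch of the two-call save pattern (error handling omitted,
+// `cache` assumed to be a valid `pl_cache`):
+//
+//   size_t size = pl_cache_save(cache, NULL, 0); // query the required size
+//   uint8_t *data = malloc(size);
+//   pl_cache_save(cache, data, size);
+//   ...
+//   pl_cache_load(cache, data, size);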
+
+// Writes/loads data to/from a FILE stream at the current position.
+#define pl_cache_save_file(c, file) pl_cache_save_ex(c, pl_write_file_cb, file)
+#define pl_cache_load_file(c, file) pl_cache_load_ex(c, pl_read_file_cb, file)
+
+static inline void pl_write_file_cb(void *priv, size_t size, const void *ptr)
+{
+ (void) fwrite(ptr, 1, size, (FILE *) priv);
+}
+
+static inline bool pl_read_file_cb(void *priv, size_t size, void *ptr)
+{
+ return fread(ptr, 1, size, (FILE *) priv) == size;
+}
+
+// --- Object modification API. Mostly intended for internal use.
+
+// Insert a new cached object into a `pl_cache`. Returns whether successful.
+// Overwrites any existing cached object with that signature, so this can be
+// used to e.g. delete objects as well (set their size to 0). On success,
+// ownership of `obj` passes to the `pl_cache`.
+//
+// Note: If `object.free` is NULL, this will perform an internal memdup. To
+// bypass this (e.g. when directly adding externally managed memory), you can
+// set the `free` callback to an explicit noop function.
+//
+// Note: `obj->data/free` will be reset to NULL on successful insertion.
+PL_API bool pl_cache_try_set(pl_cache cache, pl_cache_obj *obj);
+
+// Variant of `pl_cache_try_set` that simply frees `obj` on failure.
+PL_API void pl_cache_set(pl_cache cache, pl_cache_obj *obj);
+
+// Looks up `obj->key` in the object cache. If successful, `obj->data` is
+// set to memory owned by the caller, which must be either explicitly
+// re-inserted, or explicitly freed (using obj->free).
+//
+// Note: On failure, `obj->data/size/free` are reset to NULL.
+PL_API bool pl_cache_get(pl_cache cache, pl_cache_obj *obj);
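+
+// Illustrative lookup-or-compute sketch (`compute_blob` is a hypothetical
+// helper returning malloc'd memory):
+//
+//   pl_cache_obj obj = { .key = key };
+//   if (!pl_cache_get(cache, &obj)) {
+//       obj.data = compute_blob(&obj.size);
+//       obj.free = free;
+//   }
+//   ... use obj.data ...
+//   pl_cache_set(cache, &obj); // pass ownership back to the cache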
+
+// Run a callback on every object currently stored in `cache`.
+//
+// Note: Running any `pl_cache_*` function on `cache` from this callback is
+// undefined behavior.
+PL_API void pl_cache_iterate(pl_cache cache,
+ void (*cb)(void *priv, pl_cache_obj obj),
+ void *priv);
+
+// Utility wrapper to free a `pl_cache_obj` if necessary (and sanitize it)
+static inline void pl_cache_obj_free(pl_cache_obj *obj)
+{
+ if (obj->free)
+ obj->free(obj->data);
+ obj->data = NULL;
+ obj->free = NULL;
+ obj->size = 0;
+}
+
+PL_API_END
+
+#endif // LIBPLACEBO_CACHE_H_
diff --git a/src/include/libplacebo/colorspace.h b/src/include/libplacebo/colorspace.h
new file mode 100644
index 0000000..6663019
--- /dev/null
+++ b/src/include/libplacebo/colorspace.h
@@ -0,0 +1,719 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_COLORSPACE_H_
+#define LIBPLACEBO_COLORSPACE_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <libplacebo/common.h>
+
+PL_API_BEGIN
+
+// The underlying color representation (e.g. RGB, XYZ or YCbCr)
+enum pl_color_system {
+ PL_COLOR_SYSTEM_UNKNOWN = 0,
+ // YCbCr-like color systems:
+ PL_COLOR_SYSTEM_BT_601, // ITU-R Rec. BT.601 (SD)
+ PL_COLOR_SYSTEM_BT_709, // ITU-R Rec. BT.709 (HD)
+ PL_COLOR_SYSTEM_SMPTE_240M, // SMPTE-240M
+ PL_COLOR_SYSTEM_BT_2020_NC, // ITU-R Rec. BT.2020 (non-constant luminance)
+ PL_COLOR_SYSTEM_BT_2020_C, // ITU-R Rec. BT.2020 (constant luminance)
+ PL_COLOR_SYSTEM_BT_2100_PQ, // ITU-R Rec. BT.2100 ICtCp PQ variant
+ PL_COLOR_SYSTEM_BT_2100_HLG, // ITU-R Rec. BT.2100 ICtCp HLG variant
+ PL_COLOR_SYSTEM_DOLBYVISION, // Dolby Vision (see pl_dovi_metadata)
+ PL_COLOR_SYSTEM_YCGCO, // YCgCo (derived from RGB)
+ // Other color systems:
+ PL_COLOR_SYSTEM_RGB, // Red, Green and Blue
+ PL_COLOR_SYSTEM_XYZ, // Digital Cinema Distribution Master (XYZ)
+ PL_COLOR_SYSTEM_COUNT
+};
+
+PL_API bool pl_color_system_is_ycbcr_like(enum pl_color_system sys);
+
+// Returns true for color systems that are linear transformations of the RGB
+// equivalent, i.e. are simple matrix multiplications. For color systems with
+// this property, `pl_color_repr_decode` is sufficient for conversion to RGB.
+PL_API bool pl_color_system_is_linear(enum pl_color_system sys);
+
+// Guesses the best YCbCr-like colorspace based on a given image resolution.
+// This only picks conservative values. (In particular, BT.2020 is never
+// auto-guessed, even for 4K resolution content)
+PL_API enum pl_color_system pl_color_system_guess_ycbcr(int width, int height);
+
+// Friendly names for the canonical channel names and order.
+enum pl_channel {
+ PL_CHANNEL_NONE = -1,
+ PL_CHANNEL_A = 3, // alpha
+ // RGB system
+ PL_CHANNEL_R = 0,
+ PL_CHANNEL_G = 1,
+ PL_CHANNEL_B = 2,
+ // YCbCr-like systems
+ PL_CHANNEL_Y = 0,
+ PL_CHANNEL_CB = 1,
+ PL_CHANNEL_CR = 2,
+ // Aliases for Cb/Cr
+ PL_CHANNEL_U = 1,
+ PL_CHANNEL_V = 2
+ // There are deliberately no names for the XYZ system to avoid
+ // confusion due to PL_CHANNEL_Y.
+};
+
+// The numerical range of the representation (where applicable).
+enum pl_color_levels {
+ PL_COLOR_LEVELS_UNKNOWN = 0,
+ PL_COLOR_LEVELS_LIMITED, // Limited/TV range, e.g. 16-235
+ PL_COLOR_LEVELS_FULL, // Full/PC range, e.g. 0-255
+ PL_COLOR_LEVELS_COUNT,
+
+ // Compatibility aliases
+ PL_COLOR_LEVELS_TV = PL_COLOR_LEVELS_LIMITED,
+ PL_COLOR_LEVELS_PC = PL_COLOR_LEVELS_FULL,
+};
+
+// The alpha representation mode.
+enum pl_alpha_mode {
+ PL_ALPHA_UNKNOWN = 0, // or no alpha channel present
+ PL_ALPHA_INDEPENDENT, // alpha channel is separate from the video
+ PL_ALPHA_PREMULTIPLIED, // alpha channel is multiplied into the colors
+ PL_ALPHA_MODE_COUNT,
+};
+
+// The underlying bit-wise representation of a color sample. For example,
+// a 10-bit TV-range YCbCr value uploaded to a 16 bit texture would have
+// sample_depth=16 color_depth=10 bit_shift=0.
+//
+// For another example, a 12-bit XYZ full range sample shifted to 16-bits with
+// the lower 4 bits all set to 0 would have sample_depth=16 color_depth=12
+// bit_shift=4. (libavcodec likes outputting this type of `xyz12`)
+//
+// To explain the meaning of `sample_depth` further: the important consideration
+// here is that GPU sampling will normalize the sampled color to the range
+// 0.0 - 1.0 in a manner dependent on the number of bits in the texture format.
+// So if you upload a 10-bit YCbCr value unpadded as 16-bit color samples, all
+// of the sampled values will be extremely close to 0.0. In such a case,
+// `pl_color_repr_normalize` would return a high scaling factor, which would
+// pull the colors back up to their full 16-bit range.
+struct pl_bit_encoding {
+ int sample_depth; // the number of bits the color is stored/sampled as
+ int color_depth; // the effective number of bits of the color information
+ int bit_shift; // a representational bit shift applied to the color
+};
+
+// Returns whether two bit encodings are exactly identical.
+PL_API bool pl_bit_encoding_equal(const struct pl_bit_encoding *b1,
+ const struct pl_bit_encoding *b2);
+
+// Parsed metadata from the Dolby Vision RPU
+struct pl_dovi_metadata {
+ // Colorspace transformation metadata
+ float nonlinear_offset[3]; // input offset ("ycc_to_rgb_offset")
+ pl_matrix3x3 nonlinear; // before PQ, also called "ycc_to_rgb"
+ pl_matrix3x3 linear; // after PQ, also called "rgb_to_lms"
+
+ // Reshape data, grouped by component
+ struct pl_reshape_data {
+ uint8_t num_pivots;
+ float pivots[9]; // normalized to [0.0, 1.0] based on BL bit depth
+ uint8_t method[8]; // 0 = polynomial, 1 = MMR
+ // Note: these must be normalized (divide by coefficient_log2_denom)
+ float poly_coeffs[8][3]; // x^0, x^1, x^2, unused must be 0
+ uint8_t mmr_order[8]; // 1, 2 or 3
+ float mmr_constant[8];
+ float mmr_coeffs[8][3 /* order */][7];
+ } comp[3];
+};
+
+// Struct describing the underlying color system and representation. This
+// information is needed to convert an encoded color to a normalized RGB triple
+// in the range 0-1.
+struct pl_color_repr {
+ enum pl_color_system sys;
+ enum pl_color_levels levels;
+ enum pl_alpha_mode alpha;
+ struct pl_bit_encoding bits; // or {0} if unknown
+
+ // Metadata for PL_COLOR_SYSTEM_DOLBYVISION. Note that, for the sake of
+ // efficiency, this is treated purely as an opaque reference - functions
+ // like pl_color_repr_equal will merely do a pointer equality test.
+ //
+ // The only functions that actually dereference it in any way are
+ // pl_color_repr_decode, pl_shader_decode_color and pl_render_image(_mix).
+ const struct pl_dovi_metadata *dovi;
+};
+
+// Some common color representations. It's worth pointing out that all of these
+// presets leave `alpha` and `bits` as unknown - that is, only the system and
+// levels are predefined
+PL_API extern const struct pl_color_repr pl_color_repr_unknown;
+PL_API extern const struct pl_color_repr pl_color_repr_rgb;
+PL_API extern const struct pl_color_repr pl_color_repr_sdtv;
+PL_API extern const struct pl_color_repr pl_color_repr_hdtv; // also Blu-ray
+PL_API extern const struct pl_color_repr pl_color_repr_uhdtv; // SDR, NCL system
+PL_API extern const struct pl_color_repr pl_color_repr_jpeg;
+
+// Returns whether two colorspace representations are exactly identical.
+PL_API bool pl_color_repr_equal(const struct pl_color_repr *c1,
+ const struct pl_color_repr *c2);
+
+// Replaces unknown values in the first struct by those of the second struct.
+PL_API void pl_color_repr_merge(struct pl_color_repr *orig,
+ const struct pl_color_repr *update);
+
+// This function normalizes the color representation such that
+// color_depth=sample_depth and bit_shift=0; and returns the scaling factor
+// that must be multiplied into the color value to accomplish this, assuming
+// it has already been sampled by the GPU. If unknown, the color and sample
+// depth will both be inferred as 8 bits for the purposes of this conversion.
+PL_API float pl_color_repr_normalize(struct pl_color_repr *repr);
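+
+// For example (approximate): a full-range 10-bit color stored unpadded in a
+// 16-bit texture (sample_depth=16, color_depth=10, bit_shift=0) would yield a
+// scaling factor of roughly 65535.0 / 1023.0, i.e. about 64.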
+
+// Guesses the best color levels based on the specified color levels, falling
+// back to the color system if unknown. YCbCr-like systems are assumed to be
+// TV range; otherwise this defaults to PC range.
+PL_API enum pl_color_levels pl_color_levels_guess(const struct pl_color_repr *repr);
+
+// The colorspace's primaries (gamut)
+enum pl_color_primaries {
+ PL_COLOR_PRIM_UNKNOWN = 0,
+ // Standard gamut:
+ PL_COLOR_PRIM_BT_601_525, // ITU-R Rec. BT.601 (525-line = NTSC, SMPTE-C)
+ PL_COLOR_PRIM_BT_601_625, // ITU-R Rec. BT.601 (625-line = PAL, SECAM)
+ PL_COLOR_PRIM_BT_709, // ITU-R Rec. BT.709 (HD), also sRGB
+ PL_COLOR_PRIM_BT_470M, // ITU-R Rec. BT.470 M
+ PL_COLOR_PRIM_EBU_3213, // EBU Tech. 3213-E / JEDEC P22 phosphors
+ // Wide gamut:
+ PL_COLOR_PRIM_BT_2020, // ITU-R Rec. BT.2020 (UltraHD)
+ PL_COLOR_PRIM_APPLE, // Apple RGB
+ PL_COLOR_PRIM_ADOBE, // Adobe RGB (1998)
+ PL_COLOR_PRIM_PRO_PHOTO, // ProPhoto RGB (ROMM)
+ PL_COLOR_PRIM_CIE_1931, // CIE 1931 RGB primaries
+ PL_COLOR_PRIM_DCI_P3, // DCI-P3 (Digital Cinema)
+ PL_COLOR_PRIM_DISPLAY_P3, // DCI-P3 (Digital Cinema) with D65 white point
+ PL_COLOR_PRIM_V_GAMUT, // Panasonic V-Gamut (VARICAM)
+ PL_COLOR_PRIM_S_GAMUT, // Sony S-Gamut
+ PL_COLOR_PRIM_FILM_C, // Traditional film primaries with Illuminant C
+ PL_COLOR_PRIM_ACES_AP0, // ACES Primaries #0 (ultra wide)
+ PL_COLOR_PRIM_ACES_AP1, // ACES Primaries #1
+ PL_COLOR_PRIM_COUNT
+};
+
+PL_API bool pl_color_primaries_is_wide_gamut(enum pl_color_primaries prim);
+
+// Guesses the best primaries based on a resolution. This always guesses
+// conservatively, i.e. it will never return a wide gamut color space even if
+// the resolution is 4K.
+PL_API enum pl_color_primaries pl_color_primaries_guess(int width, int height);
+
+// The colorspace's transfer function (gamma / EOTF)
+enum pl_color_transfer {
+ PL_COLOR_TRC_UNKNOWN = 0,
+ // Standard dynamic range:
+ PL_COLOR_TRC_BT_1886, // ITU-R Rec. BT.1886 (CRT emulation + OOTF)
+ PL_COLOR_TRC_SRGB, // IEC 61966-2-4 sRGB (CRT emulation)
+ PL_COLOR_TRC_LINEAR, // Linear light content
+ PL_COLOR_TRC_GAMMA18, // Pure power gamma 1.8
+ PL_COLOR_TRC_GAMMA20, // Pure power gamma 2.0
+ PL_COLOR_TRC_GAMMA22, // Pure power gamma 2.2
+ PL_COLOR_TRC_GAMMA24, // Pure power gamma 2.4
+ PL_COLOR_TRC_GAMMA26, // Pure power gamma 2.6
+ PL_COLOR_TRC_GAMMA28, // Pure power gamma 2.8
+ PL_COLOR_TRC_PRO_PHOTO, // ProPhoto RGB (ROMM)
+ PL_COLOR_TRC_ST428, // Digital Cinema Distribution Master (XYZ)
+ // High dynamic range:
+ PL_COLOR_TRC_PQ, // ITU-R BT.2100 PQ (perceptual quantizer), aka SMPTE ST 2084
+ PL_COLOR_TRC_HLG, // ITU-R BT.2100 HLG (hybrid log-gamma), aka ARIB STD-B67
+ PL_COLOR_TRC_V_LOG, // Panasonic V-Log (VARICAM)
+ PL_COLOR_TRC_S_LOG1, // Sony S-Log1
+ PL_COLOR_TRC_S_LOG2, // Sony S-Log2
+ PL_COLOR_TRC_COUNT
+};
+
+// Returns the nominal peak of a given transfer function, relative to the
+// reference white. This refers to the highest encodable signal level.
+// Always equal to 1.0 for SDR curves.
+//
+// Note: For HLG in particular, which is scene-referred, this returns the
+// highest nominal peak in scene-referred space (3.77), which may be different
+// from the actual peak in display space after application of the HLG OOTF.
+PL_API float pl_color_transfer_nominal_peak(enum pl_color_transfer trc);
+
+static inline bool pl_color_transfer_is_hdr(enum pl_color_transfer trc)
+{
+ return pl_color_transfer_nominal_peak(trc) > 1.0;
+}
+
+// This defines the display-space standard reference white level (in cd/m^2)
+// that is assumed for SDR content, for use when mapping between HDR and SDR in
+// display space. See ITU-R Report BT.2408 for more information.
+#define PL_COLOR_SDR_WHITE 203.0f
+
+// This defines the assumed contrast level of an unknown SDR display. This
+// will be used to determine the black point in the absence of any tagged
+// minimum luminance, relative to the tagged maximum luminance (or
+// PL_COLOR_SDR_WHITE in the absence of all tagging)
+#define PL_COLOR_SDR_CONTRAST 1000.0f
+
+// This defines the default black point assumed for "infinite contrast" HDR
+// displays. This is not exactly 0.0 because a value of 0.0 is interpreted
+// as "unknown / missing metadata" inside struct pl_hdr_metadata, and also
+// to avoid numerical issues in a variety of tone mapping functions.
+// Essentially, a black level below this number is functionally meaningless
+// inside libplacebo, and will be clamped to this value regardless.
+//
+// The value used here (1e-6) is about one 13-bit PQ step above absolute zero,
+// which is a small fraction of the human JND at this brightness level, and also
+// about 3 bits above the floating point machine epsilon.
+#define PL_COLOR_HDR_BLACK 1e-6f
+
+// This defines the assumed peak brightness of a HLG display with no HDR10
+// metadata. This is set to the brightness of a "nominal" HLG reference display.
+#define PL_COLOR_HLG_PEAK 1000.0f
+
+// Represents a single CIE xy coordinate (e.g. CIE Yxy with Y = 1.0)
+struct pl_cie_xy {
+ float x, y;
+};
+
+// Creates a pl_cie_xyz from raw XYZ values
+static inline struct pl_cie_xy pl_cie_from_XYZ(float X, float Y, float Z)
+{
+ float k = 1.0f / (X + Y + Z);
+ struct pl_cie_xy xy = { k * X, k * Y };
+ return xy;
+}
+
+// Recovers (X / Y) from a CIE xy value.
+static inline float pl_cie_X(struct pl_cie_xy xy)
+{
+ return xy.x / xy.y;
+}
+
+// Recovers (Z / Y) from a CIE xy value.
+static inline float pl_cie_Z(struct pl_cie_xy xy)
+{
+ return (1 - xy.x - xy.y) / xy.y;
+}
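+
+// For example, given a chromaticity `xy` and a known luminance `Y`, the full
+// XYZ triple can be recovered as:
+//
+//   float X = pl_cie_X(xy) * Y;
+//   float Z = pl_cie_Z(xy) * Y;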
+
+static inline bool pl_cie_xy_equal(const struct pl_cie_xy *a,
+ const struct pl_cie_xy *b)
+{
+ return a->x == b->x && a->y == b->y;
+}
+
+// Computes the CIE xy chromaticity coordinates of a CIE D-series illuminant
+// with the given correlated color temperature.
+//
+// `temperature` must be between 2500 K and 25000 K, inclusive.
+PL_API struct pl_cie_xy pl_white_from_temp(float temperature);
+
+// Represents the raw physical primaries corresponding to a color space.
+struct pl_raw_primaries {
+ struct pl_cie_xy red, green, blue, white;
+};
+
+// Returns whether two raw primaries are exactly identical.
+PL_API bool pl_raw_primaries_equal(const struct pl_raw_primaries *a,
+ const struct pl_raw_primaries *b);
+
+// Returns whether two raw primaries are approximately equal
+PL_API bool pl_raw_primaries_similar(const struct pl_raw_primaries *a,
+ const struct pl_raw_primaries *b);
+
+// Replaces unknown values in the first struct by those of the second struct.
+PL_API void pl_raw_primaries_merge(struct pl_raw_primaries *orig,
+ const struct pl_raw_primaries *update);
+
+// Returns the raw primaries for a given color space.
+PL_API const struct pl_raw_primaries *pl_raw_primaries_get(enum pl_color_primaries prim);
+
+enum pl_hdr_scaling {
+ PL_HDR_NORM = 0, // 0.0 is absolute black, 1.0 is PL_COLOR_SDR_WHITE
+ PL_HDR_SQRT, // sqrt() of PL_HDR_NORM values
+ PL_HDR_NITS, // absolute brightness in raw cd/m²
+ PL_HDR_PQ, // absolute brightness in PQ (0.0 to 1.0)
+ PL_HDR_SCALING_COUNT,
+};
+
+// Generic helper for performing HDR scale conversions.
+PL_API float pl_hdr_rescale(enum pl_hdr_scaling from, enum pl_hdr_scaling to, float x);
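+
+// For example, the approximate PQ value corresponding to the SDR reference
+// white can be obtained as:
+//
+//   float pq = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, PL_COLOR_SDR_WHITE);
+//   // pq is about 0.58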
+
+enum pl_hdr_metadata_type {
+ PL_HDR_METADATA_ANY = 0,
+ PL_HDR_METADATA_NONE,
+ PL_HDR_METADATA_HDR10, // HDR10 static mastering display metadata
+ PL_HDR_METADATA_HDR10PLUS, // HDR10+ dynamic metadata
+ PL_HDR_METADATA_CIE_Y, // CIE Y derived dynamic luminance metadata
+ PL_HDR_METADATA_TYPE_COUNT,
+};
+
+// Bezier curve for HDR metadata
+struct pl_hdr_bezier {
+ float target_luma; // target luminance (cd/m²) for this OOTF
+ float knee_x, knee_y; // cross-over knee point (0-1)
+ float anchors[15]; // intermediate bezier curve control points (0-1)
+ uint8_t num_anchors;
+};
+
+// Represents raw HDR metadata as defined by SMPTE 2086 / CTA 861.3, which is
+// often attached to HDR sources and can be forwarded to HDR-capable displays,
+// or used to guide the libplacebo built-in tone mapping. Values left as 0
+// are treated as unknown by libplacebo.
+//
+// Note: This means that a value of `min_luma == 0.0` gets treated as "minimum
+// luminance not known", which in practice may end up inferring a default
+// contrast of 1000:1 for SDR transfer functions. To avoid this, the user should
+// set these fields to a low positive value, e.g. PL_COLOR_HDR_BLACK, to signal
+// a "zero" black point (i.e. infinite contrast display).
+struct pl_hdr_metadata {
+ // --- PL_HDR_METADATA_HDR10
+ // Mastering display metadata.
+ struct pl_raw_primaries prim; // mastering display primaries
+ float min_luma, max_luma; // min/max luminance (in cd/m²)
+
+ // Content light level. (Note: this is ignored by libplacebo itself)
+ float max_cll; // max content light level (in cd/m²)
+ float max_fall; // max frame average light level (in cd/m²)
+
+ // --- PL_HDR_METADATA_HDR10PLUS
+ float scene_max[3]; // maxSCL in cd/m² per component (RGB)
+ float scene_avg; // average of maxRGB in cd/m²
+ struct pl_hdr_bezier ootf; // reference OOTF (optional)
+
+ // --- PL_HDR_METADATA_CIE_Y
+ float max_pq_y; // maximum PQ luminance (in PQ, 0-1)
+ float avg_pq_y; // averaged PQ luminance (in PQ, 0-1)
+};
+
+PL_API extern const struct pl_hdr_metadata pl_hdr_metadata_empty; // equal to {0}
+PL_API extern const struct pl_hdr_metadata pl_hdr_metadata_hdr10; // generic HDR10 display
+
+// Returns whether two sets of HDR metadata are exactly identical.
+PL_API bool pl_hdr_metadata_equal(const struct pl_hdr_metadata *a,
+ const struct pl_hdr_metadata *b);
+
+// Replaces unknown values in the first struct by those of the second struct.
+PL_API void pl_hdr_metadata_merge(struct pl_hdr_metadata *orig,
+ const struct pl_hdr_metadata *update);
+
+// Returns `true` if `data` contains a complete set of a given metadata type.
+// Note: for PL_HDR_METADATA_HDR10, only `min_luma` and `max_luma` are
+// considered - CLL/FALL and primaries are irrelevant for HDR tone-mapping.
+PL_API bool pl_hdr_metadata_contains(const struct pl_hdr_metadata *data,
+ enum pl_hdr_metadata_type type);
+
+// Rendering intent for colorspace transformations. These constants match the
+// ICC specification (Table 23)
+enum pl_rendering_intent {
+ PL_INTENT_AUTO = -1, // not a valid ICC intent, but used to auto-infer
+ PL_INTENT_PERCEPTUAL = 0,
+ PL_INTENT_RELATIVE_COLORIMETRIC = 1,
+ PL_INTENT_SATURATION = 2,
+ PL_INTENT_ABSOLUTE_COLORIMETRIC = 3
+};
+
+// Struct describing a physical color space. This information is needed to
+// turn a normalized RGB triple into its physical meaning, as well as to convert
+// between color spaces.
+struct pl_color_space {
+ enum pl_color_primaries primaries;
+ enum pl_color_transfer transfer;
+
+ // HDR metadata for this color space, if present. (Optional)
+ struct pl_hdr_metadata hdr;
+};
+
+#define pl_color_space(...) (&(struct pl_color_space) { __VA_ARGS__ })
+
+// Returns whether or not a color space is considered effectively HDR.
+// This is true when the effective signal peak is greater than the SDR
+// reference white (1.0), taking into account `csp->hdr`.
+PL_API bool pl_color_space_is_hdr(const struct pl_color_space *csp);
+
+// Returns whether or not a color space is "black scaled", in which case 0.0 is
+// the true black point. This is true for SDR signals other than BT.1886, as
+// well as for HLG.
+PL_API bool pl_color_space_is_black_scaled(const struct pl_color_space *csp);
+
+struct pl_nominal_luma_params {
+ // The color space to infer luminance from
+ const struct pl_color_space *color;
+
+ // Which type of metadata to draw values from
+ enum pl_hdr_metadata_type metadata;
+
+ // This field controls the scaling of `out_*`
+ enum pl_hdr_scaling scaling;
+
+ // Fields to write the detected nominal luminance to. (Optional)
+ //
+ // For SDR displays, this will default to a contrast level of 1000:1 unless
+ // indicated otherwise in the `min/max_luma` static HDR10 metadata fields.
+ float *out_min;
+ float *out_max;
+
+ // Field to write the detected average luminance to, or 0.0 in the absence
+ // of dynamic metadata. (Optional)
+ float *out_avg;
+};
+
+#define pl_nominal_luma_params(...) \
+ (&(struct pl_nominal_luma_params) { __VA_ARGS__ })
+
+// Returns the effective luminance described by a pl_color_space.
+PL_API void pl_color_space_nominal_luma_ex(const struct pl_nominal_luma_params *params);
+
+// Backwards compatibility wrapper for `pl_color_space_nominal_luma_ex`
+PL_DEPRECATED PL_API void pl_color_space_nominal_luma(const struct pl_color_space *csp,
+ float *out_min, float *out_max);
+
+// Replaces unknown values in the first struct by those of the second struct.
+PL_API void pl_color_space_merge(struct pl_color_space *orig,
+ const struct pl_color_space *update);
+
+// Returns whether two colorspaces are exactly identical.
+PL_API bool pl_color_space_equal(const struct pl_color_space *c1,
+ const struct pl_color_space *c2);
+
+// Go through a color-space and explicitly default all unknown fields to
+// reasonable values. After this function is called, none of the values will be
+// PL_COLOR_*_UNKNOWN or 0.0, except for the dynamic HDR metadata fields.
+PL_API void pl_color_space_infer(struct pl_color_space *space);
+
+// Like `pl_color_space_infer`, but takes default values from the reference
+// color space (excluding certain special cases like HDR or wide gamut).
+PL_API void pl_color_space_infer_ref(struct pl_color_space *space,
+ const struct pl_color_space *ref);
+
+// Infer both the source and destination gamut simultaneously, and also adjust
+// values for optimal display. This is mostly the same as
+// `pl_color_space_infer(src)` followed by `pl_color_space_infer_ref`, but also
+// takes into account the SDR contrast levels and PQ black points. This is
+// basically the logic used by `pl_shader_color_map` and `pl_renderer` to
+// decide the output color space in a conservative way and compute the final
+// end-to-end color transformation that needs to be done.
+PL_API void pl_color_space_infer_map(struct pl_color_space *src,
+ struct pl_color_space *dst);
+
+// Some common color spaces. Note: These don't necessarily have all fields
+// filled, in particular `hdr` is left unset.
+PL_API extern const struct pl_color_space pl_color_space_unknown;
+PL_API extern const struct pl_color_space pl_color_space_srgb;
+PL_API extern const struct pl_color_space pl_color_space_bt709;
+PL_API extern const struct pl_color_space pl_color_space_hdr10;
+PL_API extern const struct pl_color_space pl_color_space_bt2020_hlg;
+PL_API extern const struct pl_color_space pl_color_space_monitor; // typical display
+
+// This represents metadata about extra operations to perform during colorspace
+// conversion, which correspond to artistic adjustments of the color.
+struct pl_color_adjustment {
+ // Brightness boost. 0.0 = neutral, 1.0 = solid white, -1.0 = solid black
+ float brightness;
+ // Contrast boost. 1.0 = neutral, 0.0 = solid black
+ float contrast;
+ // Saturation gain. 1.0 = neutral, 0.0 = grayscale
+ float saturation;
+ // Hue shift, corresponding to a rotation around the [U, V] subvector, in
+ // radians. 0.0 = neutral
+ float hue;
+ // Gamma adjustment. 1.0 = neutral, 0.0 = solid black
+ float gamma;
+ // Color temperature shift. 0.0 = 6500 K, -1.0 = 3000 K, 1.0 = 10000 K
+ float temperature;
+};
+
+#define PL_COLOR_ADJUSTMENT_NEUTRAL \
+ .contrast = 1.0, \
+ .saturation = 1.0, \
+ .gamma = 1.0,
+
+#define pl_color_adjustment(...) (&(struct pl_color_adjustment) { PL_COLOR_ADJUSTMENT_NEUTRAL __VA_ARGS__ })
+PL_API extern const struct pl_color_adjustment pl_color_adjustment_neutral;
+
+// Represents the chroma placement with respect to the luma samples. This is
+// only relevant for YCbCr-like colorspaces with chroma subsampling.
+enum pl_chroma_location {
+ PL_CHROMA_UNKNOWN = 0,
+ PL_CHROMA_LEFT, // MPEG2/4, H.264
+ PL_CHROMA_CENTER, // MPEG1, JPEG
+ PL_CHROMA_TOP_LEFT,
+ PL_CHROMA_TOP_CENTER,
+ PL_CHROMA_BOTTOM_LEFT,
+ PL_CHROMA_BOTTOM_CENTER,
+ PL_CHROMA_COUNT,
+};
+
+// Fills *x and *y with the offset in luma pixels corresponding to a given
+// chroma location.
+//
+// Note: PL_CHROMA_UNKNOWN defaults to PL_CHROMA_LEFT
+PL_API void pl_chroma_location_offset(enum pl_chroma_location loc, float *x, float *y);
+
+// Returns an RGB->XYZ conversion matrix for a given set of primaries.
+// Multiplying this into the RGB color transforms it to CIE XYZ, centered
+// around the color space's white point.
+PL_API pl_matrix3x3 pl_get_rgb2xyz_matrix(const struct pl_raw_primaries *prim);
+
+// Similar to pl_get_rgb2xyz_matrix, but gives the inverse transformation.
+PL_API pl_matrix3x3 pl_get_xyz2rgb_matrix(const struct pl_raw_primaries *prim);
+
+// Returns a primary adaptation matrix, which converts from one set of
+// primaries to another. This is an RGB->RGB transformation. For rendering
+// intents other than PL_INTENT_ABSOLUTE_COLORIMETRIC, the white point is
+// adapted using the Bradford matrix.
+PL_API pl_matrix3x3 pl_get_color_mapping_matrix(const struct pl_raw_primaries *src,
+ const struct pl_raw_primaries *dst,
+ enum pl_rendering_intent intent);
+
+// Return a chromatic adaptation matrix, which converts from one white point to
+// another, using the Bradford matrix. This is an RGB->RGB transformation.
+PL_API pl_matrix3x3 pl_get_adaptation_matrix(struct pl_cie_xy src, struct pl_cie_xy dst);
+
+// Returns true if 'b' is entirely contained in 'a'. Useful for figuring out if
+// colorimetric clipping will occur or not.
+PL_API bool pl_primaries_superset(const struct pl_raw_primaries *a,
+ const struct pl_raw_primaries *b);
+
+// Returns true if `prim` forms a nominally valid set of primaries. This does
+// not check whether or not these primaries are actually physically realisable,
+// merely that they satisfy the requirements for colorspace math (to avoid NaN).
+PL_API bool pl_primaries_valid(const struct pl_raw_primaries *prim);
+
+// Returns true if two primaries are 'compatible', which is the case if
+// they preserve the relationship between primaries (red=red, green=green,
+// blue=blue). In other words, this is false for synthetic primaries that have
+// channels misordered from the convention (e.g. for some test ICC profiles).
+PL_API bool pl_primaries_compatible(const struct pl_raw_primaries *a,
+ const struct pl_raw_primaries *b);
+
+// Clip points in the first gamut (src) to be fully contained inside the second
+// gamut (dst). Only works on compatible primaries (pl_primaries_compatible).
+PL_API struct pl_raw_primaries
+pl_primaries_clip(const struct pl_raw_primaries *src,
+ const struct pl_raw_primaries *dst);
+
+// Primary-dependent RGB->LMS matrix for the IPTPQc4 color system. This is
+// derived from the HPE XYZ->LMS matrix with 4% crosstalk added.
+PL_API pl_matrix3x3 pl_ipt_rgb2lms(const struct pl_raw_primaries *prim);
+PL_API pl_matrix3x3 pl_ipt_lms2rgb(const struct pl_raw_primaries *prim);
+
+// Primary-independent L'M'S' -> IPT matrix for the IPTPQc4 color system, and
+// its inverse. This is identical to the Ebner & Fairchild (1998) IPT matrix.
+PL_API extern const pl_matrix3x3 pl_ipt_lms2ipt;
+PL_API extern const pl_matrix3x3 pl_ipt_ipt2lms;
+
+// Cone types involved in human vision
+enum pl_cone {
+ PL_CONE_L = 1 << 0,
+ PL_CONE_M = 1 << 1,
+ PL_CONE_S = 1 << 2,
+
+ // Convenience aliases
+ PL_CONE_NONE = 0,
+ PL_CONE_LM = PL_CONE_L | PL_CONE_M,
+ PL_CONE_MS = PL_CONE_M | PL_CONE_S,
+ PL_CONE_LS = PL_CONE_L | PL_CONE_S,
+ PL_CONE_LMS = PL_CONE_L | PL_CONE_M | PL_CONE_S,
+};
+
+// Structure describing parameters for simulating color blindness
+struct pl_cone_params {
+ enum pl_cone cones; // Which cones are *affected* by the vision model
+ float strength; // Coefficient for how strong the defect is
+ // (1.0 = Unaffected, 0.0 = Full blindness)
+};
+
+#define pl_cone_params(...) (&(struct pl_cone_params) { __VA_ARGS__ })
+
+// Built-in color blindness models
+PL_API extern const struct pl_cone_params pl_vision_normal; // No distortion (92%)
+PL_API extern const struct pl_cone_params pl_vision_protanomaly; // Red deficiency (0.66%)
+PL_API extern const struct pl_cone_params pl_vision_protanopia; // Red absence (0.59%)
+PL_API extern const struct pl_cone_params pl_vision_deuteranomaly; // Green deficiency (2.7%)
+PL_API extern const struct pl_cone_params pl_vision_deuteranopia; // Green absence (0.56%)
+PL_API extern const struct pl_cone_params pl_vision_tritanomaly; // Blue deficiency (0.01%)
+PL_API extern const struct pl_cone_params pl_vision_tritanopia; // Blue absence (0.016%)
+PL_API extern const struct pl_cone_params pl_vision_monochromacy; // Blue cones only (<0.001%)
+PL_API extern const struct pl_cone_params pl_vision_achromatopsia; // Rods only (<0.0001%)
+
+// Returns a cone adaptation matrix. Applying this to an RGB color in the given
+// color space will apply the given cone adaptation coefficients for simulating
+// a type of color blindness.
+//
+// For the color blindness models which don't entail complete loss of a cone,
+// you can partially counteract the effect by using a similar model with the
+// `strength` set to its inverse. For example, to partially counteract
+// deuteranomaly, you could generate a cone matrix for PL_CONE_M with the
+// strength 2.0 (or some other number above 1.0).
+PL_API pl_matrix3x3 pl_get_cone_matrix(const struct pl_cone_params *params,
+ const struct pl_raw_primaries *prim);
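+
+// Illustrative sketch (not part of the API): simulating deuteranomaly on a
+// linear RGB sample `rgb` (float[3]) in the primaries `prim`, both
+// caller-provided:
+//
+//   pl_matrix3x3 cone_mat = pl_get_cone_matrix(
+//       pl_cone_params( .cones = PL_CONE_M, .strength = 0.5 ), prim);
+//   pl_matrix3x3_apply(&cone_mat, rgb);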
+
+// Returns a color decoding matrix for a given combination of source color
+// representation and adjustment parameters. This mutates `repr` to reflect the
+// change. If `params` is NULL, it defaults to &pl_color_adjustment_neutral.
+//
+// This function always performs a conversion to RGB. To convert to other
+// colorspaces (e.g. between YUV systems), obtain a second YUV->RGB matrix
+// and invert it using `pl_transform3x3_invert`.
+//
+// Note: For BT.2020 constant-luminance, this outputs chroma information in the
+// range [-0.5, 0.5]. Since the CL system conversion is non-linear, further
+// processing must be done by the caller. The channel order is CrYCb.
+//
+// Note: For BT.2100 ICtCp, this outputs in the color space L'M'S'. Further
+// non-linear processing must be done by the caller.
+//
+// Note: The XYZ system is expected to be in DCDM X'Y'Z' encoding (SMPTE ST
+// 428-1). In practice, this means normalizing by a factor of (48.0 / 52.37)
+// and applying a 2.6 gamma.
+PL_API pl_transform3x3 pl_color_repr_decode(struct pl_color_repr *repr,
+ const struct pl_color_adjustment *params);
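+
+// Illustrative sketch (not part of the API): decoding a single YCbCr sample
+// `color` (float[3]) described by a caller-provided `repr` into RGB:
+//
+//   pl_transform3x3 tr = pl_color_repr_decode(&repr, NULL);
+//   pl_transform3x3_apply(&tr, color); // `color` now holds RGB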
+
+// Common struct to describe an ICC profile
+struct pl_icc_profile {
+ // Points to the in-memory representation of the ICC profile. This is
+ // allowed to be NULL, in which case the `pl_icc_profile` represents "no
+ // profile".
+ const void *data;
+ size_t len;
+
+ // If a profile is set, this signature must uniquely identify it (including
+ // across restarts, for caching), ideally using a checksum of the profile
+ // contents. The user is free to choose the method of determining this
+ // signature, but note the existence of the
+ // `pl_icc_profile_compute_signature` helper.
+ uint64_t signature;
+};
+
+#define pl_icc_profile(...) &(struct pl_icc_profile) { __VA_ARGS__ }
+
+// This doesn't do a comparison of the actual contents, only of the signature.
+PL_API bool pl_icc_profile_equal(const struct pl_icc_profile *p1,
+ const struct pl_icc_profile *p2);
+
+// Sets `signature` to a hash of `profile->data`, if non-NULL. Provided as a
+// convenience function for the sake of users ingesting arbitrary ICC profiles
+// from sources where they can't reliably detect profile changes.
+//
+// Note: This is based on a very fast hash, and will compute a signature for
+// even large (10 MB) ICC profiles in, typically, a fraction of a millisecond.
+PL_API void pl_icc_profile_compute_signature(struct pl_icc_profile *profile);
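+
+// Illustrative sketch (not part of the API): wrapping an in-memory profile
+// `buf` of `size` bytes (both caller-provided) and detecting changes:
+//
+//   struct pl_icc_profile icc = { .data = buf, .len = size };
+//   pl_icc_profile_compute_signature(&icc);
+//   bool changed = !pl_icc_profile_equal(&icc, &previous_icc);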
+
+PL_API_END
+
+#endif // LIBPLACEBO_COLORSPACE_H_
diff --git a/src/include/libplacebo/common.h b/src/include/libplacebo/common.h
new file mode 100644
index 0000000..806730c
--- /dev/null
+++ b/src/include/libplacebo/common.h
@@ -0,0 +1,244 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_COMMON_H_
+#define LIBPLACEBO_COMMON_H_
+
+#include <stdbool.h>
+
+#include <libplacebo/config.h>
+
+PL_API_BEGIN
+
+// Some common utility types. These are overloaded to support 2D, 3D and
+// integer/float variants.
+typedef struct pl_rect2d {
+ int x0, y0;
+ int x1, y1;
+} pl_rect2d;
+
+typedef struct pl_rect3d {
+ int x0, y0, z0;
+ int x1, y1, z1;
+} pl_rect3d;
+
+typedef struct pl_rect2df {
+ float x0, y0;
+ float x1, y1;
+} pl_rect2df;
+
+typedef struct pl_rect3df {
+ float x0, y0, z0;
+ float x1, y1, z1;
+} pl_rect3df;
+
+// These macros will work for any of the above pl_rect variants (with enough
+// dimensions). Careful: double-evaluation hazard
+#define pl_rect_w(r) ((r).x1 - (r).x0)
+#define pl_rect_h(r) ((r).y1 - (r).y0)
+#define pl_rect_d(r) ((r).z1 - (r).z0)
+
+#define pl_rect2d_eq(a, b) \
+ ((a).x0 == (b).x0 && (a).x1 == (b).x1 && \
+ (a).y0 == (b).y0 && (a).y1 == (b).y1)
+
+#define pl_rect3d_eq(a, b) \
+ ((a).x0 == (b).x0 && (a).x1 == (b).x1 && \
+ (a).y0 == (b).y0 && (a).y1 == (b).y1 && \
+ (a).z0 == (b).z0 && (a).z1 == (b).z1)
+
+// "Normalize" a rectangle: This ensures d1 >= d0 for all dimensions.
+PL_API void pl_rect2d_normalize(pl_rect2d *rc);
+PL_API void pl_rect3d_normalize(pl_rect3d *rc);
+
+PL_API void pl_rect2df_normalize(pl_rect2df *rc);
+PL_API void pl_rect3df_normalize(pl_rect3df *rc);
+
+// Return the rounded form of a rect.
+PL_API pl_rect2d pl_rect2df_round(const pl_rect2df *rc);
+PL_API pl_rect3d pl_rect3df_round(const pl_rect3df *rc);
+
+// Represents a row-major matrix, i.e. the following matrix
+// [ a11 a12 a13 ]
+// [ a21 a22 a23 ]
+// [ a31 a32 a33 ]
+// is represented in C like this:
+// { { a11, a12, a13 },
+// { a21, a22, a23 },
+// { a31, a32, a33 } };
+typedef struct pl_matrix3x3 {
+ float m[3][3];
+} pl_matrix3x3;
+
+PL_API extern const pl_matrix3x3 pl_matrix3x3_identity;
+
+// Applies a matrix to a float vector in-place.
+PL_API void pl_matrix3x3_apply(const pl_matrix3x3 *mat, float vec[3]);
+
+// Applies a matrix to a pl_rect3df
+PL_API void pl_matrix3x3_apply_rc(const pl_matrix3x3 *mat, pl_rect3df *rc);
+
+// Scales a color matrix by a linear factor.
+PL_API void pl_matrix3x3_scale(pl_matrix3x3 *mat, float scale);
+
+// Inverts a matrix. Only use where precision is not that important.
+PL_API void pl_matrix3x3_invert(pl_matrix3x3 *mat);
+
+// Composes/multiplies two matrices. Multiplies B into A, i.e.
+// A := A * B
+PL_API void pl_matrix3x3_mul(pl_matrix3x3 *a, const pl_matrix3x3 *b);
+
+// Flipped version of `pl_matrix3x3_mul`.
+// B := A * B
+PL_API void pl_matrix3x3_rmul(const pl_matrix3x3 *a, pl_matrix3x3 *b);
+
+// Represents an affine transformation, which is basically a 3x3 matrix
+// together with a column vector to add onto the output.
+typedef struct pl_transform3x3 {
+ pl_matrix3x3 mat;
+ float c[3];
+} pl_transform3x3;
+
+PL_API extern const pl_transform3x3 pl_transform3x3_identity;
+
+// Applies a transform to a float vector in-place.
+PL_API void pl_transform3x3_apply(const pl_transform3x3 *t, float vec[3]);
+
+// Applies a transform to a pl_rect3df
+PL_API void pl_transform3x3_apply_rc(const pl_transform3x3 *t, pl_rect3df *rc);
+
+// Scales the output of a transform by a linear factor. Since an affine
+// transformation includes a constant offset, this is not the same as scaling
+// its input. If you want to scale the *input* of a transform, use
+// pl_matrix3x3_scale on `t.mat`.
+PL_API void pl_transform3x3_scale(pl_transform3x3 *t, float scale);
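+
+// Illustrative sketch: for a transform y = M*x + c, the two scaling helpers
+// behave differently:
+//
+//   pl_transform3x3_scale(&t, 2.0f);  // y = 2*(M*x + c), scales the output
+//   pl_matrix3x3_scale(&t.mat, 2.0f); // y = (2*M)*x + c, scales the input only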
+
+// Inverts a transform. Only use where precision is not that important.
+PL_API void pl_transform3x3_invert(pl_transform3x3 *t);
+
+// 2D analog of the above structs. Since these are featured less prominently,
+// we omit some of the other helper functions.
+typedef struct pl_matrix2x2 {
+ float m[2][2];
+} pl_matrix2x2;
+
+PL_API extern const pl_matrix2x2 pl_matrix2x2_identity;
+PL_API pl_matrix2x2 pl_matrix2x2_rotation(float angle);
+
+PL_API void pl_matrix2x2_apply(const pl_matrix2x2 *mat, float vec[2]);
+PL_API void pl_matrix2x2_apply_rc(const pl_matrix2x2 *mat, pl_rect2df *rc);
+
+PL_API void pl_matrix2x2_mul(pl_matrix2x2 *a, const pl_matrix2x2 *b);
+PL_API void pl_matrix2x2_rmul(const pl_matrix2x2 *a, pl_matrix2x2 *b);
+
+PL_API void pl_matrix2x2_scale(pl_matrix2x2 *mat, float scale);
+PL_API void pl_matrix2x2_invert(pl_matrix2x2 *mat);
+
+typedef struct pl_transform2x2 {
+ pl_matrix2x2 mat;
+ float c[2];
+} pl_transform2x2;
+
+PL_API extern const pl_transform2x2 pl_transform2x2_identity;
+
+PL_API void pl_transform2x2_apply(const pl_transform2x2 *t, float vec[2]);
+PL_API void pl_transform2x2_apply_rc(const pl_transform2x2 *t, pl_rect2df *rc);
+
+PL_API void pl_transform2x2_mul(pl_transform2x2 *a, const pl_transform2x2 *b);
+PL_API void pl_transform2x2_rmul(const pl_transform2x2 *a, pl_transform2x2 *b);
+
+PL_API void pl_transform2x2_scale(pl_transform2x2 *t, float scale);
+PL_API void pl_transform2x2_invert(pl_transform2x2 *t);
+
+// Compute new bounding box of a transformation (as applied to a given rect).
+PL_API pl_rect2df pl_transform2x2_bounds(const pl_transform2x2 *t,
+ const pl_rect2df *rc);
+
+// Helper functions for dealing with aspect ratios and stretched/scaled rects.
+
+// Return the (absolute) aspect ratio (width/height) of a given pl_rect2df.
+// This will always be a positive number, even if `rc` is flipped.
+PL_API float pl_rect2df_aspect(const pl_rect2df *rc);
+
+// Set the aspect of a `rc` to a given aspect ratio with an extra 'panscan'
+// factor choosing the balance between shrinking and growing the `rc` to meet
+// this aspect ratio.
+//
+// Notes:
+// - If `panscan` is 0.0, this function will only ever shrink the `rc`.
+// - If `panscan` is 1.0, this function will only ever grow the `rc`.
+// - If `panscan` is 0.5, this function is area-preserving.
+PL_API void pl_rect2df_aspect_set(pl_rect2df *rc, float aspect, float panscan);
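+
+// Illustrative sketch: force a caller-provided `rc` to a 16:9 aspect ratio,
+// only ever shrinking it (letterbox/pillarbox behaviour):
+//
+//   pl_rect2df_aspect_set(&rc, 16.0f / 9.0f, 0.0f);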
+
+// Set one rect's aspect to that of another
+#define pl_rect2df_aspect_copy(rc, src, panscan) \
+ pl_rect2df_aspect_set((rc), pl_rect2df_aspect(src), (panscan))
+
+// 'Fit' one rect inside another. `rc` will be set to the same size and aspect
+// ratio as `src`, but with the size limited to fit inside the original `rc`.
+// Like `pl_rect2df_aspect_set`, `panscan` controls the pan&scan factor.
+PL_API void pl_rect2df_aspect_fit(pl_rect2df *rc, const pl_rect2df *src, float panscan);
+
+// Scale rect in each direction while keeping it centered.
+PL_API void pl_rect2df_stretch(pl_rect2df *rc, float stretch_x, float stretch_y);
+
+// Offset rect by an arbitrary offset factor. If the corresponding dimension
+// of a rect is flipped, so too is the applied offset.
+PL_API void pl_rect2df_offset(pl_rect2df *rc, float offset_x, float offset_y);
+
+// Scale a rect uniformly in both dimensions.
+#define pl_rect2df_zoom(rc, zoom) pl_rect2df_stretch((rc), (zoom), (zoom))
+
+// Rotation in degrees clockwise
+typedef int pl_rotation;
+enum {
+ PL_ROTATION_0 = 0,
+ PL_ROTATION_90 = 1,
+ PL_ROTATION_180 = 2,
+ PL_ROTATION_270 = 3,
+ PL_ROTATION_360 = 4, // equivalent to PL_ROTATION_0
+
+ // Note: Values outside the range [0,4) are legal, including negatives.
+};
+
+// Constrains to the interval [PL_ROTATION_0, PL_ROTATION_360).
+static inline pl_rotation pl_rotation_normalize(pl_rotation rot)
+{
+ return (rot % PL_ROTATION_360 + PL_ROTATION_360) % PL_ROTATION_360;
+}
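+
+// For example:
+//   pl_rotation_normalize(-PL_ROTATION_90)                   == PL_ROTATION_270
+//   pl_rotation_normalize(PL_ROTATION_360 + PL_ROTATION_180) == PL_ROTATION_180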
+
+// Rotates the coordinate system of a `pl_rect2d(f)` in a certain direction.
+// For example, calling this with PL_ROTATION_90 will correspond to rotating
+// the coordinate system 90° to the right (so the x axis becomes the y axis).
+//
+// The resulting rect is re-normalized in the same coordinate system.
+PL_API void pl_rect2df_rotate(pl_rect2df *rc, pl_rotation rot);
+
+// Returns the aspect ratio in a rotated frame of reference.
+static inline float pl_aspect_rotate(float aspect, pl_rotation rot)
+{
+ return (rot % PL_ROTATION_180) ? 1.0 / aspect : aspect;
+}
+
+#define pl_rect2df_aspect_set_rot(rc, aspect, rot, panscan) \
+ pl_rect2df_aspect_set((rc), pl_aspect_rotate((aspect), (rot)), (panscan))
+
+#define pl_rect2df_aspect_copy_rot(rc, src, panscan, rot) \
+ pl_rect2df_aspect_set_rot((rc), pl_rect2df_aspect(src), (rot), (panscan))
+
+PL_API_END
+
+#endif // LIBPLACEBO_COMMON_H_
diff --git a/src/include/libplacebo/config.h.in b/src/include/libplacebo/config.h.in
new file mode 100644
index 0000000..2ed6290
--- /dev/null
+++ b/src/include/libplacebo/config.h.in
@@ -0,0 +1,102 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_CONFIG_H_
+#define LIBPLACEBO_CONFIG_H_
+
+// Increased any time the library changes in a fundamental/major way.
+#define PL_MAJOR_VER @majorver@
+
+// Increased any time the API changes. (Note: Does not reset when PL_MAJOR_VER
+// is increased)
+#define PL_API_VER @apiver@
+
+// Increased any time a fix is made to a given API version.
+#define PL_FIX_VER (pl_fix_ver())
+
+// Friendly name (`git describe`) for the overall version of the library
+#define PL_VERSION (pl_version())
+
+// Feature tests. These aren't described in further detail, but may be useful
+// for programmers wanting to programmatically check for feature support
+// in their compiled libplacebo versions.
+@extra_defs@
+
+// Extra compiler-specific stuff
+#ifndef PL_DEPRECATED
+# if defined(_MSC_VER)
+# define PL_DEPRECATED
+# else
+# define PL_DEPRECATED __attribute__((deprecated))
+# endif
+#endif
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#ifndef PL_DEPRECATED_ENUMERATOR
+# if (defined(__GNUC__) && (__GNUC__ >= 6)) || __has_feature(enumerator_attributes)
+# define PL_DEPRECATED_ENUMERATOR PL_DEPRECATED
+# else
+# define PL_DEPRECATED_ENUMERATOR
+# endif
+#endif
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+# ifdef PL_EXPORT
+# define PL_API __declspec(dllexport)
+# else
+# ifndef PL_STATIC
+# define PL_API __declspec(dllimport)
+# else
+# define PL_API
+# endif
+# endif
+#else
+# define PL_API __attribute__ ((visibility ("default")))
+#endif
+
+// C++ compatibility
+#ifdef __cplusplus
+# define PL_API_BEGIN extern "C" {
+# define PL_API_END }
+#else
+# define PL_API_BEGIN
+# define PL_API_END
+#endif
+
+#ifndef __cplusplus
+// Disable this warning because libplacebo's params macros override fields
+# pragma GCC diagnostic ignored "-Woverride-init"
+#endif
+
+// Extra helper macros
+#define PL_TOSTRING_INNER(x) #x
+#define PL_TOSTRING(x) PL_TOSTRING_INNER(x)
+
+// Deprecated macro for back-compatibility
+#define PL_STRUCT(name) struct name##_t
+
+PL_API_BEGIN
+
+PL_API int pl_fix_ver(void);
+PL_API const char *pl_version(void);
+
+PL_API_END
+
+#endif // LIBPLACEBO_CONFIG_H_
diff --git a/src/include/libplacebo/d3d11.h b/src/include/libplacebo/d3d11.h
new file mode 100644
index 0000000..8ecba30
--- /dev/null
+++ b/src/include/libplacebo/d3d11.h
@@ -0,0 +1,248 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_D3D11_H_
+#define LIBPLACEBO_D3D11_H_
+
+#include <windows.h>
+#include <d3d11.h>
+#include <dxgi1_2.h>
+#include <libplacebo/gpu.h>
+#include <libplacebo/swapchain.h>
+
+PL_API_BEGIN
+
+// Structure representing the actual D3D11 device and associated GPU instance
+typedef const struct pl_d3d11_t {
+ pl_gpu gpu;
+
+ // The D3D11 device in use. The user is free to use this for their own
+ // purposes, including taking a reference to the device (with AddRef) and
+ // using it beyond the lifetime of the pl_d3d11 that created it (though if
+ // this is done with debug enabled, it will confuse the leak checker.)
+ ID3D11Device *device;
+
+ // True if the device is using a software (WARP) adapter
+ bool software;
+} *pl_d3d11;
+
+struct pl_d3d11_params {
+ // The Direct3D 11 device to use. Optional, if NULL then libplacebo will
+ // create its own ID3D11Device using the options below. If set, all the
+ // options below will be ignored.
+ ID3D11Device *device;
+
+ // --- Adapter selection options
+
+ // The adapter to use. This overrides adapter_luid.
+ IDXGIAdapter *adapter;
+
+ // The LUID of the adapter to use. If adapter and adapter_luid are unset,
+ // the default adapter will be used instead.
+ LUID adapter_luid;
+
+ // Allow a software (WARP) adapter when selecting the adapter automatically.
+ // Note that sometimes the default adapter will be a software adapter. This
+ // is because, on Windows 8 and up, if there are no hardware adapters,
+ // Windows will pretend the WARP adapter is the default hardware adapter.
+ bool allow_software;
+
+ // Always use a software adapter. This is mainly for testing purposes.
+ bool force_software;
+
+ // --- Device creation options
+
+ // Enable the debug layer (D3D11_CREATE_DEVICE_DEBUG)
+ // Also logs IDXGIInfoQueue messages
+ bool debug;
+
+ // Extra flags to pass to D3D11CreateDevice (D3D11_CREATE_DEVICE_FLAG).
+ // libplacebo should be compatible with any flags passed here.
+ UINT flags;
+
+ // The minimum and maximum allowable feature levels for the created device.
+ // libplacebo will attempt to create a device with the highest feature level
+ // between min_feature_level and max_feature_level (inclusive.) If there are
+ // no supported feature levels in this range, `pl_d3d11_create` will either
+ // return NULL or fall back to the software adapter, depending on whether
+ // `allow_software` is set.
+ //
+ // Normally there is no reason to set `max_feature_level` other than to test
+ // if a program works at lower feature levels.
+ //
+ // Note that D3D_FEATURE_LEVEL_9_3 and below (known as 10level9) are highly
+ // restrictive. These feature levels are supported on a best-effort basis.
+ // They represent very old DirectX 9 compatible PC and laptop hardware
+ // (2001-2007, GeForce FX, 6, 7, ATI R300-R500, GMA 950-X3000) and some
+ // less-old mobile devices (Surface RT, Surface 2.) Basic video rendering
+ // should work, but the full pl_gpu API will not be available and advanced
+ // shaders will probably fail. The hardware is probably too slow for these
+ // anyway.
+ //
+ // Known restrictions of 10level9 devices include:
+ // D3D_FEATURE_LEVEL_9_3 and below:
+ // - `pl_pass_run_params->index_buf` will not work (but `index_data` will)
+ // - Dimensions of 3D textures must be powers of two
+ // - Shaders cannot use gl_FragCoord
+ // - Shaders cannot use texelFetch
+ // D3D_FEATURE_LEVEL_9_2 and below:
+ // - Fragment shaders have no dynamic flow control and very strict limits
+ // on the number of constants, temporary registers and instructions.
+ // Whether a shader meets the requirements will depend on how it's
+ // compiled and optimized, but it's likely that only simple shaders will
+ // work.
+ // D3D_FEATURE_LEVEL_9_1:
+ // - No high-bit-depth formats with PL_FMT_CAP_RENDERABLE or
+ // PL_FMT_CAP_LINEAR
+ //
+ // If these restrictions are undesirable and you don't need to support
+ // ancient hardware, set `min_feature_level` to D3D_FEATURE_LEVEL_10_0.
+ int min_feature_level; // Defaults to D3D_FEATURE_LEVEL_9_1 if unset
+ int max_feature_level; // Defaults to D3D_FEATURE_LEVEL_12_1 if unset
+
+ // Allow up to N in-flight frames. Similar to swapchain_depth for Vulkan and
+ // OpenGL, though with DXGI this is a device-wide setting that affects all
+ // swapchains (except for waitable swapchains.) See the documentation for
+ // `pl_swapchain_latency` for more information.
+ int max_frame_latency;
+};
+
+// Default/recommended parameters. Should generally be safe and efficient.
+#define PL_D3D11_DEFAULTS \
+ .allow_software = true,
+
+#define pl_d3d11_params(...) (&(struct pl_d3d11_params) { PL_D3D11_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_d3d11_params pl_d3d11_default_params;
+
+// Creates a new Direct3D 11 device based on the given parameters, or wraps an
+// existing device, and initializes a new GPU instance. If params is left as
+// NULL, it defaults to &pl_d3d11_default_params. If an existing device is
+// provided in params->device, `pl_d3d11_create` will take a reference to it
+// that will be released in `pl_d3d11_destroy`.
+PL_API pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params);
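+
+// Illustrative sketch (not part of the API), assuming a caller-provided
+// `pl_log log`:
+//
+//   pl_d3d11 d3d11 = pl_d3d11_create(log, pl_d3d11_params(
+//       .debug             = true,
+//       .min_feature_level = D3D_FEATURE_LEVEL_10_0, // skip 10level9 support
+//   ));
+//   if (!d3d11)
+//       /* handle device creation failure */;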
+
+// Release the D3D11 device.
+//
+// Note that all libplacebo objects allocated from this pl_d3d11 object (e.g.
+// via `d3d11->gpu` or using `pl_d3d11_create_swapchain`) *must* be explicitly
+// destroyed by the user before calling this.
+PL_API void pl_d3d11_destroy(pl_d3d11 *d3d11);
+
+// For a `pl_gpu` backed by `pl_d3d11`, this function can be used to retrieve
+// the underlying `pl_d3d11`. Returns NULL for any other type of `gpu`.
+PL_API pl_d3d11 pl_d3d11_get(pl_gpu gpu);
+
+struct pl_d3d11_swapchain_params {
+ // The Direct3D 11 swapchain to wrap. Optional. If NULL, libplacebo will
+ // create its own swapchain using the options below. If set, all the
+ // swapchain creation options will be ignored.
+ //
+ // The provided swapchain must have been created by the same device used
+ // by `gpu` and must not have multisampled backbuffers.
+ IDXGISwapChain *swapchain;
+
+ // --- Swapchain creation options
+
+ // Initial framebuffer width and height. If both width and height are set to
+ // 0 and window is non-NULL, the client area of the window is used instead.
+ // For convenience, if either component would be 0, it is set to 1 instead.
+ // This is because Windows can have 0-sized windows, but not 0-sized
+ // swapchains.
+ int width;
+ int height;
+
+ // The handle of the output window. In Windows 8 and up this is optional
+ // because you can output to a CoreWindow or create a composition swapchain
+ // instead.
+ HWND window;
+
+ // A pointer to the CoreWindow to output to. If both this and `window` are
+ // NULL, CreateSwapChainForComposition will be used to create the swapchain.
+ IUnknown *core_window;
+
+ // If set, libplacebo will create a swapchain that uses the legacy bitblt
+ // presentation model (with the DXGI_SWAP_EFFECT_DISCARD swap effect.) This
+ // tends to give worse performance and frame pacing in windowed mode and it
+ // prevents borderless fullscreen optimizations, but it might be necessary
+ // to work around buggy drivers, especially with DXGI 1.2 in the Platform
+ // Update for Windows 7. When unset, libplacebo will try to use the flip
+ // presentation model and only fall back to bitblt if flip is unavailable.
+ bool blit;
+
+ // Additional swapchain flags. No validation is performed on these flags,
+ // and swapchain creation may fail if an unsupported combination is
+ // requested.
+ UINT flags;
+
+ // --- Swapchain usage behavior options
+
+ // Disable using a 10-bit swapchain format for SDR output
+ bool disable_10bit_sdr;
+};
+
+#define pl_d3d11_swapchain_params(...) (&(struct pl_d3d11_swapchain_params) { __VA_ARGS__ })
+
+// Creates a new Direct3D 11 swapchain, or wraps an existing one. If an existing
+// swapchain is provided in params->swapchain, `pl_d3d11_create_swapchain` will
+// take a reference to it that will be released in `pl_swapchain_destroy`.
+PL_API pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11,
+ const struct pl_d3d11_swapchain_params *params);
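+
+// Illustrative sketch (not part of the API), assuming a caller-provided
+// window handle `hwnd`:
+//
+//   pl_swapchain sw = pl_d3d11_create_swapchain(d3d11,
+//       pl_d3d11_swapchain_params( .window = hwnd ));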
+
+// Takes a `pl_swapchain` created by pl_d3d11_create_swapchain and returns a
+// reference to the underlying IDXGISwapChain. This increments the refcount, so
+// call IDXGISwapChain::Release when finished with it.
+PL_API IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw);
+
+struct pl_d3d11_wrap_params {
+ // The D3D11 texture to wrap, or a texture array containing the texture to
+ // wrap. Must be a ID3D11Texture1D, ID3D11Texture2D or ID3D11Texture3D
+ // created by the same device used by `gpu`, must have D3D11_USAGE_DEFAULT,
+ // and must not be mipmapped or multisampled.
+ ID3D11Resource *tex;
+
+ // If tex is a texture array, this is the array member to use as the pl_tex.
+ int array_slice;
+
+ // If tex is a video resource (e.g. DXGI_FORMAT_AYUV, DXGI_FORMAT_NV12,
+ // DXGI_FORMAT_P010, etc.), it can be wrapped as a pl_tex by specifying the
+ // type and size of the shader view. For planar video formats, the plane
+ // that is wrapped depends on the chosen format.
+ //
+ // If tex is not a video resource, these fields are unnecessary. The correct
+ // format will be determined automatically. If tex is not 2D, these fields
+ // are ignored.
+ //
+ // For a list of supported video formats and their corresponding view
+ // formats and sizes, see:
+ // https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#VideoViews
+ DXGI_FORMAT fmt;
+ int w;
+ int h;
+};
+
+#define pl_d3d11_wrap_params(...) (&(struct pl_d3d11_wrap_params) { __VA_ARGS__ })
+
+// Wraps an external texture into a pl_tex abstraction. `pl_d3d11_wrap` takes a
+// reference to the texture, which is released when `pl_tex_destroy` is called.
+//
+// This function may fail due to incompatible formats, incompatible flags or
+// other reasons, in which case it will return NULL.
+PL_API pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params);
+
+PL_API_END
+
+#endif // LIBPLACEBO_D3D11_H_
diff --git a/src/include/libplacebo/dispatch.h b/src/include/libplacebo/dispatch.h
new file mode 100644
index 0000000..7d43794
--- /dev/null
+++ b/src/include/libplacebo/dispatch.h
@@ -0,0 +1,239 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_DISPATCH_H_
+#define LIBPLACEBO_DISPATCH_H_
+
+#include <libplacebo/shaders.h>
+#include <libplacebo/gpu.h>
+
+PL_API_BEGIN
+
+// Thread-safety: Safe
+typedef struct pl_dispatch_t *pl_dispatch;
+
+// Creates a new shader dispatch object. This object provides a translation
+// layer between generated shaders (pl_shader) and the ra context such that it
+// can be used to execute shaders. This dispatch object will also provide
+// shader caching (for efficient re-use).
+PL_API pl_dispatch pl_dispatch_create(pl_log log, pl_gpu gpu);
+PL_API void pl_dispatch_destroy(pl_dispatch *dp);
+
+// Resets/increments the internal counters of the pl_dispatch. This must be
+// called whenever the user is going to begin with a new frame, in order to
+// perform garbage collection and advance the state of the internal PRNG.
+//
+// Note that shaders generated by `pl_dispatch` are therefore entirely
+// deterministic, as long as the sequence of calls (and the inputs to the
+// shaders) is the same.
+PL_API void pl_dispatch_reset_frame(pl_dispatch dp);
+
+// Returns a blank pl_shader object, suitable for recording rendering commands.
+// For more information, see the header documentation in `shaders/*.h`.
+PL_API pl_shader pl_dispatch_begin(pl_dispatch dp);
+
+// Struct passed to `info_callback`. Only valid until that function returns.
+struct pl_dispatch_info {
+ // Information about the shader for this shader execution, as well as a
+ // 64-bit signature uniquely identifying it.
+ pl_shader_info shader;
+ uint64_t signature;
+
+ // A list of execution times for this pass, in nanoseconds. May be empty.
+ uint64_t samples[256];
+ int num_samples;
+
+ // As a convenience, this contains the last, average and peak of the above
+ // list of samples. If `num_samples` is 0, these values are also 0.
+ uint64_t last;
+ uint64_t peak;
+ uint64_t average;
+};
+
+// Helper function to make a copy of `pl_dispatch_info`, while overriding
+// (and dereferencing) whatever was previously stored there.
+static inline void pl_dispatch_info_move(struct pl_dispatch_info *dst,
+ const struct pl_dispatch_info *src)
+{
+ pl_shader_info_deref(&dst->shader);
+ *dst = *src;
+ dst->shader = pl_shader_info_ref(src->shader);
+}
+
+// Set up a dispatch callback for this `pl_dispatch` object. The given callback
+// will be run for every successfully dispatched shader. Call this again with
+// `cb == NULL` to disable.
+PL_API void pl_dispatch_callback(pl_dispatch dp, void *priv,
+ void (*cb)(void *priv,
+ const struct pl_dispatch_info *));
+
+struct pl_dispatch_params {
+ // The shader to execute. The pl_dispatch will take over ownership
+ // of this shader, and return it back to the internal pool.
+ //
+ // This shader must have a compatible signature, i.e. inputs
+ // `PL_SHADER_SIG_NONE` and outputs `PL_SHADER_SIG_COLOR`.
+ pl_shader *shader;
+
+ // The texture to render to. This must have params compatible with the
+ // shader, i.e. `target->params.renderable` for fragment shaders and
+ // `target->params.storable` for compute shaders.
+ //
+ // Note: Even when not using compute shaders, users are advised to always
+ // set `target->params.storable` if permitted by the `pl_fmt`, since this
+ // allows the use of compute shaders instead of full-screen quads, which is
+ // faster on some platforms.
+ pl_tex target;
+
+ // The target rect to render to. Optional, if left as {0}, then the
+ // entire texture will be rendered to.
+ pl_rect2d rect;
+
+ // If set, enables and controls the blending for this pass. Optional. When
+ // using this with fragment shaders, `target->params.fmt->caps` must
+ // include `PL_FMT_CAP_BLENDABLE`.
+ const struct pl_blend_params *blend_params;
+
+ // If set, records the execution time of this dispatch into the given
+ // timer object. Optional.
+ //
+ // Note: If this is set, `pl_dispatch` cannot internally measure the
+ // execution time of the shader, which means `pl_dispatch_info.samples` may
+ // be empty as a result.
+ pl_timer timer;
+};
+
+#define pl_dispatch_params(...) (&(struct pl_dispatch_params) { __VA_ARGS__ })
+
+// Dispatch a generated shader (via the pl_shader mechanism). Returns whether
+// or not the dispatch was successful.
+PL_API bool pl_dispatch_finish(pl_dispatch dp, const struct pl_dispatch_params *params);
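+
+// Illustrative sketch (not part of the API): typical per-frame usage, with
+// `target_tex` being a caller-provided renderable `pl_tex`:
+//
+//   pl_shader sh = pl_dispatch_begin(dp);
+//   // ... record shader code into `sh` using the pl_shader_* helpers ...
+//   bool ok = pl_dispatch_finish(dp, pl_dispatch_params(
+//       .shader = &sh,
+//       .target = target_tex,
+//   ));
+//   pl_dispatch_reset_frame(dp); // once per frame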
+
+struct pl_dispatch_compute_params {
+ // The shader to execute. This must be a compute shader with the input
+ // set to PL_SHADER_SIG_NONE. The output, if it has any, is ignored.
+ pl_shader *shader;
+
+ // The number of work groups to dispatch in each dimension. If this is left
+ // as {0} and `width/height` are both set, the number of work groups will
+ // be inferred from the shader's `compute_group_sizes`.
+ int dispatch_size[3];
+
+ // If set, simulate vertex attributes (similar to `pl_dispatch_finish`)
+ // according to the given dimensions. The first two components of the
+ // thread's global ID will be interpreted as the X and Y locations.
+ //
+ // Optional, ignored if either component is left as 0.
+ int width, height;
+
+ // If set, records the execution time of this dispatch into the given
+ // timer object. Optional.
+ //
+ // Note: If this is set, `pl_dispatch` cannot internally measure the
+ // execution time of the shader, which means `pl_dispatch_info.samples` may
+ // be empty as a result.
+ pl_timer timer;
+};
+
+#define pl_dispatch_compute_params(...) (&(struct pl_dispatch_compute_params) { __VA_ARGS__ })
+
+// A variant of `pl_dispatch_finish`, this one only dispatches a compute shader
+// while ignoring its output (if it has one). It's only useful for shaders
+// which have otherwise observable side effects (such as updating state
+// objects).
+PL_API bool pl_dispatch_compute(pl_dispatch dp, const struct pl_dispatch_compute_params *params);
+
+enum pl_vertex_coords {
+ PL_COORDS_ABSOLUTE, // Absolute/integer `target` coordinates
+ PL_COORDS_RELATIVE, // Relative `target` coordinates in range [0, 1]
+ PL_COORDS_NORMALIZED, // GL-normalized coordinates in range [-1, 1]
+};
+
+struct pl_dispatch_vertex_params {
+ // The shader to execute. This must be a raster shader with the input set
+ // to `PL_SHADER_SIG_NONE` and the output set to `PL_SHADER_SIG_COLOR`.
+ //
+ // Additionally, the shader must not have any attached vertex attributes.
+ pl_shader *shader;
+
+ // The texture to render to. Requires `target->params.renderable`.
+ pl_tex target;
+
+ // The target rect to clip the rendering to. (Optional)
+ pl_rect2d scissors;
+
+ // If set, enables and controls the blending for this pass. Optional. When
+ // enabled, `target->params.fmt->caps` must include `PL_FMT_CAP_BLENDABLE`.
+ const struct pl_blend_params *blend_params;
+
+ // The description of the vertex format, including offsets.
+ //
+ // Note: `location` is ignored and can safely be left unset.
+ const struct pl_vertex_attrib *vertex_attribs;
+ int num_vertex_attribs;
+ size_t vertex_stride;
+
+ // The index of the vertex position in `vertex_attribs`, as well as the
+ // interpretation of its contents.
+ int vertex_position_idx;
+ enum pl_vertex_coords vertex_coords;
+ bool vertex_flipped; // flip all vertex y coordinates
+
+ // Type and number of vertices to render.
+ enum pl_prim_type vertex_type;
+ int vertex_count;
+
+ // Vertex data. See `pl_pass_run_params.vertex_data`.
+ const void *vertex_data;
+ pl_buf vertex_buf;
+ size_t buf_offset;
+
+ // Index data. See `pl_pass_run_params.index_data`. Optional.
+ const void *index_data;
+ enum pl_index_format index_fmt;
+ pl_buf index_buf;
+ size_t index_offset;
+
+ // If set, records the execution time of this dispatch into the given
+ // timer object. Optional.
+ //
+ // Note: If this is set, `pl_dispatch` cannot internally measure the
+ // execution time of the shader, which means `pl_dispatch_info.samples` may
+ // be empty as a result.
+ pl_timer timer;
+};
+
+#define pl_dispatch_vertex_params(...) (&(struct pl_dispatch_vertex_params) { __VA_ARGS__ })
+
+// Dispatch a generated shader using custom vertices, rather than using a quad
+// generated by the dispatch. This allows the use of e.g. custom fragment
+// shaders for things like rendering custom UI elements, or possibly doing
+// advanced things like sampling from a cube map or spherical video.
+PL_API bool pl_dispatch_vertex(pl_dispatch dp, const struct pl_dispatch_vertex_params *params);
+
+// Cancel an active shader without submitting anything. Useful, for example,
+// if the shader was instead merged into a different shader.
+PL_API void pl_dispatch_abort(pl_dispatch dp, pl_shader *sh);
+
+// Deprecated in favor of `pl_cache_save/pl_cache_load` on the `pl_cache`
+// associated with the `pl_gpu` this dispatch is using.
+PL_DEPRECATED PL_API size_t pl_dispatch_save(pl_dispatch dp, uint8_t *out_cache);
+PL_DEPRECATED PL_API void pl_dispatch_load(pl_dispatch dp, const uint8_t *cache);
+
+PL_API_END
+
+#endif // LIBPLACEBO_DISPATCH_H_
diff --git a/src/include/libplacebo/dither.h b/src/include/libplacebo/dither.h
new file mode 100644
index 0000000..84f17c7
--- /dev/null
+++ b/src/include/libplacebo/dither.h
@@ -0,0 +1,82 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_DITHER_H_
+#define LIBPLACEBO_DITHER_H_
+
+#include <libplacebo/common.h>
+
+PL_API_BEGIN
+
+// Generates a deterministic NxN bayer (ordered) dither matrix, storing the
+// result in `data`. `size` must be a power of two. The resulting matrix will
+// be roughly uniformly distributed within the range [0,1).
+PL_API void pl_generate_bayer_matrix(float *data, int size);
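+
+// Illustrative sketch: generating a 16x16 bayer matrix into a caller-provided
+// buffer:
+//
+//   float bayer[16 * 16];
+//   pl_generate_bayer_matrix(bayer, 16); // 16 is a power of two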
+
+// Generates a random NxN blue noise texture, storing the result in `data`.
+// `size` must be a positive power of two no larger than 256. The resulting
+// texture will be roughly uniformly distributed within the range [0,1).
+//
+// Note: This function is very, *very* slow for large sizes. Generating a
+// dither matrix with size 256 can take several seconds on a modern processor.
+PL_API void pl_generate_blue_noise(float *data, int size);
+
+// Defines the border of all error diffusion kernels
+#define PL_EDF_MIN_DX (-2)
+#define PL_EDF_MAX_DX (2)
+#define PL_EDF_MAX_DY (2)
+
+struct pl_error_diffusion_kernel {
+ const char *name; // Short and concise identifier
+ const char *description; // Longer / friendly name
+
+ // The minimum value such that a (y, x) -> (y, x + y * shift) mapping will
+ // make all error-pushing operations affect only the next column (and the
+ // columns after it).
+ //
+ // Higher shift values are significantly more computationally intensive.
+ int shift;
+
+ // The diffusion factor for (y, x) is pattern[y][x - PL_EDF_MIN_DX] / divisor.
+ int pattern[PL_EDF_MAX_DY + 1][PL_EDF_MAX_DX - PL_EDF_MIN_DX + 1];
+ int divisor;
+};
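+
+// For example, the classic Floyd-Steinberg kernel pushes 7/16 of the error to
+// the pixel directly to the right, which in this representation corresponds
+// to pattern[0][1 - PL_EDF_MIN_DX] == 7 with divisor == 16.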
+
+// Algorithms with shift=1:
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_simple;
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_false_fs;
+// Algorithms with shift=2:
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra_lite;
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_floyd_steinberg;
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_atkinson;
+// Algorithms with shift=3, probably too heavy for low end GPUs:
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_jarvis_judice_ninke;
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_stucki;
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_burkes;
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra2;
+PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra3;
+
+// A list of built-in error diffusion kernels, terminated by NULL
+PL_API extern const struct pl_error_diffusion_kernel * const pl_error_diffusion_kernels[];
+PL_API extern const int pl_num_error_diffusion_kernels; // excluding trailing NULL
+
+// Find the error diffusion kernel with the given name, or NULL on failure.
+PL_API const struct pl_error_diffusion_kernel *pl_find_error_diffusion_kernel(const char *name);
+
+PL_API_END
+
+#endif // LIBPLACEBO_DITHER_H_
diff --git a/src/include/libplacebo/dummy.h b/src/include/libplacebo/dummy.h
new file mode 100644
index 0000000..c298438
--- /dev/null
+++ b/src/include/libplacebo/dummy.h
@@ -0,0 +1,131 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_DUMMY_H_
+#define LIBPLACEBO_DUMMY_H_
+
+#include <libplacebo/gpu.h>
+
+PL_API_BEGIN
+
+// The functions in this file allow creating and manipulating "dummy" contexts.
+// A dummy context isn't actually mapped by the GPU, all data exists purely on
+// the CPU. It also isn't capable of compiling or executing any shaders, any
+// attempts to do so will simply fail.
+//
+// The main use case for this dummy context is for users who want to generate
+// advanced shaders that depend on specific GLSL features or support for
+// certain types of GPU resources (e.g. LUTs). This dummy context allows such
+// shaders to be generated, with all of the referenced shader objects and
+// textures simply containing their data in a host-accessible way.
+
+struct pl_gpu_dummy_params {
+ // These GPU parameters correspond to their equivalents in `pl_gpu`, and
+ // must obey the same rules as documented there. The values from
+ // `pl_gpu_dummy_default_params` are set to support pretty much everything
+ // and are set for GLSL version 450.
+ //
+ // Individual fields such as `glsl.compute` or `glsl.version` can and
+ // should be overridden by the user based on their requirements.
+ // Individual limits should ideally be set based on the corresponding
+ // `glGet` queries etc.
+ struct pl_glsl_version glsl;
+ struct pl_gpu_limits limits;
+};
+
+#define PL_GPU_DUMMY_DEFAULTS \
+ .glsl = { \
+ .version = 450, \
+ .gles = false, \
+ .vulkan = false, \
+ .compute = true, \
+ .max_shmem_size = SIZE_MAX, \
+ .max_group_threads = 1024, \
+ .max_group_size = { 1024, 1024, 1024 }, \
+ .subgroup_size = 32, \
+ .min_gather_offset = INT16_MIN, \
+ .max_gather_offset = INT16_MAX, \
+ }, \
+ .limits = { \
+ /* pl_gpu */ \
+ .callbacks = false, \
+ .thread_safe = true, \
+ /* pl_buf */ \
+ .max_buf_size = SIZE_MAX, \
+ .max_ubo_size = SIZE_MAX, \
+ .max_ssbo_size = SIZE_MAX, \
+ .max_vbo_size = SIZE_MAX, \
+ .max_mapped_size = SIZE_MAX, \
+ .max_buffer_texels = UINT64_MAX, \
+ /* pl_tex */ \
+ .max_tex_1d_dim = UINT32_MAX, \
+ .max_tex_2d_dim = UINT32_MAX, \
+ .max_tex_3d_dim = UINT32_MAX, \
+ .buf_transfer = true, \
+ .align_tex_xfer_pitch = 1, \
+ .align_tex_xfer_offset = 1, \
+ /* pl_pass */ \
+ .max_variable_comps = SIZE_MAX, \
+ .max_constants = SIZE_MAX, \
+ .max_pushc_size = SIZE_MAX, \
+ .max_dispatch = { UINT32_MAX, UINT32_MAX, UINT32_MAX }, \
+ .fragment_queues = 0, \
+ .compute_queues = 0, \
+ },
+
+#define pl_gpu_dummy_params(...) (&(struct pl_gpu_dummy_params) { PL_GPU_DUMMY_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_gpu_dummy_params pl_gpu_dummy_default_params;
+
+// Create a dummy GPU context based on the given parameters. This GPU will have
+// a format for each host-representable type (i.e. intN_t, floats and doubles),
+// in the canonical channel order RGBA. These formats will have every possible
+// capability activated, respectively.
+//
+// If `params` is left as NULL, it defaults to `&pl_gpu_dummy_default_params`.
+PL_API pl_gpu pl_gpu_dummy_create(pl_log log, const struct pl_gpu_dummy_params *params);
+PL_API void pl_gpu_dummy_destroy(pl_gpu *gpu);
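+
+// Illustrative sketch (not part of the API), assuming a caller-provided
+// `pl_log log`:
+//
+//   pl_gpu gpu = pl_gpu_dummy_create(log, NULL); // default parameters
+//   // ... generate shaders against `gpu` ...
+//   pl_gpu_dummy_destroy(&gpu);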
+
+// Back-doors into the `pl_tex` and `pl_buf` representations. These allow you
+// to access the raw data backing this object. Textures are always laid out in
+// a tightly packed manner.
+//
+// For "placeholder" dummy textures, this always returns NULL.
+PL_API uint8_t *pl_buf_dummy_data(pl_buf buf);
+PL_API uint8_t *pl_tex_dummy_data(pl_tex tex);
+
+// Skeleton of `pl_tex_params` containing only the fields relevant to
+// `pl_tex_dummy_create`, plus the extra `sampler_type` field.
+struct pl_tex_dummy_params {
+ int w, h, d;
+ pl_fmt format;
+ enum pl_sampler_type sampler_type;
+ void *user_data;
+};
+
+#define pl_tex_dummy_params(...) (&(struct pl_tex_dummy_params) { __VA_ARGS__ })
+
+// Allows creating a "placeholder" dummy texture. This is basically a texture
+// that isn't even backed by anything. All `pl_tex_*` operations (other than
+// `pl_tex_destroy`) performed on it will simply fail.
+//
+// All of the permissions will be set to `false`, except `sampleable`, which is
+// set to `true`. (So you can use it as an input to shader sampling functions)
+PL_API pl_tex pl_tex_dummy_create(pl_gpu gpu, const struct pl_tex_dummy_params *params);
+
+PL_API_END
+
+#endif // LIBPLACEBO_DUMMY_H_
diff --git a/src/include/libplacebo/filters.h b/src/include/libplacebo/filters.h
new file mode 100644
index 0000000..a95649d
--- /dev/null
+++ b/src/include/libplacebo/filters.h
@@ -0,0 +1,415 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_FILTER_KERNELS_H_
+#define LIBPLACEBO_FILTER_KERNELS_H_
+
+#include <stdbool.h>
+#include <libplacebo/log.h>
+
+PL_API_BEGIN
+
+#define PL_FILTER_MAX_PARAMS 2
+
+// Invocation parameters for a given kernel
+struct pl_filter_ctx {
+ float radius;
+ float params[PL_FILTER_MAX_PARAMS];
+};
+
+// Represents a single filter function, i.e. kernel or windowing function.
+struct pl_filter_function {
+ // The cosmetic name associated with this filter function.
+ const char *name;
+
+ // The radius of the filter function. For resizable filters, this gives
+ // the radius needed to represent a single filter lobe (tap).
+ float radius;
+
+ // If true, the filter function is resizable (see pl_filter_config.radius)
+ bool resizable;
+
+ // If true, the filter function is tunable (see pl_filter_config.params)
+ bool tunable[PL_FILTER_MAX_PARAMS];
+
+ // If the relevant parameter is tunable, this contains the default values.
+ float params[PL_FILTER_MAX_PARAMS];
+
+ // The underlying filter function itself: Computes the weight as a function
+ // of the offset. All filter functions must be normalized such that x=0 is
+ // the center point, and in particular weight(0) = 1.0. The functions may
+ // be undefined for values of x outside [0, radius].
+ double (*weight)(const struct pl_filter_ctx *f, double x);
+
+ // If true, this filter represents an opaque placeholder for a more
+ // sophisticated filter function which does not fit into the pl_filter
+ // framework. `weight()` will always return 0.0.
+ bool opaque;
+};
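+
+// Illustrative sketch (not part of the API): a minimal custom filter function
+// obeying the normalization rule above (hypothetical, equivalent to the
+// built-in triangle filter):
+//
+//   static double my_weight(const struct pl_filter_ctx *f, double x)
+//   {
+//       return 1.0 - x / f->radius; // weight(0) == 1.0, weight(radius) == 0.0
+//   }
+//
+//   static const struct pl_filter_function my_triangle = {
+//       .name   = "my_triangle",
+//       .radius = 1.0,
+//       .weight = my_weight,
+//   };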
+
+// Deprecated function, merely checks a->weight == b->weight
+PL_DEPRECATED PL_API bool
+pl_filter_function_eq(const struct pl_filter_function *a,
+ const struct pl_filter_function *b);
+
+// Box filter: Entirely 1.0 within the radius, entirely 0.0 outside of it.
+// This is also sometimes called a Dirichlet window
+PL_API extern const struct pl_filter_function pl_filter_function_box;
+
+// Triangle filter: Linear transitions from 1.0 at x=0 to 0.0 at x=radius.
+// This is also sometimes called a Bartlett window.
+PL_API extern const struct pl_filter_function pl_filter_function_triangle;
+
+// Cosine filter: Ordinary cosine function, single lobe.
+PL_API extern const struct pl_filter_function pl_filter_function_cosine;
+
+// Hann function: Cosine filter named after Julius von Hann. Also commonly
+// mislabeled as a "Hanning" function, due to its similarly to the Hamming
+// function.
+PL_API extern const struct pl_filter_function pl_filter_function_hann;
+
+// Hamming function: Cosine filter named after Richard Hamming.
+PL_API extern const struct pl_filter_function pl_filter_function_hamming;
+
+// Welch filter: Polynomial function consisting of a single parabolic section.
+PL_API extern const struct pl_filter_function pl_filter_function_welch;
+
+// Kaiser filter: Approximation of the DPSS window using Bessel functions.
+// Also sometimes called a Kaiser-Bessel window.
+// Parameter [0]: Shape (alpha). Determines the trade-off between the main lobe
+// and the side lobes.
+PL_API extern const struct pl_filter_function pl_filter_function_kaiser;
+
+// Blackman filter: Cosine filter named after Ralph Beebe Blackman.
+// Parameter [0]: Scale (alpha). Influences the shape. The defaults result in
+// zeros at the third and fourth sidelobes.
+PL_API extern const struct pl_filter_function pl_filter_function_blackman;
+
+// Bohman filter: 2nd order Cosine filter.
+PL_API extern const struct pl_filter_function pl_filter_function_bohman;
+
+// Gaussian function: Similar to the Gaussian distribution, this defines a
+// bell curve function.
+// Parameter [0]: Scale (t), increasing makes the result blurrier.
+PL_API extern const struct pl_filter_function pl_filter_function_gaussian;
+
+// Quadratic function: 2nd order approximation of the gaussian function. Also
+// sometimes called a "quadric" window.
+PL_API extern const struct pl_filter_function pl_filter_function_quadratic;
+
+// Sinc function: Widely used for both kernels and windows, sinc(x) = sin(x)/x.
+PL_API extern const struct pl_filter_function pl_filter_function_sinc;
+
+// Jinc function: Similar to sinc, but extended to the 2D domain. Widely
+// used as the kernel of polar (EWA) filters. Also sometimes called a Sombrero
+// function.
+PL_API extern const struct pl_filter_function pl_filter_function_jinc;
+
+// Sphinx function: Similar to sinc and jinc, but extended to the 3D domain.
+// The name is derived from "spherical" sinc. Can be used to filter 3D signals
+// in theory.
+PL_API extern const struct pl_filter_function pl_filter_function_sphinx;
+
+// B/C-tunable Spline function: This is a family of commonly used spline
+// functions with two tunable parameters. Does not need to be windowed.
+// Parameter [0]: "B"
+// Parameter [1]: "C"
+// Some popular variants of this function are:
+// B = 1.0, C = 0.0: "base" Cubic (blurry)
+// B = 0.0, C = 0.0: Hermite filter (blocky)
+// B = 0.0, C = 0.5: Catmull-Rom filter (sharp)
+// B = 1/3, C = 1/3: Mitchell-Netravali filter (soft, doesn't ring)
+// B ≈ 0.37, C ≈ 0.31: Robidoux filter (used by ImageMagick)
+// B ≈ 0.26, C ≈ 0.37: RobidouxSharp filter (sharper variant of Robidoux)
+PL_API extern const struct pl_filter_function pl_filter_function_cubic;
+PL_API extern const struct pl_filter_function pl_filter_function_hermite;
+#define pl_filter_function_bicubic pl_filter_function_cubic
+#define pl_filter_function_bcspline pl_filter_function_cubic
+
+// Cubic splines with 2/3/4 taps. Referred to as "spline16", "spline36", and
+// "spline64" mainly for historical reasons, based on the number of pixels in
+// their window when using them as 2D orthogonal filters. Do not need to be
+// windowed.
+PL_API extern const struct pl_filter_function pl_filter_function_spline16;
+PL_API extern const struct pl_filter_function pl_filter_function_spline36;
+PL_API extern const struct pl_filter_function pl_filter_function_spline64;
+
+// Special filter function for the built-in oversampling algorithm. This is an
+// opaque filter with no meaningful representation. though it has one tunable
+// parameter controlling the threshold at which to switch back to ordinary
+// nearest neighbour sampling. (See `pl_shader_sample_oversample`)
+PL_API extern const struct pl_filter_function pl_filter_function_oversample;
+
+// A list of built-in filter functions, terminated by NULL
+//
+// Note: May contain extra aliases for the above functions.
+PL_API extern const struct pl_filter_function * const pl_filter_functions[];
+PL_API extern const int pl_num_filter_functions; // excluding trailing NULL
+
+// Find the filter function with the given name, or NULL on failure.
+PL_API const struct pl_filter_function *pl_find_filter_function(const char *name);
+
+// Backwards compatibility with the older configuration API. Redundant with
+// `pl_filter_function.name`. May be formally deprecated in the future.
+
+struct pl_filter_function_preset {
+ const char *name;
+ const struct pl_filter_function *function;
+};
+
+// A list of built-in filter function presets, terminated by {0}
+PL_API extern const struct pl_filter_function_preset pl_filter_function_presets[];
+PL_API extern const int pl_num_filter_function_presets; // excluding trailing {0}
+
+// Find the filter function preset with the given name, or NULL on failure.
+PL_API const struct pl_filter_function_preset *pl_find_filter_function_preset(const char *name);
+
+// Different usage domains for a filter
+enum pl_filter_usage {
+ PL_FILTER_UPSCALING = (1 << 0),
+ PL_FILTER_DOWNSCALING = (1 << 1),
+ PL_FILTER_FRAME_MIXING = (1 << 2),
+
+ PL_FILTER_SCALING = PL_FILTER_UPSCALING | PL_FILTER_DOWNSCALING,
+ PL_FILTER_ALL = PL_FILTER_SCALING | PL_FILTER_FRAME_MIXING,
+};
+
+// Represents a tuned combination of filter functions, plus parameters
+struct pl_filter_config {
+ // The cosmetic name associated with this filter config. Optional for
+ // user-provided configs, but always set by built-in configurations.
+ const char *name;
+
+ // Longer / friendly name. Always set for built-in configurations,
+ // except for names which are merely aliases of other filters.
+ const char *description;
+
+ // Allowed and recommended usage domains (respectively)
+ //
+ // When it is desired to maintain a simpler user interface, it may be
+ // recommended to include only scalers whose recommended usage domains
+ // includes the relevant context in which it will be used.
+ enum pl_filter_usage allowed;
+ enum pl_filter_usage recommended;
+
+ // The kernel function and (optionally) windowing function.
+ const struct pl_filter_function *kernel;
+ const struct pl_filter_function *window;
+
+ // The radius. Ignored if !kernel->resizable. Optional, defaults to
+ // kernel->radius if unset.
+ float radius;
+
+ // Parameters for the respective filter function. Ignored if not tunable.
+ float params[PL_FILTER_MAX_PARAMS];
+ float wparams[PL_FILTER_MAX_PARAMS];
+
+ // Represents a clamping coefficient for negative weights. A value of 0.0
+ // (the default) represents no clamping. A value of 1.0 represents full
+ // clamping, i.e. all negative weights will be clamped to 0. Values in
+ // between will be linearly scaled.
+ float clamp;
+
+ // Additional blur coefficient. This effectively stretches the kernel,
+ // without changing the effective radius of the filter. Setting this
+ // to a value of 0.0 is equivalent to disabling it. Values significantly
+ // below 1.0 may seriously degrade the visual output, and should be used
+ // with care.
+ float blur;
+
+ // Additional taper coefficient. This essentially flattens the function's
+ // center. The values within [-taper, taper] will return 1.0, with the
+ // actual function being squished into the remainder of [taper, radius].
+ // Defaults to 0.0.
+ float taper;
+
+ // If true, this filter is intended to be used as a polar/2D filter (EWA)
+ // instead of a separable/1D filter. Does not affect the actual sampling,
+ // but provides information about how the results are to be interpreted.
+ bool polar;
+
+ // Antiringing strength. A value of 0.0 disables antiringing, and a value
+ // of 1.0 enables full-strength antiringing. Defaults to 0.0 if
+ // unspecified.
+ //
+ // Note: This is only included in `pl_filter_config` for convenience. Does
+ // not affect the actual filter sampling, but provides information to the
+ // downstream consumer of the `pl_filter`.
+ float antiring;
+};
+
+PL_API bool pl_filter_config_eq(const struct pl_filter_config *a,
+ const struct pl_filter_config *b);
+
+// Samples a given filter configuration at a given x coordinate, while
+// respecting all parameters of the configuration.
+PL_API double pl_filter_sample(const struct pl_filter_config *c, double x);
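+
+// Illustrative sketch: evaluating a built-in configuration at a given offset:
+//
+//   double w = pl_filter_sample(&pl_filter_lanczos, 0.5);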
+
+// A list of built-in filter configurations. Since they are just combinations
+// of the above filter functions, they are not described in much further
+// detail.
+PL_API extern const struct pl_filter_config pl_filter_spline16; // 2 taps
+PL_API extern const struct pl_filter_config pl_filter_spline36; // 3 taps
+PL_API extern const struct pl_filter_config pl_filter_spline64; // 4 taps
+PL_API extern const struct pl_filter_config pl_filter_nearest;
+PL_API extern const struct pl_filter_config pl_filter_box;
+PL_API extern const struct pl_filter_config pl_filter_bilinear;
+PL_API extern const struct pl_filter_config pl_filter_gaussian;
+// Sinc family (all configured to 3 taps):
+PL_API extern const struct pl_filter_config pl_filter_sinc; // unwindowed
+PL_API extern const struct pl_filter_config pl_filter_lanczos; // sinc-sinc
+PL_API extern const struct pl_filter_config pl_filter_ginseng; // sinc-jinc
+PL_API extern const struct pl_filter_config pl_filter_ewa_jinc; // unwindowed
+PL_API extern const struct pl_filter_config pl_filter_ewa_lanczos; // jinc-jinc
+PL_API extern const struct pl_filter_config pl_filter_ewa_lanczossharp;
+PL_API extern const struct pl_filter_config pl_filter_ewa_lanczos4sharpest;
+PL_API extern const struct pl_filter_config pl_filter_ewa_ginseng; // jinc-sinc
+PL_API extern const struct pl_filter_config pl_filter_ewa_hann; // jinc-hann
+// Spline family
+PL_API extern const struct pl_filter_config pl_filter_bicubic;
+PL_API extern const struct pl_filter_config pl_filter_hermite;
+PL_API extern const struct pl_filter_config pl_filter_catmull_rom;
+PL_API extern const struct pl_filter_config pl_filter_mitchell;
+PL_API extern const struct pl_filter_config pl_filter_mitchell_clamp; // clamp = 1.0
+PL_API extern const struct pl_filter_config pl_filter_robidoux;
+PL_API extern const struct pl_filter_config pl_filter_robidouxsharp;
+PL_API extern const struct pl_filter_config pl_filter_ewa_robidoux;
+PL_API extern const struct pl_filter_config pl_filter_ewa_robidouxsharp;
+// Special/opaque filters
+PL_API extern const struct pl_filter_config pl_filter_oversample;
+
+// Backwards compatibility
+#define pl_filter_triangle pl_filter_bilinear
+#define pl_oversample_frame_mixer pl_filter_oversample
+
+// A list of built-in filter configs, terminated by NULL
+PL_API extern const struct pl_filter_config * const pl_filter_configs[];
+PL_API extern const int pl_num_filter_configs; // excluding trailing NULL
+
+// Find the filter config with the given name, or NULL on failure.
+// `usage` restricts the valid usage (based on `pl_filter_config.allowed`).
+PL_API const struct pl_filter_config *
+pl_find_filter_config(const char *name, enum pl_filter_usage usage);
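+
+// Illustrative sketch (not part of the API): look up a built-in scaler by
+// name and sample its kernel, assuming the built-in Lanczos config is
+// registered under the name "lanczos":
+//
+//   const struct pl_filter_config *cfg =
+//       pl_find_filter_config("lanczos", PL_FILTER_SCALING);
+//   if (cfg)
+//       printf("weight at x=0.5: %f\n", pl_filter_sample(cfg, 0.5));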
+
+// Backward compatibility with the previous filter configuration API. Redundant
+// with pl_filter_config.name/description. May be deprecated in the future.
+struct pl_filter_preset {
+ const char *name;
+ const struct pl_filter_config *filter;
+
+ // Longer / friendly name, or NULL for aliases
+ const char *description;
+};
+
+// A list of built-in filter presets, terminated by {0}
+PL_API extern const struct pl_filter_preset pl_filter_presets[];
+PL_API extern const int pl_num_filter_presets; // excluding trailing {0}
+
+// Find the filter preset with the given name, or NULL on failure.
+PL_API const struct pl_filter_preset *pl_find_filter_preset(const char *name);
+
+// Parameters for filter generation.
+struct pl_filter_params {
+ // The particular filter configuration to be sampled. config.kernel must
+ // be set to a valid pl_filter_function.
+ struct pl_filter_config config;
+
+ // The precision of the resulting LUT. A value of 64 should be fine for
+ // most practical purposes, but higher or lower values may be justified
+ // depending on the use case. This value must be set to something > 0.
+ int lut_entries;
+
+ // --- Polar filters only (config.polar)
+
+ // As a micro-optimization, all samples below this cutoff value will be
+ // ignored when updating the cutoff radius. Setting it to a value of 0.0
+ // disables this optimization.
+ float cutoff;
+
+ // --- Separable filters only (!config.polar)
+
+ // Indicates the maximum row size that is supported by the calling code, or
+ // 0 for no limit.
+ int max_row_size;
+
+ // Indicates the row stride alignment. For some use cases (e.g. uploading
+ // the weights as a texture), there are certain alignment requirements for
+ // each row. The chosen row_size will always be a multiple of this value.
+ // Specifying 0 indicates no alignment requirements.
+ int row_stride_align;
+
+ // --- Deprecated options
+ float filter_scale PL_DEPRECATED; // no effect, use `config.blur` instead
+};
+
+#define pl_filter_params(...) (&(struct pl_filter_params) { __VA_ARGS__ })
+
+// Represents an initialized instance of a particular filter, with a
+// precomputed LUT. The interpretation of the LUT depends on the type of the
+// filter (polar or separable).
+typedef const struct pl_filter_t {
+ // Deep copy of the parameters, for convenience.
+ struct pl_filter_params params;
+
+ // Contains the true radius of the computed filter. This may be
+ // smaller than the configured radius depending on the exact filter
+ // parameters used. Mainly relevant for polar filters, since
+ // it affects the value range of *weights.
+ float radius;
+
+ // Radius of the first zero crossing (main lobe size).
+ float radius_zero;
+
+ // The computed look-up table (LUT). For polar filters, this is interpreted
+ // as a 1D array with dimensions [lut_entries] containing the raw filter
+ // samples on the scale [0, radius]. For separable (non-polar) filters,
+ // this is interpreted as a 2D array with dimensions
+ // [lut_entries][row_stride]. The inner rows contain the `row_size` samples
+ // to convolve with the corresponding input pixels. The outer coordinate is
+ // used to vary the fractional offset (phase). So for example, if the
+ // sample position to reconstruct is directly aligned with the source
+ // texels, you would use the values from weights[0]. If the sample position
+ // to reconstruct is exactly half-way between two source texels (180° out
+ // of phase), you would use the values from weights[lut_entries/2].
+ const float *weights;
+
+ // --- separable filters only (!params.config.polar)
+
+ // The number of source texels to convolve over for each row. This value
+ // will never exceed the given `max_row_size`. If the filter ends up
+ // cut off because of this, the bool `insufficient` will be set to true.
+ int row_size;
+ bool insufficient;
+
+ // The separation (in *weights) between each row of the filter. Always
+ // a multiple of params.row_stride_align.
+ int row_stride;
+
+ // --- deprecated / removed fields
+ float radius_cutoff PL_DEPRECATED; // identical to `radius`
+} *pl_filter;
+
+// Generate (compute) a filter instance based on a given filter configuration.
+// The resulting pl_filter must be freed with `pl_filter_free` when no longer
+// needed. Returns NULL if filter generation fails due to invalid parameters
+// (i.e. missing a required parameter).
+PL_API pl_filter pl_filter_generate(pl_log log, const struct pl_filter_params *params);
+PL_API void pl_filter_free(pl_filter *filter);
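+
+// Illustrative sketch: generate the LUT for a built-in separable filter and
+// free it again. `log` is assumed to be a valid `pl_log` (or NULL) created
+// elsewhere:
+//
+//   pl_filter flt = pl_filter_generate(log, pl_filter_params(
+//       .config      = pl_filter_lanczos,
+//       .lut_entries = 64,
+//   ));
+//   if (flt) {
+//       // flt->weights is a [lut_entries][row_stride] array of row weights
+//       pl_filter_free(&flt);
+//   }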
+
+PL_API_END
+
+#endif // LIBPLACEBO_FILTER_KERNELS_H_
diff --git a/src/include/libplacebo/gamut_mapping.h b/src/include/libplacebo/gamut_mapping.h
new file mode 100644
index 0000000..a92a73b
--- /dev/null
+++ b/src/include/libplacebo/gamut_mapping.h
@@ -0,0 +1,182 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_GAMUT_MAPPING_H_
+#define LIBPLACEBO_GAMUT_MAPPING_H_
+
+#include <libplacebo/common.h>
+#include <libplacebo/colorspace.h>
+
+PL_API_BEGIN
+
+struct pl_gamut_map_params;
+struct pl_gamut_map_function {
+ const char *name; // Identifier
+ const char *description; // Friendly / longer name
+
+ // The gamut-mapping function itself. Iterates over all values in `lut`,
+ // and adapts them as needed.
+ void (*map)(float *lut, const struct pl_gamut_map_params *params);
+
+ // Returns true if `map` supports both stretching and contracting the
+ // gamut. In this case, `map` is always executed, even if the output gamut
+ // is larger than the input gamut.
+ bool bidirectional;
+
+ // Private data. Unused by libplacebo, but may be accessed by `map`.
+ void *priv;
+};
+
+struct pl_gamut_map_constants {
+ // (Relative) chromaticity protection zone for perceptual mapping [0,1]
+ float perceptual_deadzone;
+
+ // Strength of the perceptual saturation mapping component [0,1]
+ float perceptual_strength;
+
+ // I vs C curve gamma to use for colorimetric clipping [0,10]
+ float colorimetric_gamma;
+
+ // Knee point to use for softclipping methods (perceptual, softclip) [0,1]
+ float softclip_knee;
+
+ // Desaturation strength (for softclip only) [0,1]
+ float softclip_desat;
+};
+
+#define PL_GAMUT_MAP_CONSTANTS \
+ .colorimetric_gamma = 1.80f, \
+ .softclip_knee = 0.70f, \
+ .softclip_desat = 0.35f, \
+ .perceptual_deadzone = 0.30f, \
+ .perceptual_strength = 0.80f,
+
+struct pl_gamut_map_params {
+ // If `function` is NULL, defaults to `pl_gamut_map_clip`.
+ const struct pl_gamut_map_function *function;
+
+ // The desired input/output primaries. This affects the subjective color
+ // volume in which the desired mapping shall take place.
+ struct pl_raw_primaries input_gamut;
+ struct pl_raw_primaries output_gamut;
+
+ // Minimum/maximum luminance (PQ) of the target display. Note that the same
+ // value applies to both the input and output, since it's assumed that tone
+ // mapping has already happened by this stage. This effectively defines the
+ // legal gamut boundary in RGB space.
+ //
+ // This also defines the I channel value range, for `pl_gamut_map_generate`
+ float min_luma;
+ float max_luma;
+
+ // Common constants, should be initialized to PL_GAMUT_MAP_CONSTANTS if
+ // not intending to override them further.
+ struct pl_gamut_map_constants constants;
+
+ // -- LUT generation options (for `pl_gamut_map_generate` only)
+
+ // The size of the resulting LUT, per channel.
+ //
+ // Note: For quality, it's generally best to increase h > I > C
+ int lut_size_I;
+ int lut_size_C;
+ int lut_size_h;
+
+ // The stride (in number of floats) between elements in the resulting LUT.
+ int lut_stride;
+
+ // -- Removed parameters
+ float chroma_margin PL_DEPRECATED; // non-functional
+};
+
+#define pl_gamut_map_params(...) (&(struct pl_gamut_map_params) { \
+ .constants = { PL_GAMUT_MAP_CONSTANTS }, \
+ __VA_ARGS__ \
+})
+
+// Note: Only does pointer equality testing on `function`
+PL_API bool pl_gamut_map_params_equal(const struct pl_gamut_map_params *a,
+ const struct pl_gamut_map_params *b);
+
+// Returns true if the given gamut mapping configuration effectively represents
+// a no-op configuration. Gamut mapping can be skipped in this case.
+PL_API bool pl_gamut_map_params_noop(const struct pl_gamut_map_params *params);
+
+// Generate a gamut-mapping LUT for a given configuration. LUT samples are
+// stored as IPTPQc4 values, but the LUT itself is indexed by IChPQc4, spanning
+// the effective range [min_luma, max_luma] × [0, 0.5] × [-pi, pi].
+//
+// This ordering is designed to keep frequently co-occurring values close in
+// memory, while permitting simple wrapping of the 'h' component.
+PL_API void pl_gamut_map_generate(float *out, const struct pl_gamut_map_params *params);
+
+// Samples a gamut mapping function for a single IPTPQc4 value. The input
+// values are updated in-place.
+PL_API void pl_gamut_map_sample(float x[3], const struct pl_gamut_map_params *params);
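+
+// Illustrative sketch: gamut-map a single IPTPQc4 value with the default
+// clipping function. The primaries helpers are assumed to come from
+// <libplacebo/colorspace.h>; the luminance range and sample values here are
+// placeholders chosen purely for illustration:
+//
+//   const struct pl_gamut_map_params *par = pl_gamut_map_params(
+//       .function     = &pl_gamut_map_clip,
+//       .input_gamut  = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020),
+//       .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709),
+//       .min_luma     = 0.0f,
+//       .max_luma     = 1.0f,
+//   );
+//   float iptpq[3] = { 0.5f, 0.1f, -0.05f };
+//   pl_gamut_map_sample(iptpq, par);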
+
+// Performs no gamut-mapping, just hard clips out-of-range colors per-channel.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_clip;
+
+// Performs a perceptually balanced (saturation) gamut mapping, using a soft
+// knee function to preserve in-gamut colors, followed by a final softclip
+// operation. This works bidirectionally, meaning it can both compress and
+// expand the gamut. Behaves similar to a blend of `saturation` and `softclip`.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_perceptual;
+
+// Performs a perceptually balanced gamut mapping using a soft knee function to
+// roll-off clipped regions, and a hue shifting function to preserve saturation.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_softclip;
+
+// Performs relative colorimetric clipping, while maintaining an exponential
+// relationship between brightness and chromaticity.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_relative;
+
+// Performs simple RGB->RGB saturation mapping. The input R/G/B channels are
+// mapped directly onto the output R/G/B channels. Will never clip, but will
+// distort all hues and/or result in a faded look.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_saturation;
+
+// Performs absolute colorimetric clipping. Like pl_gamut_map_relative, but
+// does not adapt the white point.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_absolute;
+
+// Performs constant-luminance colorimetric clipping, desaturating colors
+// towards white until they're in-range.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_desaturate;
+
+// Uniformly darkens the input slightly to prevent clipping on blown-out
+// highlights, then clamps colorimetrically to the input gamut boundary,
+// biased slightly to preserve chromaticity over luminance.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_darken;
+
+// Performs no gamut mapping, but simply highlights out-of-gamut pixels.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_highlight;
+
+// Linearly/uniformly desaturates the image in order to bring the entire
+// image into the target gamut.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_linear;
+
+// A list of built-in gamut mapping functions, terminated by NULL
+PL_API extern const struct pl_gamut_map_function * const pl_gamut_map_functions[];
+PL_API extern const int pl_num_gamut_map_functions; // excluding trailing NULL
+
+// Find the gamut mapping function with the given name, or NULL on failure.
+PL_API const struct pl_gamut_map_function *pl_find_gamut_map_function(const char *name);
+
+PL_API_END
+
+#endif // LIBPLACEBO_GAMUT_MAPPING_H_
diff --git a/src/include/libplacebo/gpu.h b/src/include/libplacebo/gpu.h
new file mode 100644
index 0000000..a63fdf7
--- /dev/null
+++ b/src/include/libplacebo/gpu.h
@@ -0,0 +1,1464 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_GPU_H_
+#define LIBPLACEBO_GPU_H_
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <libplacebo/common.h>
+#include <libplacebo/cache.h>
+#include <libplacebo/log.h>
+
+PL_API_BEGIN
+
+// These are not memory managed, and should represent compile-time constants
+typedef const char *pl_debug_tag;
+#define PL_DEBUG_TAG (__FILE__ ":" PL_TOSTRING(__LINE__))
+
+// Type of a shader input descriptor.
+enum pl_desc_type {
+ PL_DESC_INVALID = 0,
+ PL_DESC_SAMPLED_TEX, // C: pl_tex* GLSL: combined texture sampler
+ // (`pl_tex->params.sampleable` must be set)
+ PL_DESC_STORAGE_IMG, // C: pl_tex* GLSL: storage image
+ // (`pl_tex->params.storable` must be set)
+ PL_DESC_BUF_UNIFORM, // C: pl_buf* GLSL: uniform buffer
+ // (`pl_buf->params.uniform` must be set)
+ PL_DESC_BUF_STORAGE, // C: pl_buf* GLSL: storage buffer
+ // (`pl_buf->params.storable` must be set)
+ PL_DESC_BUF_TEXEL_UNIFORM,// C: pl_buf* GLSL: uniform samplerBuffer
+ // (`pl_buf->params.uniform` and `format` must be set)
+ PL_DESC_BUF_TEXEL_STORAGE,// C: pl_buf* GLSL: uniform imageBuffer
+ // (`pl_buf->params.uniform` and `format` must be set)
+ PL_DESC_TYPE_COUNT
+};
+
+// This file contains the definition of an API which is designed to abstract
+// away from platform-specific APIs like the various OpenGL variants, Direct3D
+// and Vulkan in a common way. It is a much more limited API than those APIs,
+// since it targets only the small common subset of features needed to
+// implement libplacebo's rendering.
+//
+// NOTE: Most, but not all, parameter conditions (phrases such as "must" or
+// "valid usage") are explicitly tested and result in error messages followed
+// by graceful failure. Exceptions are noted where they exist.
+
+// Structure which wraps metadata describing GLSL capabilities.
+struct pl_glsl_version {
+ int version; // GLSL version (e.g. 450), for #version
+ bool gles; // GLSL ES semantics (ESSL)
+ bool vulkan; // GL_KHR_vulkan_glsl semantics
+
+ // Compute shader support and limits. If `compute` is false, then all
+ // of the remaining fields in this section are {0}.
+ bool compute;
+ size_t max_shmem_size; // maximum compute shader shared memory size
+ uint32_t max_group_threads; // maximum number of local threads per work group
+ uint32_t max_group_size[3]; // maximum work group size per dimension
+
+ // If nonzero, signals availability of shader subgroups. This guarantees
+ // availability of all of the following extensions:
+ // - GL_KHR_shader_subgroup_basic
+ // - GL_KHR_shader_subgroup_vote
+ // - GL_KHR_shader_subgroup_arithmetic
+ // - GL_KHR_shader_subgroup_ballot
+ // - GL_KHR_shader_subgroup_shuffle
+ uint32_t subgroup_size;
+
+ // Miscellaneous shader limits
+ int16_t min_gather_offset; // minimum `textureGatherOffset` offset
+ int16_t max_gather_offset; // maximum `textureGatherOffset` offset
+};
+
+// Backwards compatibility alias
+#define pl_glsl_desc pl_glsl_version
+
+// Structure defining the physical limits and capabilities of this GPU
+// instance. If a limit is given as 0, that means that feature is unsupported.
+struct pl_gpu_limits {
+ // --- pl_gpu
+ bool thread_safe; // `pl_gpu` calls are thread-safe
+ bool callbacks; // supports asynchronous GPU callbacks
+
+ // --- pl_buf
+ size_t max_buf_size; // maximum size of any buffer
+ size_t max_ubo_size; // maximum size of a `uniform` buffer
+ size_t max_ssbo_size; // maximum size of a `storable` buffer
+ size_t max_vbo_size; // maximum size of a `drawable` buffer
+ size_t max_mapped_size; // maximum size of a `host_mapped` buffer
+ uint64_t max_buffer_texels; // maximum number of texels in a texel buffer
+ bool host_cached; // if true, PL_BUF_MEM_HOST buffers are cached
+
+ // Required alignment for PL_HANDLE_HOST_PTR imports. This is provided
+ // merely as a hint to the user. If the host pointer being imported is
+ // misaligned, libplacebo will internally round (over-map) the region.
+ size_t align_host_ptr;
+
+ // --- pl_tex
+ uint32_t max_tex_1d_dim; // maximum width for a 1D texture
+ uint32_t max_tex_2d_dim; // maximum width/height for a 2D texture (required)
+ uint32_t max_tex_3d_dim; // maximum width/height/depth for a 3D texture
+ bool blittable_1d_3d; // supports blittable 1D/3D textures
+ bool buf_transfer; // supports `pl_tex_transfer_params.buf`
+
+ // These don't represent hard limits but indicate performance hints for
+ // optimal alignment. For best performance, the corresponding field
+ // should be aligned to a multiple of these. They will always be a power
+ // of two.
+ size_t align_tex_xfer_pitch; // optimal `pl_tex_transfer_params.row_pitch`
+ size_t align_tex_xfer_offset; // optimal `pl_tex_transfer_params.buf_offset`
+
+ // --- pl_pass
+ size_t max_variable_comps; // maximum components passed in variables
+ size_t max_constants; // maximum `pl_pass_params.num_constants`
+ bool array_size_constants; // push constants can be used to size arrays
+ size_t max_pushc_size; // maximum `push_constants_size`
+ size_t align_vertex_stride; // alignment of `pl_pass_params.vertex_stride`
+ uint32_t max_dispatch[3]; // maximum dispatch size per dimension
+
+ // Note: At least one of `max_variable_comps` or `max_ubo_size` is
+ // guaranteed to be nonzero.
+
+ // As a performance hint, the GPU may signal the number of command queues
+ // it has for fragment and compute shaders, respectively. Users may use
+ // this information to decide the appropriate type of shader to dispatch.
+ uint32_t fragment_queues;
+ uint32_t compute_queues;
+};
+
+// Backwards compatibility aliases
+#define max_xfer_size max_buf_size
+#define align_tex_xfer_stride align_tex_xfer_pitch
+
+// Some `pl_gpu` operations allow sharing GPU resources with external APIs -
+// examples include interop with other graphics APIs such as CUDA, and also
+// various hardware decoding APIs. This defines the mechanism underpinning the
+// communication of such an interoperation.
+typedef uint64_t pl_handle_caps;
+enum pl_handle_type {
+ PL_HANDLE_FD = (1 << 0), // `int fd` for POSIX-style APIs
+ PL_HANDLE_WIN32 = (1 << 1), // `HANDLE` for win32 API
+ PL_HANDLE_WIN32_KMT = (1 << 2), // `HANDLE` for pre-Windows-8 win32 API
+ PL_HANDLE_DMA_BUF = (1 << 3), // 'int fd' for a dma_buf fd
+ PL_HANDLE_HOST_PTR = (1 << 4), // `void *` for a host-allocated pointer
+ PL_HANDLE_MTL_TEX = (1 << 5), // `MTLTexture*` for Apple platforms
+ PL_HANDLE_IOSURFACE = (1 << 6), // `IOSurfaceRef` for Apple platforms
+};
+
+struct pl_gpu_handle_caps {
+ pl_handle_caps tex; // supported handles for `pl_tex` + `pl_shared_mem`
+ pl_handle_caps buf; // supported handles for `pl_buf` + `pl_shared_mem`
+ pl_handle_caps sync; // supported handles for `pl_sync` / semaphores
+};
+
+// Wrapper for the handle used to communicate a shared resource externally.
+// This handle is owned by the `pl_gpu` - if a user wishes to use it in a way
+// that takes over ownership (e.g. importing into some APIs), they must clone
+// the handle before doing so (e.g. using `dup` for fds). It is important to
+// read the external API documentation _very_ carefully as different handle
+// types may be managed in different ways. (eg: CUDA takes ownership of an fd,
+// but does not take ownership of a win32 handle).
+union pl_handle {
+ int fd; // PL_HANDLE_FD / PL_HANDLE_DMA_BUF
+ void *handle; // PL_HANDLE_WIN32 / PL_HANDLE_WIN32_KMT / PL_HANDLE_MTL_TEX / PL_HANDLE_IOSURFACE
+ void *ptr; // PL_HANDLE_HOST_PTR
+};
+
+// Structure encapsulating memory that is shared between libplacebo and the
+// user. This memory can be imported into external APIs using the handle.
+//
+// If the object a `pl_shared_mem` belongs to is destroyed (e.g. via
+// `pl_buf_destroy`), the handle becomes undefined, as do the contents of the
+// memory it points to, as well as any external API objects imported from it.
+struct pl_shared_mem {
+ union pl_handle handle;
+ size_t size; // the total size of the memory referenced by this handle
+ size_t offset; // the offset of the object within the referenced memory
+
+ // Note: `size` is optional for some APIs and handle types, in particular
+ // when importing DMABUFs or D3D11 textures.
+
+ // For PL_HANDLE_DMA_BUF, this specifies the DRM format modifier that
+ // describes this resource. Note that when importing `pl_buf`, this must
+ // be DRM_FORMAT_MOD_LINEAR. For importing `pl_tex`, it can be any
+ // format modifier supported by the implementation.
+ uint64_t drm_format_mod;
+
+ // When importing a `pl_tex` of type PL_HANDLE_DMA_BUF, this can be used to
+ // set the image stride (AKA pitch) in memory. If left as 0, defaults to
+ // the image width/height.
+ size_t stride_w;
+ size_t stride_h;
+
+ // When importing a `pl_tex` of type PL_HANDLE_MTL_TEX, this determines
+ // which plane is imported (0 - 2).
+ unsigned plane;
+};
+
+// Structure grouping PCI bus address fields for GPU devices
+struct pl_gpu_pci_address {
+ uint32_t domain;
+ uint32_t bus;
+ uint32_t device;
+ uint32_t function;
+};
+
+typedef const struct pl_fmt_t *pl_fmt;
+
+// Abstract device context which wraps an underlying graphics context and can
+// be used to dispatch rendering commands.
+//
+// Thread-safety: Depends on `pl_gpu_limits.thread_safe`
+typedef const struct pl_gpu_t {
+ pl_log log;
+
+ struct pl_glsl_version glsl; // GLSL features supported by this GPU
+ struct pl_gpu_limits limits; // physical device limits and capabilities
+
+ // Fields relevant to external API interop. If the underlying device does
+ // not support interop with other APIs, these will all be {0}.
+ struct pl_gpu_handle_caps export_caps; // supported handles for exporting
+ struct pl_gpu_handle_caps import_caps; // supported handles for importing
+ uint8_t uuid[16]; // underlying device UUID
+
+ // Supported texture formats, in preference order. (If there are multiple
+ // similar formats, the "better" ones come first)
+ pl_fmt *formats;
+ int num_formats;
+
+ // PCI Bus address of the underlying device, to help with interop.
+ // This will only be filled in if interop is supported.
+ struct pl_gpu_pci_address pci;
+} *pl_gpu;
+
+// Attach a pl_cache object to this GPU instance. This cache will be
+// used to cache all compiled shaders, as well as several other shader objects
+// (e.g. cached 3DLUTs). Calling this with `cache = NULL` disables the cache.
+//
+// Note: Calling this after shaders have already been compiled will not
+// retroactively add those shaders to the cache, so it's recommended to set
+// this early, before creating any passes.
+PL_API void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache);
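+
+// For example (sketch only), assuming a `pl_cache` object created earlier via
+// the <libplacebo/cache.h> API, attached right after obtaining the `pl_gpu`:
+//
+//   pl_gpu_set_cache(gpu, cache);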
+
+enum pl_fmt_type {
+ PL_FMT_UNKNOWN = 0, // also used for inconsistent multi-component formats
+ PL_FMT_UNORM, // unsigned, normalized integer format (sampled as float)
+ PL_FMT_SNORM, // signed, normalized integer format (sampled as float)
+ PL_FMT_UINT, // unsigned integer format (sampled as integer)
+ PL_FMT_SINT, // signed integer format (sampled as integer)
+ PL_FMT_FLOAT, // (signed) float formats, any bit size
+ PL_FMT_TYPE_COUNT,
+};
+
+enum pl_fmt_caps {
+ PL_FMT_CAP_SAMPLEABLE = 1 << 0, // may be sampled from (PL_DESC_SAMPLED_TEX)
+ PL_FMT_CAP_STORABLE = 1 << 1, // may be used as storage image (PL_DESC_STORAGE_IMG)
+ PL_FMT_CAP_LINEAR = 1 << 2, // may be linearly sampled from (PL_TEX_SAMPLE_LINEAR)
+ PL_FMT_CAP_RENDERABLE = 1 << 3, // may be rendered to (pl_pass_params.target_fmt)
+ PL_FMT_CAP_BLENDABLE = 1 << 4, // may be blended to (pl_pass_params.enable_blend)
+ PL_FMT_CAP_BLITTABLE = 1 << 5, // may be blitted from/to (pl_tex_blit)
+ PL_FMT_CAP_VERTEX = 1 << 6, // may be used as a vertex attribute
+ PL_FMT_CAP_TEXEL_UNIFORM = 1 << 7, // may be used as a texel uniform buffer
+ PL_FMT_CAP_TEXEL_STORAGE = 1 << 8, // may be used as a texel storage buffer
+ PL_FMT_CAP_HOST_READABLE = 1 << 9, // may be used with `host_readable` textures
+ PL_FMT_CAP_READWRITE = 1 << 10, // may be used with PL_DESC_ACCESS_READWRITE
+
+ // Notes:
+ // - PL_FMT_CAP_LINEAR also implies PL_FMT_CAP_SAMPLEABLE
+ // - PL_FMT_CAP_STORABLE also implies `pl_gpu.glsl.compute`
+ // - PL_FMT_CAP_BLENDABLE implies PL_FMT_CAP_RENDERABLE
+ // - PL_FMT_CAP_VERTEX implies that the format is non-opaque
+ // - PL_FMT_CAP_HOST_READABLE implies that the format is non-opaque
+};
+
+struct pl_fmt_plane {
+ // Underlying format of this particular sub-plane. This describes the
+ // components, texel size and host representation for the purpose of
+ // e.g. transfers, blits, and sampling.
+ pl_fmt format;
+
+ // X/Y subsampling shift factor for this plane.
+ uint8_t shift_x, shift_y;
+};
+
+// Structure describing a texel/vertex format.
+struct pl_fmt_t {
+ const char *name; // symbolic name for this format (e.g. rgba32f)
+ uint64_t signature; // unique but stable signature (for pass reusability)
+
+ enum pl_fmt_type type; // the format's data type and interpretation
+ enum pl_fmt_caps caps; // the features supported by this format
+ int num_components; // number of components for this format
+ int component_depth[4]; // meaningful bits per component, texture precision
+ size_t internal_size; // internal texel size (for blit compatibility)
+
+ // For planar formats, this provides a description of each sub-plane.
+ //
+ // Note on planar formats: Planar formats are always opaque and typically
+ // support only a limited subset of capabilities (or none at all). Access
+ // should be done via sub-planes. (See `pl_tex.planes`)
+ struct pl_fmt_plane planes[4];
+ int num_planes; // or 0 for non-planar textures
+
+ // This controls the relationship between the data as seen by the host and
+ // the way it's interpreted by the texture. The host representation is
+ // always tightly packed (no padding bits in between each component).
+ //
+ // This representation assumes little endian ordering, i.e. components
+ // being ordered from LSB to MSB in memory. Note that for oddly packed
+ // formats like rgb10a2 or rgb565, this is inconsistent with the naming.
+ // (That is to say, rgb565 has sample order {2, 1, 0} under this convention
+ // - because rgb565 treats the R channel as the *most* significant bits)
+ //
+ // If `opaque` is true, then there's no meaningful correspondence between
+ // the two, and all of the remaining fields in this section are unset.
+ //
+ // If `emulated` is true, then this format doesn't actually exist on the
+ // GPU as an uploadable texture format - and any apparent support is being
+ // emulated (typically using compute shaders in the upload path).
+ bool opaque;
+ bool emulated;
+ size_t texel_size; // total size in bytes per texel
+ size_t texel_align; // texel alignment requirements (bytes)
+ int host_bits[4]; // number of meaningful bits in host memory
+ int sample_order[4]; // sampled index for each component, e.g.
+ // {2, 1, 0, 3} for BGRA textures
+
+ // For sampleable formats, this bool indicates whether or not the format
+ // is compatible with `textureGather()`
+ bool gatherable;
+
+ // If usable as a vertex or texel buffer format, this gives the GLSL type
+ // corresponding to the data. (e.g. vec4)
+ const char *glsl_type;
+
+ // If usable as a storage image or texel storage buffer
+ // (PL_FMT_CAP_STORABLE / PL_FMT_CAP_TEXEL_STORAGE), this gives the GLSL
+ // texel format corresponding to the format (e.g. rgba16ui), if any. This
+ // field may be NULL, in which case the format modifier may be left
+ // unspecified.
+ const char *glsl_format;
+
+ // If available, this gives the fourcc associated with the host
+ // representation. In particular, this is intended for use with
+ // PL_HANDLE_DMA_BUF, where this field will match the DRM format from
+ // <drm_fourcc.h>. May be 0, for formats without matching DRM fourcc.
+ uint32_t fourcc;
+
+ // If `fourcc` is set, this contains the list of supported drm format
+ // modifiers for this format.
+ const uint64_t *modifiers;
+ int num_modifiers;
+};
+
+// Returns whether or not a pl_fmt's components are ordered sequentially
+// in memory in the order RGBA.
+PL_API bool pl_fmt_is_ordered(pl_fmt fmt);
+
+// Returns whether or not a pl_fmt is sampled as a float (e.g. UNORM)
+PL_API bool pl_fmt_is_float(pl_fmt fmt);
+
+// Returns whether or not a pl_fmt supports a given DRM modifier.
+PL_API bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier);
+
+// Helper function to find a format with a given number of components and
+// minimum effective precision per component. If `host_bits` is set, then the
+// format will always be non-opaque, unpadded, ordered and have exactly this
+// bit depth for each component. Finally, all `caps` must be supported.
+PL_API pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components,
+ int min_depth, int host_bits, enum pl_fmt_caps caps);
+
+// Finds a vertex format for a given configuration. The resulting vertex will
+// have a component depth equivalent to the sizeof() the equivalent host type.
+// (e.g. PL_FMT_FLOAT will always have sizeof(float))
+PL_API pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components);
+
+// Find a format based on its name.
+PL_API pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name);
+
+// Find a format based on its fourcc.
+PL_API pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc);
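+
+// Illustrative sketch: find a 4-component, 16-bit UNORM format suitable for
+// linear sampling (returns NULL if no such format exists on this GPU):
+//
+//   pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 16, 16,
+//                            PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR);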
+
+// A generic 'timer query' object. These can be used to measure an
+// approximation of the GPU execution time of a given operation. Due to the
+// highly asynchronous nature of GPUs, the actual results of any individual
+// timer query may be delayed by quite a bit. As such, users should avoid
+// trying to pair any particular GPU command with any particular timer query
+// result, and only reuse `pl_timer` objects with identical operations. The
+// results of timer queries are guaranteed to be in-order, but individual
+// queries may be dropped, and some operations might not record timer results
+// at all. (For example, if the underlying hardware does not support timer
+// queries for a given operation type)
+//
+// Thread-safety: Unsafe
+typedef struct pl_timer_t *pl_timer;
+
+// Creates a new timer object. This may return NULL, for example if the
+// implementation does not support timers, but since passing NULL to
+// `pl_timer_destroy` and `pl_timer_query` is safe, users generally need not
+// concern themselves with handling this.
+PL_API pl_timer pl_timer_create(pl_gpu gpu);
+PL_API void pl_timer_destroy(pl_gpu gpu, pl_timer *);
+
+// Queries any results that have been measured since the last execution of
+// `pl_timer_query`. There may be more than one result, in which case the user
+// should simply call the function again to get the subsequent values. This
+// function returns a value of 0 in the event that there are no more
+// unprocessed results.
+//
+// The results are reported in nanoseconds, but the actual precision of the
+// timestamp queries may be significantly lower.
+//
+// Note: Results do not queue up indefinitely. Generally, the implementation
+// will only keep track of a small, fixed number of results internally. Make
+// sure to include this function as part of your main rendering loop to process
+// all of its results, or older results will be overwritten by newer ones.
+PL_API uint64_t pl_timer_query(pl_gpu gpu, pl_timer);
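+
+// Illustrative sketch: drain all pending results of a timer as part of the
+// main rendering loop (`timer` created earlier with `pl_timer_create`):
+//
+//   uint64_t ns;
+//   while ((ns = pl_timer_query(gpu, timer)))
+//       printf("operation took %.3f ms\n", ns * 1e-6);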
+
+enum pl_buf_mem_type {
+ PL_BUF_MEM_AUTO = 0, // use whatever seems most appropriate
+ PL_BUF_MEM_HOST, // try allocating from host memory (RAM)
+ PL_BUF_MEM_DEVICE, // try allocating from device memory (VRAM)
+ PL_BUF_MEM_TYPE_COUNT,
+
+ // Note: This distinction only matters for discrete GPUs
+};
+
+// Structure describing a buffer.
+struct pl_buf_params {
+ size_t size; // size in bytes (must be <= `pl_gpu_limits.max_buf_size`)
+ bool host_writable; // contents may be updated via pl_buf_write()
+ bool host_readable; // contents may be read back via pl_buf_read()
+ bool host_mapped; // create a persistent, RW mapping (pl_buf.data)
+
+ // May be used as PL_DESC_BUF_UNIFORM or PL_DESC_BUF_TEXEL_UNIFORM.
+ // Requires `size <= pl_gpu_limits.max_ubo_size`
+ bool uniform;
+
+ // May be used as PL_DESC_BUF_STORAGE or PL_DESC_BUF_TEXEL_STORAGE.
+ // Requires `size <= pl_gpu_limits.max_ssbo_size`
+ bool storable;
+
+ // May be used as the source of vertex data for `pl_pass_run`.
+ bool drawable;
+
+ // Provide a hint for the memory type you want to use when allocating
+ // this buffer's memory.
+ //
+ // Note: Restrictions may apply depending on the usage flags. In
+ // particular, allocating buffers with `uniform` or `storable` enabled from
+ // non-device memory will almost surely fail.
+ enum pl_buf_mem_type memory_type;
+
+ // Setting this to a format with the `PL_FMT_CAP_TEXEL_*` capability allows
+ // this buffer to be used as a `PL_DESC_BUF_TEXEL_*`, when `uniform` and
+ // `storable` are respectively also enabled.
+ pl_fmt format;
+
+ // At most one of `export_handle` and `import_handle` can be set for a
+ // buffer.
+
+ // Setting this indicates that the memory backing this buffer should be
+ // shared with external APIs. If so, this must be exactly *one* of
+ // `pl_gpu.export_caps.buf`.
+ enum pl_handle_type export_handle;
+
+ // Setting this indicates that the memory backing this buffer will be
+ // imported from an external API. If so, this must be exactly *one* of
+ // `pl_gpu.import_caps.buf`.
+ enum pl_handle_type import_handle;
+
+ // If the shared memory is being imported, the import handle must be
+ // specified here. Otherwise, this is ignored.
+ struct pl_shared_mem shared_mem;
+
+ // If non-NULL, the buffer will be created with these contents. Otherwise,
+ // the initial data is undefined. Using this does *not* require setting
+ // host_writable.
+ const void *initial_data;
+
+ // Arbitrary user data. libplacebo does not use this at all.
+ void *user_data;
+
+ // Arbitrary identifying tag. Used only for debugging purposes.
+ pl_debug_tag debug_tag;
+};
+
+#define pl_buf_params(...) (&(struct pl_buf_params) { \
+ .debug_tag = PL_DEBUG_TAG, \
+ __VA_ARGS__ \
+ })
+
+// A generic buffer, which can be used for multiple purposes (texture transfer,
+// storage buffer, uniform buffer, etc.)
+//
+// Note on efficiency: A pl_buf does not necessarily represent a true "buffer"
+// object on the underlying graphics API. It may also refer to a sub-slice of
+// a larger buffer, depending on the implementation details of the GPU. The
+// bottom line is that users do not need to worry about the efficiency of using
+// many small pl_buf objects. Having many small pl_bufs, even lots of few-byte
+// vertex buffers, is designed to be completely fine.
+//
+// Thread-safety: Unsafe
+typedef const struct pl_buf_t {
+ struct pl_buf_params params;
+ uint8_t *data; // for persistently mapped buffers, points to the first byte
+
+ // If `params.handle_type` is set, this structure references the shared
+ // memory backing this buffer, via the requested handle type.
+ //
+ // While this buffer is not in an "exported" state, the contents of the
+ // memory are undefined. (See: `pl_buf_export`)
+ struct pl_shared_mem shared_mem;
+} *pl_buf;
+
+// Create a buffer. The type of buffer depends on the parameters. The buffer
+// parameters must adhere to the restrictions imposed by the pl_gpu_limits.
+// Returns NULL on failure.
+//
+// For buffers with shared memory, the buffer is considered to be in an
+// "exported" state by default, and may be used directly by the external API
+// after being created (until the first libplacebo operation on the buffer).
+PL_API pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params);
+PL_API void pl_buf_destroy(pl_gpu gpu, pl_buf *buf);
+
+// This behaves like `pl_buf_create`, but if the buffer already exists and has
+// incompatible parameters, it will get destroyed first. A buffer is considered
+// "compatible" if it has the same buffer type and texel format, a size greater
+// than or equal to the requested size, and it has a superset of the features
+// the user requested. After this operation, the contents of the buffer are
+// undefined.
+//
+// Note: Due to its unpredictability, it's not allowed to use this with
+// `params->initial_data` being set. Similarly, it's not allowed on a buffer
+// with `params->export_handle`, since this may invalidate the corresponding
+// external API's handle. Conversely, it *is* allowed on a buffer with
+// `params->host_mapped`, and the corresponding `buf->data` pointer *may*
+// change as a result of doing so.
+//
+// Note: If the `user_data` alone changes, this does not trigger a buffer
+// recreation. In theory, this can be used to detect when the buffer ended
+// up being recreated.
+PL_API bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params);
+
+// Update the contents of a buffer, starting at a given offset (must be a
+// multiple of 4) and up to a given size, with the contents of *data.
+//
+// This function will block until the buffer is no longer in use. Use
+// `pl_buf_poll` to perform non-blocking queries of buffer availability.
+//
+// Note: This function can incur synchronization overhead, so it shouldn't be
+// used in tight loops. If you do need to loop (e.g. to perform a strided
+// write), consider using host-mapped buffers, or fixing the memory in RAM,
+// before calling this function.
+PL_API void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset,
+ const void *data, size_t size);
+
+// Read back the contents of a buffer, starting at a given offset, storing the
+// data into *dest. Returns whether successful.
+//
+// This function will block until the buffer is no longer in use. Use
+// `pl_buf_poll` to perform non-blocking queries of buffer availability.
+PL_API bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset,
+ void *dest, size_t size);
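+
+// Illustrative sketch: create a small host-visible buffer, update part of it,
+// and read the contents back (error handling omitted):
+//
+//   pl_buf buf = pl_buf_create(gpu, pl_buf_params(
+//       .size          = 1024,
+//       .host_writable = true,
+//       .host_readable = true,
+//   ));
+//   const uint8_t src[16] = {0};
+//   uint8_t dst[16];
+//   pl_buf_write(gpu, buf, 0, src, sizeof(src));
+//   pl_buf_read(gpu, buf, 0, dst, sizeof(dst));
+//   pl_buf_destroy(gpu, &buf);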
+
+// Copy `size` bytes from one buffer to another, reading from and writing to
+// the respective offsets.
+PL_API void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size);
+
+// Initiates a buffer export operation, allowing a buffer to be accessed by an
+// external API. This is only valid for buffers with `params.handle_type`.
+// Calling this twice in a row is a harmless no-op. Returns whether successful.
+//
+// There is no corresponding "buffer import" operation, the next libplacebo
+// operation that touches the buffer (e.g. pl_tex_upload, but also pl_buf_write
+// and pl_buf_read) will implicitly import the buffer back to libplacebo. Users
+// must ensure that all pending operations made by the external API are fully
+// completed before using it in libplacebo again. (Otherwise, the behaviour
+// is undefined)
+//
+// Please note that this function returning does not mean the memory is
+// immediately available as such. In general, it will mark a buffer as "in use"
+// in the same way any other buffer operation would, and it is the user's
+// responsibility to wait until `pl_buf_poll` returns false before accessing
+// the memory from the external API.
+//
+// In terms of the access performed by this operation, it is not considered a
+// "read" or "write" and therefore does not technically conflict with reads or
+// writes to the buffer performed by the host (via mapped memory - any use of
+// `pl_buf_read` or `pl_buf_write` would defeat the purpose of the export).
+// However, restrictions made by the external API may apply that prevent this.
+//
+// The recommended use pattern is something like this:
+//
+// while (loop) {
+// pl_buf buf = get_free_buffer(); // or block on pl_buf_poll
+// // write to the buffer using the external API
+// pl_tex_upload(gpu, /* ... buf ... */); // implicitly imports
+// pl_buf_export(gpu, buf);
+// }
+//
+// i.e. perform an external API operation, then use and immediately export the
+// buffer in libplacebo, and finally wait until `pl_buf_poll` is false before
+// re-using it in the external API. (Or get a new buffer in the meantime)
+PL_API bool pl_buf_export(pl_gpu gpu, pl_buf buf);
+
+// Returns whether or not a buffer is currently "in use". This can either be
+// because of a pending read operation, a pending write operation or a pending
+// buffer export operation. Any access to the buffer by external APIs or via
+// the host pointer (for host-mapped buffers) is forbidden while a buffer is
+// "in use". The only exception to this rule is multiple reads, for example
+// reading from a buffer with `pl_tex_upload` while simultaneously reading from
+// it using mapped memory.
+//
+// The `timeout`, specified in nanoseconds, indicates how long to block for
+// before returning. If set to 0, this function will never block, and only
+// returns the current status of the buffer. The actual precision of the
+// timeout may be significantly longer than one nanosecond, and has no upper
+// bound. This function does not provide hard latency guarantees. This function
+// may also return at any time, even if the buffer is still in use. If the user
+// wishes to block until the buffer is definitely no longer in use, the
+// recommended usage is:
+//
+// while (pl_buf_poll(gpu, buf, UINT64_MAX))
+// ; // do nothing
+//
+// Note: libplacebo operations on buffers are always internally synchronized,
+// so this is only needed for host-mapped or externally exported buffers.
+// However, it may be used to do non-blocking queries before calling blocking
+// functions such as `pl_buf_read`.
+//
+// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly
+// synchronized, meaning it can safely be called on a `pl_buf` that is in use
+// by another thread.
+PL_API bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout);
+
+enum pl_tex_sample_mode {
+ PL_TEX_SAMPLE_NEAREST, // nearest neighbour sampling
+ PL_TEX_SAMPLE_LINEAR, // linear filtering, requires PL_FMT_CAP_LINEAR
+ PL_TEX_SAMPLE_MODE_COUNT,
+};
+
+enum pl_tex_address_mode {
+ PL_TEX_ADDRESS_CLAMP, // clamp the nearest edge texel
+ PL_TEX_ADDRESS_REPEAT, // repeat (tile) the texture
+ PL_TEX_ADDRESS_MIRROR, // repeat (mirror) the texture
+ PL_TEX_ADDRESS_MODE_COUNT,
+};
+
+// Structure describing a texture.
+struct pl_tex_params {
+ int w, h, d; // physical dimension; unused dimensions must be 0
+ pl_fmt format;
+
+ // The following bools describe what operations can be performed. The
+ // corresponding pl_fmt capability must be set for every enabled
+ // operation type.
+ //
+ // Note: For planar formats, it is also possible to set capabilities only
+ // supported by sub-planes. In this case, the corresponding functionality
+ // will be available for the sub-plane, but not the planar texture itself.
+ bool sampleable; // usable as a PL_DESC_SAMPLED_TEX
+ bool renderable; // usable as a render target (pl_pass_run)
+ // (must only be used with 2D textures)
+ bool storable; // usable as a storage image (PL_DESC_STORAGE_IMG)
+ bool blit_src; // usable as a blit source
+ bool blit_dst; // usable as a blit destination
+ bool host_writable; // may be updated with pl_tex_upload()
+ bool host_readable; // may be fetched with pl_tex_download()
+
+ // Note: For `blit_src`, `blit_dst`, the texture must either be
+ // 2-dimensional or `pl_gpu_limits.blittable_1d_3d` must be set.
+
+ // At most one of `export_handle` and `import_handle` can be set for a
+ // texture.
+
+ // Setting this indicates that the memory backing this texture should be
+ // shared with external APIs. If so, this must be exactly *one* of
+ // `pl_gpu.export_caps.tex`.
+ enum pl_handle_type export_handle;
+
+ // Setting this indicates that the memory backing this texture will be
+ // imported from an external API. If so, this must be exactly *one* of
+ // `pl_gpu.import_caps.tex`. Mutually exclusive with `initial_data`.
+ enum pl_handle_type import_handle;
+
+ // If the shared memory is being imported, the import handle must be
+ // specified here. Otherwise, this is ignored.
+ struct pl_shared_mem shared_mem;
+
+ // If non-NULL, the texture will be created with these contents (tightly
+ // packed). Using this does *not* require setting host_writable. Otherwise,
+ // the initial data is undefined. Mutually exclusive with `import_handle`.
+ const void *initial_data;
+
+ // Arbitrary user data. libplacebo does not use this at all.
+ void *user_data;
+
+ // Arbitrary identifying tag. Used only for debugging purposes.
+ pl_debug_tag debug_tag;
+};
+
+#define pl_tex_params(...) (&(struct pl_tex_params) { \
+ .debug_tag = PL_DEBUG_TAG, \
+ __VA_ARGS__ \
+ })
+
+static inline int pl_tex_params_dimension(const struct pl_tex_params params)
+{
+ return params.d ? 3 : params.h ? 2 : 1;
+}
+
+enum pl_sampler_type {
+ PL_SAMPLER_NORMAL, // gsampler2D, gsampler3D etc.
+ PL_SAMPLER_RECT, // gsampler2DRect
+ PL_SAMPLER_EXTERNAL, // gsamplerExternalOES
+ PL_SAMPLER_TYPE_COUNT,
+};
+
+// Conflates the following typical GPU API concepts:
+// - texture itself
+// - sampler state
+// - staging buffers for texture upload
+// - framebuffer objects
+// - wrappers for swapchain framebuffers
+// - synchronization needed for upload/rendering/etc.
+//
+// Essentially, a pl_tex can be anything from a normal texture, a wrapped
+// external/real framebuffer, a framebuffer object + texture pair, or a mapped
+// texture (via pl_hwdec), to other sorts of things that can be sampled from
+// and/or rendered to.
+//
+// Thread-safety: Unsafe
+typedef const struct pl_tex_t *pl_tex;
+struct pl_tex_t {
+ struct pl_tex_params params;
+
+ // If `params.format` is a planar format, this contains `pl_tex` handles
+ // encapsulating individual texture planes. Conversely, if this is a
+ // sub-plane of a planar texture, `parent` points to the planar texture.
+ //
+ // Note: Calling `pl_tex_destroy` on sub-planes is undefined behavior.
+ pl_tex planes[4];
+ pl_tex parent;
+
+ // If `params.export_handle` is set, this structure references the shared
+ // memory backing this buffer, via the requested handle type.
+ //
+ // While this texture is not in an "exported" state, the contents of the
+ // memory are undefined. (See: `pl_tex_export`)
+ //
+ // Note: Due to vulkan driver limitations, `shared_mem.drm_format_mod` will
+ // currently always be set to DRM_FORMAT_MOD_INVALID. No guarantee can be
+ // made about the cross-driver compatibility of textures exported this way.
+ struct pl_shared_mem shared_mem;
+
+ // If `params.sampleable` is true, this indicates the correct sampler type
+ // to use when sampling from this texture.
+ enum pl_sampler_type sampler_type;
+};
+
+// Create a texture (with undefined contents). Returns NULL on failure. This is
+// assumed to be an expensive/rare operation, and may need to perform memory
+// allocation or framebuffer creation.
+PL_API pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params);
+PL_API void pl_tex_destroy(pl_gpu gpu, pl_tex *tex);
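+
+// Illustrative sketch: create a 2D texture that can be sampled from and
+// rendered to, using a format found via `pl_find_fmt` (assumed non-NULL and
+// to support the requested capabilities):
+//
+//   pl_tex tex = pl_tex_create(gpu, pl_tex_params(
+//       .w          = 1920,
+//       .h          = 1080,
+//       .format     = fmt,
+//       .sampleable = true,
+//       .renderable = true,
+//   ));
+//   // ... use the texture ...
+//   pl_tex_destroy(gpu, &tex);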
+
+// This works like `pl_tex_create`, but if the texture already exists and has
+// incompatible texture parameters, it will get destroyed first. A texture is
+// considered "compatible" if it has the same texture format and sample/address
+// mode and it supports a superset of the features the user requested.
+//
+// Even if the texture is not recreated, calling this function will still
+// invalidate the contents of the texture. (Note: Because of this,
+// `initial_data` may not be used with `pl_tex_recreate`. Doing so is an error)
+//
+// Note: If the `user_data` alone changes, this does not trigger a texture
+// recreation. In theory, this can be used to detect when the texture ended
+// up being recreated.
+PL_API bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params);
+
+// Invalidates the contents of a texture. After this, the contents are fully
+// undefined.
+PL_API void pl_tex_invalidate(pl_gpu gpu, pl_tex tex);
+
+union pl_clear_color {
+ float f[4];
+ int32_t i[4];
+ uint32_t u[4];
+};
+
+// Clear the dst texture with the given color (rgba). This is functionally
+// identical to a blit operation, which means `dst->params.blit_dst` must be
+// set.
+PL_API void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color);
+
+// Wrapper for `pl_tex_clear_ex` which only works for floating point textures.
+PL_API void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]);
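+
+// For example (sketch only), clearing a float texture to opaque black,
+// assuming `tex` was created with `blit_dst` enabled:
+//
+//   const float black[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+//   pl_tex_clear(gpu, tex, black);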
+
+struct pl_tex_blit_params {
+ // The texture to blit from. Must have `params.blit_src` enabled.
+ pl_tex src;
+
+ // The texture to blit to. Must have `params.blit_dst` enabled, and a
+ // format that is loosely compatible with `src`. This essentially means
+ // that they must have the same `internal_size`. Additionally, UINT
+ // textures can only be blitted to other UINT textures, and SINT textures
+ // can only be blitted to other SINT textures.
+ pl_tex dst;
+
+ // The region of the source texture to blit. Must be within the texture
+ // bounds of `src`. May be flipped. (Optional)
+ pl_rect3d src_rc;
+
+ // The region of the destination texture to blit into. Must be within the
+ // texture bounds of `dst`. May be flipped. Areas outside of `dst_rc` in
+ // `dst` are preserved. (Optional)
+ pl_rect3d dst_rc;
+
+ // If `src_rc` and `dst_rc` have different sizes, the texture will be
+ // scaled using the given texture sampling mode.
+ enum pl_tex_sample_mode sample_mode;
+};
+
+#define pl_tex_blit_params(...) (&(struct pl_tex_blit_params) { __VA_ARGS__ })
+
+// Copy a sub-rectangle from one texture to another.
+PL_API void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params);
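+
+// Illustrative sketch: scaled blit from `src_tex` to `dst_tex` (both assumed
+// to have the appropriate blit capabilities), with the optional source and
+// destination rects left unset:
+//
+//   pl_tex_blit(gpu, pl_tex_blit_params(
+//       .src         = src_tex,
+//       .dst         = dst_tex,
+//       .sample_mode = PL_TEX_SAMPLE_LINEAR,
+//   ));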
+
+// Structure describing a texture transfer operation.
+struct pl_tex_transfer_params {
+ // Texture to transfer to/from. Depending on the type of the operation,
+ // this must have params.host_writable (uploads) or params.host_readable
+ // (downloads) set, respectively.
+ pl_tex tex;
+
+ // Note: Superfluous parameters are ignored, i.e. for a 1D texture, the y
+ // and z fields of `rc`, as well as the corresponding pitches, are ignored.
+ // In all other cases, the pitch must be large enough to contain the
+ // corresponding dimension of `rc`, and the `rc` must be normalized and
+ // fully contained within the image dimensions. Missing fields in the `rc`
+ // are inferred from the image size. If unset, the pitch is inferred
+ // from `rc` (that is, it's assumed that the data is tightly packed in the
+ // buffer). Otherwise, `row_pitch` *must* be a multiple of
+ // `tex->params.format->texel_align`, and `depth_pitch` must be a multiple
+ // of `row_pitch`.
+ pl_rect3d rc; // region of the texture to transfer
+ size_t row_pitch; // the number of bytes separating image rows
+ size_t depth_pitch; // the number of bytes separating image planes
+
+ // An optional timer to report the approximate duration of the texture
+ // transfer to. Note that this is only an approximation, since the actual
+ // texture transfer may happen entirely in the background (in particular,
+ // for implementations with asynchronous transfer capabilities). It's also
+ // not guaranteed that all GPUs support this.
+ pl_timer timer;
+
+ // An optional callback to fire after the operation completes. If this is
+ // specified, then the operation is performed asynchronously. Note that
+// transfers to/from buffers are always asynchronous, even without this
+ // field, so it's more useful for `ptr` transfers. (Though it can still be
+ // helpful to avoid having to manually poll buffers all the time)
+ //
+ // When this is *not* specified, uploads from `ptr` are still asynchronous
+ // but require a host memcpy, while downloads from `ptr` are blocking. As
+ // such, it's recommended to always try using asynchronous texture
+ // transfers wherever possible.
+ //
+ // Note: Requires `pl_gpu_limits.callbacks`
+ //
+ // Note: Callbacks are implicitly synchronized, meaning that callbacks are
+ // guaranteed to never execute concurrently with other callbacks. However,
+ // they may execute from any thread that the `pl_gpu` is used on.
+ void (*callback)(void *priv);
+ void *priv; // arbitrary user data
+
+ // For the data source/target of a transfer operation, there are two valid
+ // options:
+ //
+ // 1. Transferring to/from a buffer: (requires `pl_gpu_limits.buf_transfer`)
+ pl_buf buf; // buffer to use
+ size_t buf_offset; // offset of data within buffer, should be a
+ // multiple of `tex->params.format->texel_size`
+ // 2. Transferring to/from host memory directly:
+ void *ptr; // address of data
+ bool no_import; // always use memcpy, bypassing host ptr import
+
+ // Note: The contents of the memory region / buffer must exactly match the
+ // texture format; i.e. there is no explicit conversion between formats.
+};
+
+#define pl_tex_transfer_params(...) (&(struct pl_tex_transfer_params) { __VA_ARGS__ })
+
+// Upload data to a texture. Returns whether successful.
+PL_API bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params);
+
+// Download data from a texture. Returns whether successful.
+PL_API bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params);
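+
+// Illustrative sketch: upload tightly packed host memory to a texture. The
+// pitches and `rc` are left unset, so they are inferred from the texture
+// dimensions; `pixels` is assumed to point to enough data:
+//
+//   bool ok = pl_tex_upload(gpu, pl_tex_transfer_params(
+//       .tex = tex,
+//       .ptr = pixels,
+//   ));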
+
+// Returns whether or not a texture is currently "in use". This can either be
+// because of a pending read operation, a pending write operation or a pending
+// texture export operation. Note that this function's usefulness is extremely
+// limited under ordinary circumstances. In practically all cases, textures do
+// not need to be directly synchronized by the user, except when interfacing
+// with external libraries. This function should NOT, however, be used as a
+// crutch to avoid having to implement semaphore-based synchronization. Use
+// the API-specific functions such as `pl_vulkan_hold/release` for that.
+//
+// A good example of a use case in which this function is required is when
+// interoperating with external memory management that needs to know when an
+// imported texture is safe to free / reclaim internally, in which case
+// semaphores are insufficient because memory management is a host operation.
+//
+// The `timeout`, specified in nanoseconds, indicates how long to block for
+// before returning. If set to 0, this function will never block, and only
+// returns the current status of the texture. The actual precision of the
+// timeout may be significantly longer than one nanosecond, and has no upper
+// bound. This function does not provide hard latency guarantees. This function
+// may also return at any time, even if the texture is still in use. If the
+// user wishes to block until the texture is definitely no longer in use, the
+// recommended usage is:
+//
+ // while (pl_tex_poll(gpu, tex, UINT64_MAX))
+// ; // do nothing
+//
+// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly
+// synchronized, meaning it can safely be called on a `pl_tex` that is in use
+// by another thread.
+PL_API bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout);
+
+// Data type of a shader input variable (e.g. uniform, or UBO member)
+enum pl_var_type {
+ PL_VAR_INVALID = 0,
+ PL_VAR_SINT, // C: int GLSL: int/ivec
+ PL_VAR_UINT, // C: unsigned int GLSL: uint/uvec
+ PL_VAR_FLOAT, // C: float GLSL: float/vec/mat
+ PL_VAR_TYPE_COUNT
+};
+
+// Returns the host size (in bytes) of a pl_var_type.
+PL_API size_t pl_var_type_size(enum pl_var_type type);
+
+// Represents a shader input variable (concrete data, e.g. vector, matrix)
+struct pl_var {
+ const char *name; // name as used in the shader
+ enum pl_var_type type;
+ // The total number of values is given by dim_v * dim_m. For example, a
+ // vec2 would have dim_v = 2 and dim_m = 1. A mat3x4 would have dim_v = 4
+ // and dim_m = 3.
+ int dim_v; // vector dimension
+ int dim_m; // matrix dimension (number of columns, see below)
+ int dim_a; // array dimension
+};
+
+// Helper functions for constructing the most common pl_vars, with names
+// corresponding to their corresponding GLSL built-in types.
+PL_API struct pl_var pl_var_float(const char *name);
+PL_API struct pl_var pl_var_vec2(const char *name);
+PL_API struct pl_var pl_var_vec3(const char *name);
+PL_API struct pl_var pl_var_vec4(const char *name);
+PL_API struct pl_var pl_var_mat2(const char *name);
+PL_API struct pl_var pl_var_mat2x3(const char *name);
+PL_API struct pl_var pl_var_mat2x4(const char *name);
+PL_API struct pl_var pl_var_mat3(const char *name);
+PL_API struct pl_var pl_var_mat3x4(const char *name);
+PL_API struct pl_var pl_var_mat4x2(const char *name);
+PL_API struct pl_var pl_var_mat4x3(const char *name);
+PL_API struct pl_var pl_var_mat4(const char *name);
+PL_API struct pl_var pl_var_int(const char *name);
+PL_API struct pl_var pl_var_ivec2(const char *name);
+PL_API struct pl_var pl_var_ivec3(const char *name);
+PL_API struct pl_var pl_var_ivec4(const char *name);
+PL_API struct pl_var pl_var_uint(const char *name);
+PL_API struct pl_var pl_var_uvec2(const char *name);
+PL_API struct pl_var pl_var_uvec3(const char *name);
+PL_API struct pl_var pl_var_uvec4(const char *name);
+
+struct pl_named_var {
+ const char *glsl_name;
+ struct pl_var var;
+};
+
+// The same list as above, tagged by name and terminated with a {0} entry.
+PL_API extern const struct pl_named_var pl_var_glsl_types[];
+
+// Efficient helper function for performing a lookup in the above array.
+// Returns NULL if the variable is not legal. Note that the array dimension is
+// ignored, since it's usually part of the variable name and not the type name.
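+//
+// For instance, a rough sketch of the expected results:
+//
+//   pl_var_glsl_type_name(pl_var_vec3("foo"))  // => "vec3"
+//   pl_var_glsl_type_name(pl_var_mat3("bar"))  // => "mat3"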
+PL_API const char *pl_var_glsl_type_name(struct pl_var var);
+
+// Converts a pl_fmt to an "equivalent" pl_var. Equivalent in this sense means
+// that the pl_var's type will be the same as the vertex's sampled type (e.g.
+// PL_FMT_UNORM gets turned into PL_VAR_FLOAT).
+PL_API struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name);
+
+// Describes the memory layout of a variable, relative to some starting location
+// (typically the offset within a uniform/storage/pushconstant buffer)
+//
+// Note on matrices: All GPUs expect column major matrices, for both buffers and
+// input variables. Care needs to be taken to avoid trying to use e.g. a
+// pl_matrix3x3 (which is row major) directly as a pl_var_update.data!
+//
+// In terms of the host layout, a column-major matrix (e.g. matCxR) with C
+// columns and R rows is treated like an array vecR[C]. The `stride` here refers
+// to the separation between these array elements, i.e. the separation between
+// the individual columns.
+//
+// Visualization of a mat4x3:
+//
+// 0 1 2 3 <- columns
+// 0 [ (A) (D) (G) (J) ]
+// 1 [ (B) (E) (H) (K) ]
+// 2 [ (C) (F) (I) (L) ]
+// ^ rows
+//
+// Layout in GPU memory: (stride=16, size=60)
+//
+// [ A B C ] X <- column 0, offset +0
+// [ D E F ] X <- column 1, offset +16
+// [ G H I ] X <- column 2, offset +32
+// [ J K L ] <- column 3, offset +48
+//
+// Note the lack of padding on the last column in this example.
+// In general: size <= stride * dim_m
+//
+// C representation: (stride=12, size=48)
+//
+// { { A, B, C },
+// { D, E, F },
+// { G, H, I },
+// { J, K, L } }
+//
+// Note on arrays: `stride` represents both the stride between elements of a
+// matrix, and the stride between elements of an array. That is, there is no
+// distinction between the columns of a matrix and the rows of an array. For
+// example, a mat2[10] and a vec2[20] share the same pl_var_layout - the stride
+// would be sizeof(vec2) and the size would be sizeof(vec2) * 2 * 10.
+//
+// For non-array/matrix types, `stride` is equal to `size`.
+
+struct pl_var_layout {
+ size_t offset; // the starting offset of the first byte
+ size_t stride; // the delta between two elements of an array/matrix
+ size_t size; // the total size of the input
+};
+
+// Returns the host layout of an input variable as required for a
+// tightly-packed, byte-aligned C data type, given a starting offset.
+PL_API struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var);
+
+// Returns the GLSL std140 layout of an input variable given a current buffer
+// offset, as required for a buffer descriptor of type PL_DESC_BUF_UNIFORM
+//
+// The normal way to use this function is when calculating the size and offset
+// requirements of a uniform buffer in an incremental fashion, to calculate the
+// new offset of the next variable in this buffer.
+PL_API struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var);
+
+// Returns the GLSL std430 layout of an input variable given a current buffer
+// offset, as required for a buffer descriptor of type PL_DESC_BUF_STORAGE, and
+// for push constants.
+PL_API struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var);
+
+// Convenience definitions / friendly names for these
+#define pl_buf_uniform_layout pl_std140_layout
+#define pl_buf_storage_layout pl_std430_layout
+#define pl_push_constant_layout pl_std430_layout
+
+// Like memcpy, but copies bytes from `src` to `dst` in a manner governed by
+// the stride and size of `dst_layout` as well as `src_layout`. Also takes
+// into account the respective `offset`.
+PL_API void memcpy_layout(void *dst, struct pl_var_layout dst_layout,
+ const void *src, struct pl_var_layout src_layout);
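+
+// As a rough sketch of the intended usage pattern (the vec3 + mat4 uniform
+// block, and the host-visible `ubo` memory of at least `ubo_size` bytes, are
+// assumptions of this example):
+//
+//   struct pl_var color = pl_var_vec3("color");
+//   struct pl_var mvp   = pl_var_mat4("mvp");
+//
+//   struct pl_var_layout color_layout = pl_std140_layout(0, &color);
+//   struct pl_var_layout mvp_layout =
+//       pl_std140_layout(color_layout.offset + color_layout.size, &mvp);
+//   size_t ubo_size = mvp_layout.offset + mvp_layout.size;
+//
+//   // Copy tightly packed (column-major!) host data into the padded buffer:
+//   float mvp_data[4][4] = { ... };
+//   memcpy_layout(ubo, mvp_layout, mvp_data, pl_var_host_layout(0, &mvp));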
+
+// Represents a compile-time constant.
+struct pl_constant {
+ enum pl_var_type type; // constant data type
+ uint32_t id; // GLSL `constant_id`
+ size_t offset; // byte offset in `constant_data`
+};
+
+// Represents a vertex attribute.
+struct pl_vertex_attrib {
+ const char *name; // name as used in the shader
+ pl_fmt fmt; // data format (must have PL_FMT_CAP_VERTEX)
+ size_t offset; // byte offset into the vertex struct
+ int location; // vertex location (as used in the shader)
+};
+
+// Returns an abstract namespace index for a given descriptor type. This will
+// always be a value >= 0 and < PL_DESC_TYPE_COUNT. Implementations can use
+// this to figure out which descriptors may share the same value of `binding`.
+// Bindings must only be unique for all descriptors within the same namespace.
+PL_API int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type);
+
+// Access mode of a shader input descriptor.
+enum pl_desc_access {
+ PL_DESC_ACCESS_READWRITE,
+ PL_DESC_ACCESS_READONLY,
+ PL_DESC_ACCESS_WRITEONLY,
+ PL_DESC_ACCESS_COUNT,
+};
+
+// Returns the GLSL syntax for a given access mode (e.g. "readonly").
+PL_API const char *pl_desc_access_glsl_name(enum pl_desc_access mode);
+
+// Represents a shader descriptor (e.g. texture or buffer binding)
+struct pl_desc {
+ const char *name; // name as used in the shader
+ enum pl_desc_type type;
+
+ // The binding of this descriptor, as used in the shader. All bindings
+ // within a namespace must be unique. (see: pl_desc_namespace)
+ int binding;
+
+ // For storage images and storage buffers, this can be used to restrict
+ // the type of access that may be performed on the descriptor. Ignored for
+ // the other descriptor types (uniform buffers and sampled textures are
+ // always read-only).
+ enum pl_desc_access access;
+};
+
+// Framebuffer blending mode (for raster passes)
+enum pl_blend_mode {
+ PL_BLEND_ZERO,
+ PL_BLEND_ONE,
+ PL_BLEND_SRC_ALPHA,
+ PL_BLEND_ONE_MINUS_SRC_ALPHA,
+ PL_BLEND_MODE_COUNT,
+};
+
+struct pl_blend_params {
+ enum pl_blend_mode src_rgb;
+ enum pl_blend_mode dst_rgb;
+ enum pl_blend_mode src_alpha;
+ enum pl_blend_mode dst_alpha;
+};
+
+#define pl_blend_params(...) (&(struct pl_blend_params) { __VA_ARGS__ })
+
+// Typical alpha compositing
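+// (as a rough sketch, this corresponds to standard source-over blending, i.e.
+// approximately the following configuration:
+//
+//   { .src_rgb   = PL_BLEND_SRC_ALPHA,
+//     .dst_rgb   = PL_BLEND_ONE_MINUS_SRC_ALPHA,
+//     .src_alpha = PL_BLEND_ONE,
+//     .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA })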
+PL_API extern const struct pl_blend_params pl_alpha_overlay;
+
+enum pl_prim_type {
+ PL_PRIM_TRIANGLE_LIST,
+ PL_PRIM_TRIANGLE_STRIP,
+ PL_PRIM_TYPE_COUNT,
+};
+
+enum pl_index_format {
+ PL_INDEX_UINT16 = 0,
+ PL_INDEX_UINT32,
+ PL_INDEX_FORMAT_COUNT,
+};
+
+enum pl_pass_type {
+ PL_PASS_INVALID = 0,
+ PL_PASS_RASTER, // vertex+fragment shader
+ PL_PASS_COMPUTE, // compute shader (requires `pl_gpu.glsl.compute`)
+ PL_PASS_TYPE_COUNT,
+};
+
+// Description of a rendering pass. It conflates the following:
+// - GLSL shader(s) and its list of inputs
+// - target parameters (for raster passes)
+struct pl_pass_params {
+ enum pl_pass_type type;
+
+ // Input variables.
+ struct pl_var *variables;
+ int num_variables;
+
+ // Input descriptors.
+ struct pl_desc *descriptors;
+ int num_descriptors;
+
+ // Compile-time specialization constants.
+ struct pl_constant *constants;
+ int num_constants;
+
+ // Initial data for the specialization constants. Optional. If NULL,
+ // specialization constants receive the values from the shader text.
+ void *constant_data;
+
+ // Push constant region. Must be a multiple of 4 <= limits.max_pushc_size
+ size_t push_constants_size;
+
+ // The shader text in GLSL. For PL_PASS_RASTER, this is interpreted
+ // as a fragment shader. For PL_PASS_COMPUTE, this is interpreted as
+ // a compute shader.
+ const char *glsl_shader;
+
+ // --- type==PL_PASS_RASTER only
+
+ // Describes the interpretation and layout of the vertex data.
+ enum pl_prim_type vertex_type;
+ struct pl_vertex_attrib *vertex_attribs;
+ int num_vertex_attribs;
+ size_t vertex_stride; // must be a multiple of limits.align_vertex_stride
+
+ // The vertex shader itself.
+ const char *vertex_shader;
+
+ // Target format. The format must support PL_FMT_CAP_RENDERABLE. The
+ // resulting pass may only be used on textures that have a format with a
+ // `pl_fmt.signature` compatible with this format.
+ pl_fmt target_format;
+
+ // Target blending mode. If this is NULL, blending is disabled. Otherwise,
+ // the `target_format` must also support PL_FMT_CAP_BLENDABLE.
+ const struct pl_blend_params *blend_params;
+
+ // If false, the target's existing contents will be discarded before the
+ // pass is run. (Semantically equivalent to calling pl_tex_invalidate
+ // before every pl_pass_run, but slightly more efficient)
+ //
+ // Specifying `blend_params` requires `load_target` to be true.
+ bool load_target;
+
+ // --- Deprecated / removed fields.
+ PL_DEPRECATED const uint8_t *cached_program; // Non-functional
+ PL_DEPRECATED size_t cached_program_len;
+};
+
+#define pl_pass_params(...) (&(struct pl_pass_params) { __VA_ARGS__ })
+
+// Conflates the following typical GPU API concepts:
+// - various kinds of shaders
+// - rendering pipelines
+// - descriptor sets, uniforms, other bindings
+// - all synchronization necessary
+// - the current values of all inputs
+//
+// Thread-safety: Unsafe
+typedef const struct pl_pass_t {
+ struct pl_pass_params params;
+} *pl_pass;
+
+// Compile a shader and create a render pass. This is a rare/expensive
+// operation and may take a significant amount of time, even if a cached
+// program is used. Returns NULL on failure.
+PL_API pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params);
+PL_API void pl_pass_destroy(pl_gpu gpu, pl_pass *pass);
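+
+// As a rough usage sketch for a compute pass (the GLSL source `compute_src`,
+// the descriptor type PL_DESC_STORAGE_IMG and the binding number are
+// assumptions of this example):
+//
+//   struct pl_desc img = {
+//       .name    = "img",
+//       .type    = PL_DESC_STORAGE_IMG,
+//       .binding = 0,
+//       .access  = PL_DESC_ACCESS_WRITEONLY,
+//   };
+//
+//   pl_pass pass = pl_pass_create(gpu, pl_pass_params(
+//       .type            = PL_PASS_COMPUTE,
+//       .descriptors     = &img,
+//       .num_descriptors = 1,
+//       .glsl_shader     = compute_src, // complete compute shader text
+//   ));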
+
+struct pl_desc_binding {
+ const void *object; // pl_* object with type corresponding to pl_desc_type
+
+ // For PL_DESC_SAMPLED_TEX, this can be used to configure the sampler.
+ enum pl_tex_address_mode address_mode;
+ enum pl_tex_sample_mode sample_mode;
+};
+
+struct pl_var_update {
+ int index; // index into params.variables[]
+ const void *data; // pointer to raw byte data corresponding to pl_var_host_layout()
+};
+
+struct pl_pass_run_params {
+ pl_pass pass;
+
+ // If present, the shader will be re-specialized with the new constants
+ // provided. This is a significantly cheaper operation than recompiling a
+ // brand new shader, but should still be avoided if possible.
+ //
+ // Leaving it as NULL re-uses the existing specialization values. Ignored
+ // if the shader has no specialization constants. Guaranteed to be a no-op
+ // if the values have not changed since the last invocation.
+ void *constant_data;
+
+ // This list only contains descriptors/variables which have changed
+ // since the previous invocation. All non-mentioned variables implicitly
+ // preserve their state from the last invocation.
+ struct pl_var_update *var_updates;
+ int num_var_updates;
+
+ // This list contains all descriptors used by this pass. It must
+ // always be filled, even if the descriptors haven't changed. The order
+ // must match that of pass->params.descriptors
+ struct pl_desc_binding *desc_bindings;
+
+ // The push constants for this invocation. This must always be set and
+ // fully defined for every invocation if params.push_constants_size > 0.
+ void *push_constants;
+
+ // An optional timer to report the approximate runtime of this shader pass
+ // invocation to. Note that this is only an approximation, since shaders
+ // may overlap their execution times and contend for GPU time.
+ pl_timer timer;
+
+ // --- pass->params.type==PL_PASS_RASTER only
+
+ // Target must be a 2D texture, `target->params.renderable` must be true,
+ // and `target->params.format->signature` must match the signature provided
+ // in `pass->params.target_format`.
+ //
+ // If the viewport or scissors are left blank, they are inferred from
+ // target->params.
+ //
+ // WARNING: Rendering to a *target that is being read from by the same
+ // shader is undefined behavior. In general, trying to bind the same
+ // resource multiple times to the same shader is undefined behavior.
+ pl_tex target;
+ pl_rect2d viewport; // screen space viewport (must be normalized)
+ pl_rect2d scissors; // target render scissors (must be normalized)
+
+ // Number of vertices to render
+ int vertex_count;
+
+ // Vertex data may be provided in one of two forms:
+ //
+ // 1. Drawing from host memory directly
+ const void *vertex_data;
+ // 2. Drawing from a vertex buffer (requires `vertex_buf->params.drawable`)
+ pl_buf vertex_buf;
+ size_t buf_offset;
+
+ // (Optional) Index data may be provided in the form given by `index_fmt`.
+ // These will be used for indexed rendering. Similar to vertex data, this
+ // can be provided in two forms:
+ // 1. From host memory
+ const void *index_data;
+ enum pl_index_format index_fmt;
+ // 2. From an index buffer (requires `index_buf->params.drawable`)
+ pl_buf index_buf;
+ size_t index_offset;
+ // Note: Drawing from an index buffer requires vertex data to also be
+ // present in buffer form, i.e. it's forbidden to mix `index_buf` with
+ // `vertex_data` (though vice versa is allowed).
+
+ // --- pass->params.type==PL_PASS_COMPUTE only
+
+ // Number of work groups to dispatch per dimension (X/Y/Z). Must be <= the
+ // corresponding index of limits.max_dispatch
+ int compute_groups[3];
+};
+
+#define pl_pass_run_params(...) (&(struct pl_pass_run_params) { __VA_ARGS__ })
+
+// Execute a render pass.
+PL_API void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params);
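+
+// Continuing the compute pass sketch from `pl_pass_create` above (the 16x16
+// workgroup size and the storable texture `tex` bound as "img" are
+// assumptions of this example):
+//
+//   pl_pass_run(gpu, pl_pass_run_params(
+//       .pass = pass,
+//       .desc_bindings = (struct pl_desc_binding[]) {
+//           { .object = tex },
+//       },
+//       .compute_groups = {
+//           (tex->params.w + 15) / 16,
+//           (tex->params.h + 15) / 16,
+//           1,
+//       },
+//   ));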
+
+// This is semantically a no-op, but it provides a hint that you want to flush
+// any partially queued up commands and begin execution. There is normally no
+// need to call this, because queued commands will always be implicitly flushed
+// whenever necessary to make forward progress on commands like `pl_buf_poll`,
+// or when submitting a frame to a swapchain for display. In fact, calling this
+// function can negatively impact performance, because some GPUs rely on being
+// able to re-order and modify queued commands in order to enable optimizations
+// retroactively.
+//
+// The only time this might be beneficial to call explicitly is if you're doing
+// lots of offline processing, i.e. you aren't rendering to a swapchain but to
+// textures that you download from again. In that case you should call this
+// function after each "work item" to ensure good parallelism between them.
+//
+// It's worth noting that this function may block if you're over-feeding the
+// GPU without waiting for existing results to finish.
+PL_API void pl_gpu_flush(pl_gpu gpu);
+
+// This is like `pl_gpu_flush` but also blocks until the GPU is fully idle
+// before returning. Using this in your rendering loop is seriously disadvised,
+// and almost never the right solution. The intended use case is for deinit
+ // logic, where users may want to force all pending GPU operations to
+// finish so they can clean up their state more easily.
+//
+// After this operation is called, it's guaranteed that all pending buffer
+// operations are complete - i.e. `pl_buf_poll` is guaranteed to return false.
+// It's also guaranteed that any outstanding timer query results are available.
+//
+// Note: If you only care about buffer operations, you can accomplish this more
+// easily by using `pl_buf_poll` with the timeout set to `UINT64_MAX`. But if
+// you have many buffers it may be more convenient to call this function
+// instead. The difference is that this function will also affect e.g. renders
+// to a `pl_swapchain`.
+PL_API void pl_gpu_finish(pl_gpu gpu);
+
+// Returns true if the GPU is considered to be in a "failed" state, which
+// during normal operation is typically the result of things like the device
+// being lost (due to e.g. power management).
+//
+// If this returns true, users *should* destroy and recreate the `pl_gpu`,
+// including all associated resources, via the appropriate mechanism.
+PL_API bool pl_gpu_is_failed(pl_gpu gpu);
+
+
+// Deprecated objects and functions:
+
+// A generic synchronization object intended for use with an external API. This
+// is not required when solely using libplacebo API functions, as all required
+// synchronisation is done internally. This comes in the form of a pair of
+// semaphores - one to synchronize access in each direction.
+//
+// Thread-safety: Unsafe
+typedef const struct pl_sync_t {
+ enum pl_handle_type handle_type;
+
+ // This handle is signalled by the `pl_gpu`, and waited on by the user. It
+ // fires when it is safe for the user to access the shared resource.
+ union pl_handle wait_handle;
+
+ // This handle is signalled by the user, and waited on by the `pl_gpu`. It
+ // must fire when the user has finished accessing the shared resource.
+ union pl_handle signal_handle;
+} *pl_sync;
+
+// Create a synchronization object. Returns NULL on failure.
+//
+// `handle_type` must be exactly *one* of `pl_gpu.export_caps.sync`, and
+// indicates which type of handle to generate for sharing this sync object.
+//
+// Deprecated in favor of API-specific semaphore creation operations such as
+// `pl_vulkan_sem_create`.
+PL_DEPRECATED PL_API pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type);
+
+// Destroy a `pl_sync`. Note that this invalidates the externally imported
+// semaphores. Users should therefore make sure that all operations that
+ // wait on or signal any of the semaphores have been fully submitted and
+// processed by the external API before destroying the `pl_sync`.
+//
+// Despite this, it's safe to destroy a `pl_sync` if the only pending
+// operations that involve it are internal to libplacebo.
+PL_DEPRECATED PL_API void pl_sync_destroy(pl_gpu gpu, pl_sync *sync);
+
+// Initiates a texture export operation, allowing a texture to be accessed by
+// an external API. Returns whether successful. After this operation
+// successfully returns, it is guaranteed that `sync->wait_handle` will
+// eventually be signalled. For APIs where this is relevant, the image layout
+// should be specified as "general", e.g. `GL_LAYOUT_GENERAL_EXT` for OpenGL.
+//
+// There is no corresponding "import" operation - the next operation that uses
+// a texture will implicitly import the texture. Valid API usage requires that
+// the user *must* submit a semaphore signal operation on `sync->signal_handle`
+// before doing so. Not doing so is undefined behavior and may very well
+// deadlock the calling process and/or the graphics card!
+//
+// Note that despite this restriction, it is always valid to call
+// `pl_tex_destroy`, even if the texture is in an exported state, without
+// having to signal the corresponding sync object first.
+//
+// Deprecated in favor of API-specific synchronization mechanisms such as
+// `pl_vulkan_hold/release_ex`.
+PL_DEPRECATED PL_API bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync);
+
+
+PL_API_END
+
+#endif // LIBPLACEBO_GPU_H_
diff --git a/src/include/libplacebo/log.h b/src/include/libplacebo/log.h
new file mode 100644
index 0000000..b24c931
--- /dev/null
+++ b/src/include/libplacebo/log.h
@@ -0,0 +1,113 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_LOG_H_
+#define LIBPLACEBO_LOG_H_
+
+#include <libplacebo/config.h>
+#include <libplacebo/common.h>
+
+PL_API_BEGIN
+
+// The log level associated with a given log message.
+enum pl_log_level {
+ PL_LOG_NONE = 0,
+ PL_LOG_FATAL, // results in total loss of function of a major component
+ PL_LOG_ERR, // serious error; may result in degraded function
+ PL_LOG_WARN, // warning; potentially bad, probably user-relevant
+ PL_LOG_INFO, // informational message, also potentially harmless errors
+ PL_LOG_DEBUG, // verbose debug message, informational
+ PL_LOG_TRACE, // very noisy trace of activity, usually benign
+ PL_LOG_ALL = PL_LOG_TRACE,
+};
+
+struct pl_log_params {
+ // Logging callback. All messages, informational or otherwise, will get
+ // redirected to this callback. The logged messages do not include trailing
+ // newlines. Optional.
+ void (*log_cb)(void *log_priv, enum pl_log_level level, const char *msg);
+ void *log_priv;
+
+ // The current log level. Controls the level of messages that will be
+ // redirected to the log callback. Setting this to PL_LOG_ALL means all
+ // messages will be forwarded, but doing so indiscriminately can result
+ // in increased CPU usage as it may enable extra debug paths based on the
+ // configured log level.
+ enum pl_log_level log_level;
+};
+
+#define pl_log_params(...) (&(struct pl_log_params) { __VA_ARGS__ })
+PL_API extern const struct pl_log_params pl_log_default_params;
+
+// Thread-safety: Safe
+//
+// Note: In any context in which `pl_log` is used, users may also pass NULL
+// to disable logging. In other words, NULL is a valid `pl_log`.
+typedef const struct pl_log_t {
+ struct pl_log_params params;
+} *pl_log;
+
+#define pl_log_glue1(x, y) x##y
+#define pl_log_glue2(x, y) pl_log_glue1(x, y)
+// Force a link error in the case of linking against an incompatible API
+// version.
+#define pl_log_create pl_log_glue2(pl_log_create_, PL_API_VER)
+// Creates a pl_log. `api_ver` exists for historical reasons and is currently ignored.
+// `params` defaults to `&pl_log_default_params` if left as NULL.
+//
+// Note: As a general rule, any `params` struct used as an argument to a
+// function need only live until the corresponding function returns.
+PL_API pl_log pl_log_create(int api_ver, const struct pl_log_params *params);
+
+// Destroy a `pl_log` object.
+//
+// Note: As a general rule, all `_destroy` functions take the pointer to the
+// object to free as their parameter. This pointer is overwritten by NULL
+// afterwards. Calling a _destroy function on &{NULL} is valid, but calling it
+// on NULL itself is invalid.
+PL_API void pl_log_destroy(pl_log *log);
+
+// Update the parameters of a `pl_log` without destroying it. This can be
+// used to change the log function, log context or log level retroactively.
+// `params` defaults to `&pl_log_default_params` if left as NULL.
+//
+// Returns the previous params, atomically.
+PL_API struct pl_log_params pl_log_update(pl_log log, const struct pl_log_params *params);
+
+// Like `pl_log_update` but only updates the log level, leaving the log
+// callback intact.
+//
+// Returns the previous log level, atomically.
+PL_API enum pl_log_level pl_log_level_update(pl_log log, enum pl_log_level level);
+
+// Two simple, stream-based loggers. You can use these as the log_cb. If you
+// also set log_priv to a FILE* (e.g. stdout or stderr) it will be printed
+// there; otherwise, it will be printed to stdout or stderr depending on the
+// log level.
+//
+// The version with colors will use ANSI escape sequences to indicate the log
+// level. The version without will use explicit prefixes.
+PL_API void pl_log_simple(void *stream, enum pl_log_level level, const char *msg);
+PL_API void pl_log_color(void *stream, enum pl_log_level level, const char *msg);
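+
+// A minimal usage sketch, logging colorized messages to stderr at the
+// informational level:
+//
+//   pl_log log = pl_log_create(PL_API_VER, pl_log_params(
+//       .log_cb    = pl_log_color,
+//       .log_priv  = stderr,
+//       .log_level = PL_LOG_INFO,
+//   ));
+//   ...
+//   pl_log_destroy(&log);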
+
+// Backwards compatibility with older versions of libplacebo
+#define pl_context pl_log
+#define pl_context_params pl_log_params
+
+PL_API_END
+
+#endif // LIBPLACEBO_LOG_H_
diff --git a/src/include/libplacebo/meson.build b/src/include/libplacebo/meson.build
new file mode 100644
index 0000000..2f4631e
--- /dev/null
+++ b/src/include/libplacebo/meson.build
@@ -0,0 +1,6 @@
+sources += configure_file(
+ input: 'config.h.in',
+ output: 'config.h',
+ install_dir: get_option('includedir') / meson.project_name(),
+ configuration: conf_public,
+)
diff --git a/src/include/libplacebo/opengl.h b/src/include/libplacebo/opengl.h
new file mode 100644
index 0000000..46597b2
--- /dev/null
+++ b/src/include/libplacebo/opengl.h
@@ -0,0 +1,230 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_OPENGL_H_
+#define LIBPLACEBO_OPENGL_H_
+
+#include <string.h>
+
+#include <libplacebo/gpu.h>
+#include <libplacebo/swapchain.h>
+
+PL_API_BEGIN
+
+// Note on thread safety: The thread safety of `pl_opengl` and any associated
+// GPU objects follows the same thread safety rules as the underlying OpenGL
+// context. In other words, they must only be called from the thread the OpenGL
+// context is current on.
+
+typedef const struct pl_opengl_t {
+ pl_gpu gpu;
+
+ // Detected GL version
+ int major, minor;
+
+ // List of GL/EGL extensions, provided for convenience
+ const char * const *extensions;
+ int num_extensions;
+} *pl_opengl;
+
+static inline bool pl_opengl_has_ext(pl_opengl gl, const char *ext)
+{
+ for (int i = 0; i < gl->num_extensions; i++)
+ if (!strcmp(ext, gl->extensions[i]))
+ return true;
+ return false;
+}
+
+typedef void (*pl_voidfunc_t)(void);
+
+struct pl_opengl_params {
+ // Main gl*GetProcAddr function. This will be used to load all GL/EGL
+ // functions. Optional - if unspecified, libplacebo will default to an
+ // internal loading logic which should work on most platforms.
+ pl_voidfunc_t (*get_proc_addr_ex)(void *proc_ctx, const char *procname);
+ void *proc_ctx;
+
+ // Simpler API for backwards compatibility / convenience. (This one
+ // directly matches the signature of most gl*GetProcAddr library functions)
+ pl_voidfunc_t (*get_proc_addr)(const char *procname);
+
+ // Enable OpenGL debug report callbacks. May have little effect depending
+ // on whether or not the GL context was initialized with appropriate
+ // debugging enabled.
+ bool debug;
+
+ // Allow the use of (suspected) software rasterizers and renderers. These
+ // can be useful for debugging purposes, but normally, their use is
+ // undesirable when GPU-accelerated processing is expected.
+ bool allow_software;
+
+ // Restrict the maximum allowed GLSL version. (Mainly for testing)
+ int max_glsl_version;
+
+ // Optional. Required when importing/exporting dmabufs as textures.
+ void *egl_display;
+ void *egl_context;
+
+ // Optional callbacks to bind/release the OpenGL context on the current
+ // thread. If these are specified, then the resulting `pl_gpu` will have
+ // `pl_gpu_limits.thread_safe` enabled, and may therefore be used from any
+ // thread without first needing to bind the OpenGL context.
+ //
+ // If the user is re-using the same OpenGL context in non-libplacebo code,
+ // then these callbacks should include whatever synchronization is
+ // necessary to prevent simultaneous use between libplacebo and the user.
+ bool (*make_current)(void *priv);
+ void (*release_current)(void *priv);
+ void *priv;
+};
+
+// Default/recommended parameters
+#define pl_opengl_params(...) (&(struct pl_opengl_params) { __VA_ARGS__ })
+PL_API extern const struct pl_opengl_params pl_opengl_default_params;
+
+// Creates a new OpenGL renderer based on the given parameters. This will
+// internally use whatever platform-defined mechanism (WGL, X11, EGL) is
+// appropriate for loading the OpenGL function calls, so the user doesn't need
+// to pass in a `getProcAddress` callback. If `params` is left as NULL, it
+// defaults to `&pl_opengl_default_params`. The context must be active when
+// calling this function, and must remain active whenever calling any
+// libplacebo function on the resulting `pl_opengl` or `pl_gpu`.
+//
+// Note that creating multiple `pl_opengl` instances from the same OpenGL
+// context is undefined behavior.
+PL_API pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params);
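+
+// As a rough usage sketch (the `log` object and the windowing system's
+// `my_get_proc_addr` loader are assumptions of this example):
+//
+//   pl_opengl gl = pl_opengl_create(log, pl_opengl_params(
+//       .get_proc_addr  = my_get_proc_addr, // e.g. from SDL / GLFW / EGL
+//       .allow_software = true,
+//   ));
+//   pl_gpu gpu = gl->gpu; // (assuming creation succeeded)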
+
+// All resources allocated from the `pl_gpu` contained by this `pl_opengl` must
+// be explicitly destroyed by the user before calling `pl_opengl_destroy`.
+PL_API void pl_opengl_destroy(pl_opengl *gl);
+
+// For a `pl_gpu` backed by `pl_opengl`, this function can be used to retrieve
+// the underlying `pl_opengl`. Returns NULL for any other type of `gpu`.
+PL_API pl_opengl pl_opengl_get(pl_gpu gpu);
+
+struct pl_opengl_framebuffer {
+ // ID of the framebuffer, or 0 to use the context's default framebuffer.
+ int id;
+
+ // If true, then the framebuffer is assumed to be "flipped" relative to
+ // normal GL semantics, i.e. set this to `true` if the first pixel is the
+ // top left corner.
+ bool flipped;
+};
+
+struct pl_opengl_swapchain_params {
+ // Set this to the platform-specific function to swap buffers, e.g.
+ // glXSwapBuffers, eglSwapBuffers etc. This will be called internally by
+ // `pl_swapchain_swap_buffers`. Required, unless you never call that
+ // function.
+ void (*swap_buffers)(void *priv);
+
+ // Initial framebuffer description. This can be changed later on using
+ // `pl_opengl_swapchain_update_fb`.
+ struct pl_opengl_framebuffer framebuffer;
+
+ // Attempt forcing a specific latency. If this is nonzero, then
+ // `pl_swapchain_swap_buffers` will wait until fewer than N frames are "in
+ // flight" before returning. Setting this to a high number generally
+ // accomplishes nothing, because the OpenGL driver typically limits the
+ // number of buffers on its own. But setting it to a low number like 2 or
+ // even 1 can reduce latency (at the cost of throughput).
+ int max_swapchain_depth;
+
+ // Arbitrary user pointer that gets passed to `swap_buffers` etc.
+ void *priv;
+};
+
+#define pl_opengl_swapchain_params(...) (&(struct pl_opengl_swapchain_params) { __VA_ARGS__ })
+
+// Creates an instance of `pl_swapchain` tied to the active context.
+// Note: Due to OpenGL semantics, users *must* call `pl_swapchain_resize`
+// before attempting to use this swapchain, otherwise calls to
+// `pl_swapchain_start_frame` will fail.
+PL_API pl_swapchain pl_opengl_create_swapchain(pl_opengl gl,
+ const struct pl_opengl_swapchain_params *params);
+
+// Update the framebuffer description. After calling this function, users
+// *must* call `pl_swapchain_resize` before attempting to use the swapchain
+// again, otherwise calls to `pl_swapchain_start_frame` will fail.
+PL_API void pl_opengl_swapchain_update_fb(pl_swapchain sw,
+ const struct pl_opengl_framebuffer *fb);
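+
+// Rough usage sketch (the `my_swap_buffers` callback, window handle and
+// dimensions are assumptions of this example; `pl_swapchain_resize` is
+// declared in <libplacebo/swapchain.h>):
+//
+//   pl_swapchain sw = pl_opengl_create_swapchain(gl, pl_opengl_swapchain_params(
+//       .swap_buffers = my_swap_buffers, // e.g. wraps SDL_GL_SwapWindow
+//       .priv         = my_window,
+//   ));
+//   int w = window_w, h = window_h;
+//   pl_swapchain_resize(sw, &w, &h);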
+
+struct pl_opengl_wrap_params {
+ // The GLuint texture object itself. Optional. If no texture is provided,
+ // then only the opaque `framebuffer` will be wrapped, leaving the
+ // resulting `pl_tex` object with some operations (such as sampling) being
+ // unsupported.
+ unsigned int texture;
+
+ // The GLuint associated framebuffer. Optional. If this is not specified,
+ // then libplacebo will attempt creating a framebuffer from the provided
+ // texture object (if possible).
+ //
+ // Note: As a special case, if neither a texture nor an FBO are provided,
+ // this is equivalent to wrapping the OpenGL default framebuffer (id 0).
+ unsigned int framebuffer;
+
+ // The image's dimensions (unused dimensions must be 0)
+ int width;
+ int height;
+ int depth;
+
+ // Texture-specific fields:
+ //
+ // Note: These are only relevant if `texture` is provided.
+
+ // The GLenum for the texture target to use, e.g. GL_TEXTURE_2D. Optional.
+ // If this is left as 0, the target is inferred from the number of
+ // dimensions. Users may want to set this to something specific like
+ // GL_TEXTURE_EXTERNAL_OES depending on the nature of the texture.
+ unsigned int target;
+
+ // The texture's GLint sized internal format (e.g. GL_RGBA16F). Required.
+ int iformat;
+};
+
+#define pl_opengl_wrap_params(...) (&(struct pl_opengl_wrap_params) { __VA_ARGS__ })
+
+// Wraps an external OpenGL object into a `pl_tex` abstraction. Due to the
+// internally synchronized nature of OpenGL, no explicit synchronization
+// is needed between libplacebo `pl_tex_` operations, and host accesses to
+// the texture. Wrapping the same OpenGL texture multiple times is permitted.
+// Note that this function transfers no ownership.
+//
+// This wrapper can be destroyed by simply calling `pl_tex_destroy` on it,
+// which will *not* destroy the user-provided OpenGL texture or framebuffer.
+//
+// This function may fail, in which case it returns NULL.
+PL_API pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params);
+
+// Analogous to `pl_opengl_wrap`, this function takes any `pl_tex` (including
+// ones created by `pl_tex_create`) and unwraps it to expose the underlying
+// OpenGL texture to the user. Note that this function transfers no ownership,
+// i.e. the texture object and framebuffer shall not be destroyed by the user.
+//
+// Returns the OpenGL texture. `out_target` and `out_iformat` will be updated
+// to hold the target type and internal format, respectively. (Optional)
+//
+// For renderable/blittable textures, `out_fbo` will be updated to the ID of
+// the framebuffer attached to this texture, or 0 if there is none. (Optional)
+PL_API unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex, unsigned int *out_target,
+ int *out_iformat, unsigned int *out_fbo);
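+
+// Rough usage sketch for wrapping and unwrapping (the GLuint `gl_tex` and its
+// dimensions / internal format are assumptions of this example):
+//
+//   pl_tex tex = pl_opengl_wrap(gpu, pl_opengl_wrap_params(
+//       .texture = gl_tex,
+//       .width   = 1920,
+//       .height  = 1080,
+//       .iformat = GL_RGBA8,
+//   ));
+//
+//   unsigned int target, fbo;
+//   int iformat;
+//   unsigned int texture = pl_opengl_unwrap(gpu, tex, &target, &iformat, &fbo);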
+
+PL_API_END
+
+#endif // LIBPLACEBO_OPENGL_H_
diff --git a/src/include/libplacebo/options.h b/src/include/libplacebo/options.h
new file mode 100644
index 0000000..e40f5e7
--- /dev/null
+++ b/src/include/libplacebo/options.h
@@ -0,0 +1,201 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_OPTIONS_H_
+#define LIBPLACEBO_OPTIONS_H_
+
+#include <libplacebo/renderer.h>
+
+PL_API_BEGIN
+
+// High-level heap-managed struct containing storage for all options implied by
+// pl_render_params, including a high-level interface for serializing,
+// deserializing and interfacing with them in a programmatic way.
+
+typedef const struct pl_opt_t *pl_opt;
+typedef struct pl_options_t {
+ // Non-NULL `params.*_params` pointers must always point into this struct
+ struct pl_render_params params;
+
+ // Backing storage for all of the various rendering parameters. Whether
+ // or not these params are active is determined by whether or not
+ // `params.*_params` is set to this address or NULL.
+ struct pl_deband_params deband_params;
+ struct pl_sigmoid_params sigmoid_params;
+ struct pl_color_adjustment color_adjustment;
+ struct pl_peak_detect_params peak_detect_params;
+ struct pl_color_map_params color_map_params;
+ struct pl_dither_params dither_params;
+ struct pl_icc_params icc_params PL_DEPRECATED;
+ struct pl_cone_params cone_params;
+ struct pl_blend_params blend_params;
+ struct pl_deinterlace_params deinterlace_params;
+ struct pl_distort_params distort_params;
+
+ // Backing storage for "custom" scalers. `params.upscaler` etc. will
+ // always be a pointer either to a built-in pl_filter_config, or one of
+ // these structs. `name`, `description` and `allowed` will always be
+ // valid for the respective type of filter config.
+ struct pl_filter_config upscaler;
+ struct pl_filter_config downscaler;
+ struct pl_filter_config plane_upscaler;
+ struct pl_filter_config plane_downscaler;
+ struct pl_filter_config frame_mixer;
+} *pl_options;
+
+// Allocate a new set of render params, with internally backed storage for
+// all parameters. Initialized to an "empty" config (PL_RENDER_DEFAULTS),
+// equivalent to `&pl_render_fast_params`. To initialize the struct instead to
+// the recommended default parameters, use `pl_options_reset` with
+// `pl_render_default_params`.
+//
+// If `log` is provided, errors related to parsing etc. will be logged there.
+PL_API pl_options pl_options_alloc(pl_log log);
+PL_API void pl_options_free(pl_options *opts);
+
+// Resets all options to their default values from a given struct. If `preset`
+// is NULL, `opts` is instead reset back to the initial "empty" configuration,
+// with all options disabled, as if it was freshly allocated.
+//
+// Note: This function will also reset structs which were not included in
+// `preset`, such as any custom upscalers.
+PL_API void pl_options_reset(pl_options opts, const struct pl_render_params *preset);
+
+typedef const struct pl_opt_data_t {
+ // Original options struct.
+ pl_options opts;
+
+ // Triggering option for this callback invocation.
+ pl_opt opt;
+
+ // The raw data associated with this option. Always some pointer into
+ // `opts`. Note that only PL_OPT_BOOL, PL_OPT_INT and PL_OPT_FLOAT have
+ // a fixed representation; for other fields, its usefulness is dubious.
+ const void *value;
+
+ // The underlying data, as a formatted, locale-invariant string. Lifetime
+ // is limited until the return of this callback.
+ const char *text;
+} *pl_opt_data;
+
+// Query a single option from `opts` by key, or NULL if none was found.
+// The resulting pointer is only valid until the next pl_options_* call.
+PL_API pl_opt_data pl_options_get(pl_options opts, const char *key);
+
+// Update an option from a formatted value string (see `pl_opt_data.text`).
+// This can be used for all types of options, even non-string ones. In this case,
+// `value` will be parsed according to the option type.
+//
+// Returns whether successful.
+PL_API bool pl_options_set_str(pl_options opts, const char *key, const char *value);
+
+// Programmatically iterate over options set in a `pl_options`, running the
+// provided callback on each entry.
+PL_API void pl_options_iterate(pl_options opts,
+ void (*cb)(void *priv, pl_opt_data data),
+ void *priv);
+
+// Serialize a `pl_options` struct to a comma-separated key/value string. The
+// returned string has a lifetime valid until either the next call to
+// `pl_options_save`, or until the `pl_options` is freed.
+PL_API const char *pl_options_save(pl_options opts);
+
+// Parse a `pl_options` struct from a key/value string, in standard syntax
+// "key1=value1,key2=value2,...", and updates `opts` with the new values.
+// Valid separators include whitespace, commas (,) and (semi)colons (:;).
+//
+// Returns true if no errors occurred.
+PL_API bool pl_options_load(pl_options opts, const char *str);
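+
+// As a rough usage sketch (the option keys/values shown here are assumptions
+// of this example, not a guaranteed part of the option list):
+//
+//   pl_options opts = pl_options_alloc(log);
+//   pl_options_reset(opts, &pl_render_default_params);
+//   if (!pl_options_load(opts, "deband=yes,upscaler=lanczos"))
+//       /* handle parse error */;
+//   // `&opts->params` can now be passed wherever a `pl_render_params` is needed
+//   pl_options_free(&opts);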
+
+// Helpers for interfacing with `opts->params.hooks`. Note that using any of
+// these helpers will overwrite the array pointer with an internally managed one,
+// so care must be taken when combining them with external management of
+// this memory. Negative indices are possible and are counted relative to the
+// end of the list.
+//
+// Note: These hooks are *not* included in pl_options_save() and related.
+PL_API void pl_options_add_hook(pl_options opts, const struct pl_hook *hook);
+PL_API void pl_options_insert_hook(pl_options opts, const struct pl_hook *hook, int idx);
+PL_API void pl_options_remove_hook_at(pl_options opts, int idx);
+
+// Underlying options system and list
+//
+// Note: By necessity, this option list does not cover every single field
+// present in `pl_render_params`. In particular, fields like `info_callback`,
+// `lut` and `hooks` cannot be configured through the options system, as doing
+// so would require interop with C code or I/O. (However, see
+// `pl_options_add_hook` and related)
+
+enum pl_option_type {
+ // Accepts `yes/no`, `on/off`, `true/false` and variants
+ PL_OPT_BOOL,
+
+ // Parsed as human-readable locale-invariant (C) numbers, scientific
+ // notation accepted for floats
+ PL_OPT_INT,
+ PL_OPT_FLOAT,
+
+ // Parsed as a short string containing only alphanumerics and _-,
+ // corresponding to some name/identifier. Catch-all bucket for several
+ // other types of options, such as presets, struct pointers, and functions
+ //
+ // Note: These options do not correspond to actual strings in C; the
+ // underlying type of option will determine the values of `size` and
+ // corresponding interpretation of pointers.
+ PL_OPT_STRING,
+
+ PL_OPT_TYPE_COUNT,
+};
+
+struct pl_opt_t {
+ // Programmatic key uniquely identifying this option.
+ const char *key;
+
+ // Longer, human readable friendly name
+ const char *name;
+
+ // Data type of the option; affects how it is parsed. This field is purely
+ // informative for the user; the actual implementation may vary.
+ enum pl_option_type type;
+
+ // Minimum/maximum value ranges for numeric options (int / float)
+ // If both are 0.0, these limits are disabled/ignored.
+ float min, max;
+
+ // If true, this option is considered deprecated and may be removed
+ // in the future.
+ bool deprecated;
+
+ // If true, this option is considered a 'preset' (read-only), which can
+ // be loaded but not saved. (The equivalent underlying options this preset
+ // corresponds to will be saved instead)
+ bool preset;
+
+ // Internal implementation details (for parsing/saving), opaque to user
+ const void *priv;
+};
+
+// A list of options, terminated by {0} for convenience
+PL_API extern const struct pl_opt_t pl_option_list[];
+PL_API extern const int pl_option_count; // excluding terminating {0}
+
+// Returns the `pl_option` associated with a given key, or NULL
+PL_API pl_opt pl_find_option(const char *key);
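+
+// For example, a rough sketch of enumerating the list and looking up a single
+// option (the "brightness" key is an assumption of this example):
+//
+//   for (int i = 0; i < pl_option_count; i++)
+//       printf("%s\n", pl_option_list[i].key);
+//
+//   pl_opt opt = pl_find_option("brightness");
+//   if (opt && opt->type == PL_OPT_FLOAT)
+//       printf("range: [%f, %f]\n", opt->min, opt->max);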
+
+PL_API_END
+
+#endif // LIBPLACEBO_OPTIONS_H_
diff --git a/src/include/libplacebo/renderer.h b/src/include/libplacebo/renderer.h
new file mode 100644
index 0000000..d2e01e4
--- /dev/null
+++ b/src/include/libplacebo/renderer.h
@@ -0,0 +1,847 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_RENDERER_H_
+#define LIBPLACEBO_RENDERER_H_
+
+#include <libplacebo/config.h>
+#include <libplacebo/colorspace.h>
+#include <libplacebo/filters.h>
+#include <libplacebo/gpu.h>
+#include <libplacebo/shaders/colorspace.h>
+#include <libplacebo/shaders/deinterlacing.h>
+#include <libplacebo/shaders/dithering.h>
+#include <libplacebo/shaders/film_grain.h>
+#include <libplacebo/shaders/icc.h>
+#include <libplacebo/shaders/lut.h>
+#include <libplacebo/shaders/sampling.h>
+#include <libplacebo/shaders/custom.h>
+#include <libplacebo/swapchain.h>
+
+PL_API_BEGIN
+
+// Thread-safety: Unsafe
+typedef struct pl_renderer_t *pl_renderer;
+
+// Enum values used in `pl_render_errors` as bit positions for error flags
+enum pl_render_error {
+ PL_RENDER_ERR_NONE = 0,
+ PL_RENDER_ERR_FBO = 1 << 0,
+ PL_RENDER_ERR_SAMPLING = 1 << 1,
+ PL_RENDER_ERR_DEBANDING = 1 << 2,
+ PL_RENDER_ERR_BLENDING = 1 << 3,
+ PL_RENDER_ERR_OVERLAY = 1 << 4,
+ PL_RENDER_ERR_PEAK_DETECT = 1 << 5,
+ PL_RENDER_ERR_FILM_GRAIN = 1 << 6,
+ PL_RENDER_ERR_FRAME_MIXING = 1 << 7,
+ PL_RENDER_ERR_DEINTERLACING = 1 << 8,
+ PL_RENDER_ERR_ERROR_DIFFUSION = 1 << 9,
+ PL_RENDER_ERR_HOOKS = 1 << 10,
+ PL_RENDER_ERR_CONTRAST_RECOVERY = 1 << 11,
+};
+
+// Struct describing current renderer state, including internal processing errors,
+// as well as list of signatures of disabled hooks.
+struct pl_render_errors {
+ enum pl_render_error errors;
+ // List containing signatures of disabled hooks
+ const uint64_t *disabled_hooks;
+ int num_disabled_hooks;
+};
+
+// Creates a new renderer object, which is backed by a GPU context. This is a
+// high-level object that takes care of the rendering chain as a whole, from
+// the source textures to the finished frame.
+PL_API pl_renderer pl_renderer_create(pl_log log, pl_gpu gpu);
+PL_API void pl_renderer_destroy(pl_renderer *rr);
+
+// Returns current renderer state, see pl_render_errors.
+PL_API struct pl_render_errors pl_renderer_get_errors(pl_renderer rr);
+
+// Clears the error state of the renderer. If `errors` is NULL, all render
+// errors will be cleared. Otherwise, only the selected errors/hooks will be
+// cleared. If `PL_RENDER_ERR_HOOKS` is set and `num_disabled_hooks` is 0, all
+// hooks are cleared. Otherwise, only the hooks listed in the `disabled_hooks`
+// array are cleared.
+PL_API void pl_renderer_reset_errors(pl_renderer rr,
+ const struct pl_render_errors *errors);
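+
+// As a rough usage sketch (resetting a single, arbitrarily chosen error class
+// for the sake of the example):
+//
+//   pl_renderer rr = pl_renderer_create(log, gpu);
+//   ...
+//   struct pl_render_errors err = pl_renderer_get_errors(rr);
+//   if (err.errors & PL_RENDER_ERR_PEAK_DETECT) {
+//       // e.g. re-enable peak detection after adjusting settings
+//       pl_renderer_reset_errors(rr, &(struct pl_render_errors) {
+//           .errors = PL_RENDER_ERR_PEAK_DETECT,
+//       });
+//   }
+//   pl_renderer_destroy(&rr);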
+
+enum pl_lut_type {
+ PL_LUT_UNKNOWN = 0,
+ PL_LUT_NATIVE, // applied to raw image contents (after fixing bit depth)
+ PL_LUT_NORMALIZED, // applied to normalized (HDR) RGB values
+ PL_LUT_CONVERSION, // LUT fully replaces color conversion
+
+ // Note: When using a PL_LUT_CONVERSION to replace the YUV->RGB conversion,
+ // `pl_render_params.color_adjustment` is no longer applied. Similarly,
+ // when using a PL_LUT_CONVERSION to replace the image->target color space
+ // conversion, `pl_render_params.color_map_params` are ignored.
+ //
+ // Note: For LUTs attached to the output frame, PL_LUT_CONVERSION should
+ // instead perform the inverse (RGB->native) conversion.
+ //
+ // Note: PL_LUT_UNKNOWN tries inferring the meaning of the LUT from the
+ // LUT's tagged metadata, and otherwise falls back to PL_LUT_NATIVE.
+};
+
+enum pl_render_stage {
+ PL_RENDER_STAGE_FRAME, // full frame redraws, for fresh/uncached frames
+ PL_RENDER_STAGE_BLEND, // the output blend pass (only for pl_render_image_mix)
+ PL_RENDER_STAGE_COUNT,
+};
+
+struct pl_render_info {
+ const struct pl_dispatch_info *pass; // information about the shader
+ enum pl_render_stage stage; // the associated render stage
+
+ // This specifies the chronological index of this pass within the frame and
+ // stage (starting at `index == 0`).
+ int index;
+
+ // For PL_RENDER_STAGE_BLEND, this specifies the number of frames
+ // being blended (since that results in a different shader).
+ int count;
+};
+
+// Represents the options used for rendering. These affect the quality of
+// the result.
+struct pl_render_params {
+ // Configures the algorithms used for upscaling and downscaling,
+ // respectively. If left as NULL, then libplacebo will only use inexpensive
+ // sampling (bilinear or nearest neighbour depending on the capabilities
+ // of the hardware / texture).
+ //
+ // Note: Setting `downscaler` to NULL also implies `skip_anti_aliasing`,
+ // since the built-in GPU sampling algorithms can't anti-alias.
+ //
+ // Note: If set to the same address as the built-in `pl_filter_bicubic`,
+ // `pl_filter_nearest` etc.; libplacebo will also use the more efficient
+ // direct sampling algorithm where possible without quality loss.
+ const struct pl_filter_config *upscaler;
+ const struct pl_filter_config *downscaler;
+
+ // If set, this overrides the value of `upscaler`/`downscaler` for
+ // subsampled (chroma) planes. These scalers are used whenever the size of
+ // multiple different `pl_plane`s in a single `pl_frame` differ, requiring
+ // adaptation when converting to/from RGB. Note that a value of NULL simply
+ // means "no override". To force built-in scaling explicitly, set this to
+ // `&pl_filter_bilinear`.
+ const struct pl_filter_config *plane_upscaler;
+ const struct pl_filter_config *plane_downscaler;
+
+ // The anti-ringing strength to apply to filters. See the equivalent option
+ // in `pl_sample_filter_params` for more information.
+ float antiringing_strength;
+
+ // Configures the algorithm used for frame mixing (when using
+ // `pl_render_image_mix`). Ignored otherwise. As a special requirement,
+ // this must be a filter config with `polar` set to false, since it's only
+ // used for 1D mixing and thus only 1D filters are compatible.
+ //
+ // If set to NULL, frame mixing is disabled, in which case
+ // `pl_render_image_mix` will use nearest-neighbour semantics. (Note that
+ // this still goes through the redraw cache, unless you also enable
+ // `skip_caching_single_frame`)
+ const struct pl_filter_config *frame_mixer;
+
+ // Configures the settings used to deband source textures. Leaving this as
+ // NULL disables debanding.
+ //
+ // Note: The `deband_params.grain` setting is automatically adjusted to
+ // prevent blowing up on HDR sources. The user need not account for this.
+ const struct pl_deband_params *deband_params;
+
+ // Configures the settings used to sigmoidize the image before upscaling.
+ // This is not always used. If NULL, disables sigmoidization.
+ const struct pl_sigmoid_params *sigmoid_params;
+
+ // Configures the color adjustment parameters used to decode the color.
+ // This can be used to apply additional artistic settings such as
+ // desaturation, etc. If NULL, defaults to &pl_color_adjustment_neutral.
+ const struct pl_color_adjustment *color_adjustment;
+
+ // Configures the settings used to detect the peak of the source content,
+ // for HDR sources. Has no effect on SDR content. If NULL, peak detection
+ // is disabled.
+ const struct pl_peak_detect_params *peak_detect_params;
+
+ // Configures the settings used to tone map from HDR to SDR, or from higher
+ // gamut to standard gamut content. If NULL, defaults to
+ // `&pl_color_map_default_params`.
+ const struct pl_color_map_params *color_map_params;
+
+ // Configures the settings used to dither to the output depth. Leaving this
+ // as NULL disables dithering.
+ const struct pl_dither_params *dither_params;
+
+ // Configures the error diffusion kernel to use for error diffusion
+ // dithering. If set, this will be used instead of `dither_params` whenever
+ // possible. Leaving this as NULL disables error diffusion.
+ const struct pl_error_diffusion_kernel *error_diffusion;
+
+ // Configures the settings used to simulate color blindness, if desired.
+ // If NULL, this feature is disabled.
+ const struct pl_cone_params *cone_params;
+
+ // Configures output blending. When rendering to the final target, the
+ // framebuffer contents will be blended using this blend mode. Requires
+ // that the target format has PL_FMT_CAP_BLENDABLE. NULL disables blending.
+ const struct pl_blend_params *blend_params;
+
+ // Configures the settings used to deinterlace frames (see
+ // `pl_frame.field`), if required. If NULL, deinterlacing is "disabled",
+ // meaning interlaced frames are rendered as weaved frames instead.
+ //
+ // Note: As a consequence of how `pl_frame` represents individual fields,
+ // and especially when using the `pl_queue`, this will still result in
+ // frames being redundantly rendered twice. As such, it's highly
+ // recommended to, instead, fully disable deinterlacing by not marking
+ // source frames as interlaced in the first place.
+ const struct pl_deinterlace_params *deinterlace_params;
+
+ // If set, applies an extra distortion matrix to the image, after
+ // scaling and before presenting it to the screen. Can be used for e.g.
+ // fractional rotation.
+ //
+ // Note: The distortion canvas will be set to the size of `target->crop`,
+ // so this cannot effectively draw outside the specified target area,
+ // nor change the aspect ratio of the image.
+ const struct pl_distort_params *distort_params;
+
+ // List of custom user shaders / hooks.
+ // See <libplacebo/shaders/custom.h> for more information.
+ const struct pl_hook * const *hooks;
+ int num_hooks;
+
+ // Color mapping LUT. If present, this will be applied as part of the
+ // image being rendered, in normalized RGB space.
+ //
+ // Note: In this context, PL_LUT_NATIVE means "gamma light" and
+ // PL_LUT_NORMALIZED means "linear light". For HDR signals, normalized LUTs
+ // are scaled so 1.0 corresponds to the `pl_color_transfer_nominal_peak`.
+ //
+ // Note: A PL_LUT_CONVERSION fully replaces the color adaptation from
+ // `image` to `target`, including any tone-mapping (if necessary) and ICC
+ // profiles. It has the same representation as PL_LUT_NATIVE, so in this
+ // case the input and output are (respectively) non-linear light RGB.
+ const struct pl_custom_lut *lut;
+ enum pl_lut_type lut_type;
+
+ // If the image being rendered does not span the entire size of the target,
+ // it will be cleared explicitly using this background color (RGB). To
+ // disable this logic, set `skip_target_clearing`.
+ float background_color[3];
+ float background_transparency; // 0.0 for opaque, 1.0 for fully transparent
+ bool skip_target_clearing;
+
+ // If set to a value above 0.0, the output will be rendered with rounded
+ // corners, as if an alpha transparency mask had been applied. The value
+ // indicates the relative fraction of the side length to round - a value
+ // of 1.0 rounds the corners as much as possible.
+ float corner_rounding;
+
+ // If true, then transparent images will be made opaque by painting them
+ // against a checkerboard pattern consisting of alternating colors. If both
+ // colors are left as {0}, they default respectively to 93% and 87% gray.
+ bool blend_against_tiles;
+ float tile_colors[2][3];
+ int tile_size;
+
+ // --- Performance / quality trade-off options:
+ // These should generally be left off where quality is desired, as they can
+ // degrade the result quite noticeably; but may be useful for older or
+ // slower hardware. Note that libplacebo will automatically disable
+ // advanced features on hardware where they are unsupported, regardless of
+ // these settings. So only enable them if you need a performance bump.
+
+ // Disables anti-aliasing on downscaling. This will result in moiré
+ // artifacts and nasty, jagged pixels when downscaling, except for some
+ // very limited special cases (e.g. bilinear downsampling to exactly 0.5x).
+ //
+ // Significantly speeds up downscaling with high downscaling ratios.
+ bool skip_anti_aliasing;
+
+ // Normally, when the size of the `target` used with `pl_render_image_mix`
+ // changes, or the render parameters are updated, the internal cache of
+ // mixed frames must be discarded in order to re-render all required
+ // frames. Setting this option to `true` will skip the cache invalidation
+ // and instead re-use the existing frames (with bilinear scaling to the new
+ // size if necessary), which comes at a quality loss shortly after a
+ // resize, but should make resizing much smoother.
+ bool preserve_mixing_cache;
+
+ // --- Performance tuning / debugging options
+ // These may affect performance or may make debugging problems easier,
+ // but shouldn't have any effect on the quality.
+
+ // Normally, `pl_render_image_mix` will also push single frames through the
+ // mixer cache, in order to speed up re-draws. Enabling this option
+ // disables that logic, causing single frames to bypass the cache. (Though
+ // it will still read from the cache if the frame happens to already be cached.)
+ bool skip_caching_single_frame;
+
+ // Disables linearization / sigmoidization before scaling. This might be
+ // useful when tracking down unexpected image artifacts or excessive
+ // ringing, but it shouldn't normally be necessary.
+ bool disable_linear_scaling;
+
+ // Forces the use of the "general" scaling algorithms even when using the
+ // special-cased built-in presets like `pl_filter_bicubic`. Basically, this
+ // disables the more efficient implementations in favor of the slower,
+ // general-purpose ones.
+ bool disable_builtin_scalers;
+
+ // Forces correction of subpixel offsets (using the configured `upscaler`).
+ bool correct_subpixel_offsets;
+
+ // Forces the use of dithering, even when rendering to 16-bit FBOs. This is
+ // generally pretty pointless because most 16-bit FBOs have high enough
+ // depth that rounding errors are below the human perception threshold,
+ // but this can be used to test the dither code.
+ bool force_dither;
+
+ // Disables the gamma-correct dithering logic which normally applies when
+ // dithering to low bit depths. No real use, outside of testing.
+ bool disable_dither_gamma_correction;
+
+ // Completely overrides the use of FBOs, as if there were no renderable
+ // texture format available. This disables most features.
+ bool disable_fbos;
+
+ // Use only low-bit-depth FBOs (8 bits). Note that this also implies
+ // disabling linear scaling and sigmoidization.
+ bool force_low_bit_depth_fbos;
+
+ // If this is true, all shaders will be generated as "dynamic" shaders,
+ // with any compile-time constants being replaced by runtime-adjustable
+ // values. This is generally a performance loss, but has the advantage of
+ // being able to freely change parameters without triggering shader
+ // recompilations.
+ //
+ // It's a good idea to enable while presenting configurable settings to the
+ // user, but it should be set to false once those values are "dialed in".
+ bool dynamic_constants;
+
+ // This callback is invoked for every pass successfully executed in the
+ // process of rendering a frame. Optional.
+ //
+ // Note: `info` is only valid until this function returns.
+ void (*info_callback)(void *priv, const struct pl_render_info *info);
+ void *info_priv;
+
+ // --- Deprecated/removed fields
+ bool allow_delayed_peak_detect PL_DEPRECATED; // moved to pl_peak_detect_params
+ const struct pl_icc_params *icc_params PL_DEPRECATED; // use pl_frame.icc
+ bool ignore_icc_profiles PL_DEPRECATED; // non-functional, just set pl_frame.icc to NULL
+ int lut_entries PL_DEPRECATED; // hard-coded as 256
+ float polar_cutoff PL_DEPRECATED; // hard-coded as 1e-3
+};
+
+// Bare minimum parameters, with no features enabled. This is the fastest
+// possible configuration, and should therefore be fine on any system.
+#define PL_RENDER_DEFAULTS \
+ .color_map_params = &pl_color_map_default_params, \
+ .color_adjustment = &pl_color_adjustment_neutral, \
+ .tile_colors = {{0.93, 0.93, 0.93}, \
+ {0.87, 0.87, 0.87}}, \
+ .tile_size = 32,
+
+#define pl_render_params(...) (&(struct pl_render_params) { PL_RENDER_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_render_params pl_render_fast_params;
+
+// This contains the default/recommended options for reasonable image quality,
+// while also not being too terribly slow. All of the *_params structs are
+// defaulted to the corresponding *_default_params, except for deband_params,
+// which is disabled by default.
+//
+// This should be fine on most integrated GPUs, but if it's too slow,
+// consider using `pl_render_fast_params` instead.
+PL_API extern const struct pl_render_params pl_render_default_params;
+
+// This contains a higher quality preset for better image quality at the cost
+// of quite a bit of performance. In addition to the settings implied by
+// `pl_render_default_params`, it enables debanding, sets the upscaler to
+// `pl_filter_ewa_lanczossharp`, and uses pl_*_high_quality_params structs where
+// available. This should only really be used with a discrete GPU and where
+// maximum image quality is desired.
+PL_API extern const struct pl_render_params pl_render_high_quality_params;
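+
+// Illustrative usage sketch (not part of the API): render parameters are
+// typically taken from one of the presets above, or derived from them with
+// overrides. `pl_peak_detect_high_quality_params` comes from
+// <libplacebo/shaders/colorspace.h>; `rr`, `image` and `target` are assumed
+// to be set up elsewhere by the caller:
+//
+//     struct pl_render_params params = pl_render_default_params;
+//     params.peak_detect_params = &pl_peak_detect_high_quality_params;
+//     params.corner_rounding    = 0.05f;
+//     pl_render_image(rr, &image, &target, &params);
+//
+//     // or, for a one-off call, build the parameters inline from
+//     // PL_RENDER_DEFAULTS plus overrides:
+//     pl_render_image(rr, &image, &target,
+//                     pl_render_params( .force_dither = true ));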
+
+#define PL_MAX_PLANES 4
+
+// High level description of a single slice of an image. This basically
+// represents a single 2D plane, with any number of components
+struct pl_plane {
+ // The texture underlying this plane. The texture must be 2D, and must
+ // have specific parameters set depending on what the plane is being used
+ // for (see `pl_render_image`).
+ pl_tex texture;
+
+ // The preferred behaviour when sampling outside of this texture. Optional,
+ // since the default (PL_TEX_ADDRESS_CLAMP) is very reasonable.
+ enum pl_tex_address_mode address_mode;
+
+ // Controls whether or not the `texture` will be considered flipped
+ // vertically with respect to the overall image dimensions. It's generally
+ // preferable to flip planes using this setting instead of the crop in
+ // cases where the flipping is the result of e.g. negative plane strides or
+ // flipped framebuffers (OpenGL).
+ //
+ // Note that any planar padding (due to e.g. size mismatch or misalignment
+ // of subsampled planes) is always at the physical end of the texture
+ // (highest y coordinate) - even if this bool is true. However, any
+ // subsampling shift (`shift_y`) is applied with respect to the flipped
+ // direction. This ensures the correct interpretation when e.g. vertically
+ // flipping 4:2:0 sources by flipping all planes.
+ bool flipped;
+
+ // Describes the number and interpretation of the components in this plane.
+ // This defines the mapping from component index to the canonical component
+ // order (RGBA, YCbCrA or XYZA). It's worth pointing out that this is
+ // completely separate from `texture->format.sample_order`. The latter is
+ // essentially irrelevant/transparent for the API user, since it just
+ // determines which order the texture data shows up as inside the GLSL
+ // shader; whereas this field controls the actual meaning of the component.
+ //
+ // Example: if the user has a plane with just {Y} and a plane with just
+ // {Cb Cr}, and a GPU that only supports BGRA formats, you would still
+ // specify the component mapping as {0} and {1 2} respectively, even though
+ // the GPU is sampling the data in the order BGRA. Use -1 for "ignored"
+ // components.
+ int components; // number of relevant components
+ int component_mapping[4]; // semantic index of each component
+
+ // Controls the sample offset, relative to the "reference" dimensions. For
+ // an example of what to set here, see `pl_chroma_location_offset`. Note
+ // that this is given in unit of reference pixels. For a graphical example,
+ // imagine you have a 2x2 image with a 1x1 (subsampled) plane. Without any
+ // shift (0.0), the situation looks like this:
+ //
+ // X-------X X = reference pixel
+ // | | P = plane pixel
+ // | P |
+ // | |
+ // X-------X
+ //
+ // For 4:2:0 subsampling, this corresponds to PL_CHROMA_CENTER. If the
+ // shift_x was instead set to -0.5, the `P` pixel would be offset to the
+ // left by half the separation between the reference (`X`) pixels, resulting
+ // in the following:
+ //
+ // X-------X X = reference pixel
+ // | | P = plane pixel
+ // P |
+ // | |
+ // X-------X
+ //
+ // For 4:2:0 subsampling, this corresponds to PL_CHROMA_LEFT.
+ //
+ // Note: It's recommended to fill this using `pl_chroma_location_offset` on
+ // the chroma planes.
+ float shift_x, shift_y;
+};
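+
+// Illustrative sketch (not part of the API): filling in the planes of a
+// bi-planar Y + CbCr image. `tex_y` and `tex_uv` are assumed to have been
+// created elsewhere. The component mapping follows the canonical YCbCrA
+// order, independently of the texture's internal sample order:
+//
+//     struct pl_plane luma = {
+//         .texture           = tex_y,
+//         .components        = 1,
+//         .component_mapping = {0},        // Y
+//     };
+//     struct pl_plane chroma = {
+//         .texture           = tex_uv,
+//         .components        = 2,
+//         .component_mapping = {1, 2},     // Cb, Cr
+//     };
+//     // For subsampled chroma, fill in shift_x/shift_y afterwards, e.g.
+//     // via pl_chroma_location_offset or pl_frame_set_chroma_location.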
+
+enum pl_overlay_mode {
+ PL_OVERLAY_NORMAL = 0, // treat the texture as a normal, full-color texture
+ PL_OVERLAY_MONOCHROME, // treat the texture as a single-component alpha map
+ PL_OVERLAY_MODE_COUNT,
+};
+
+enum pl_overlay_coords {
+ PL_OVERLAY_COORDS_AUTO = 0, // equal to SRC/DST_FRAME, respectively
+ PL_OVERLAY_COORDS_SRC_FRAME, // relative to the raw src frame
+ PL_OVERLAY_COORDS_SRC_CROP, // relative to the src frame crop
+ PL_OVERLAY_COORDS_DST_FRAME, // relative to the raw dst frame
+ PL_OVERLAY_COORDS_DST_CROP, // relative to the dst frame crop
+ PL_OVERLAY_COORDS_COUNT,
+
+ // Note on rotations: If there is an end-to-end rotation between `src` and
+ // `dst`, then any overlays relative to SRC_FRAME or SRC_CROP will be
+ // rotated alongside the image, while overlays relative to DST_FRAME or
+ // DST_CROP will not.
+};
+
+struct pl_overlay_part {
+ pl_rect2df src; // source coordinates with respect to `pl_overlay.tex`
+ pl_rect2df dst; // target coordinates with respect to `pl_overlay.coords`
+
+ // If `mode` is PL_OVERLAY_MONOCHROME, then this specifies the color of
+ // this overlay part. The color is multiplied into the sampled texture's
+ // first channel.
+ float color[4];
+};
+
+// A struct representing an image overlay (e.g. for subtitles or on-screen
+// status messages, controls, ...)
+struct pl_overlay {
+ // The texture containing the backing data for overlay parts. Must have
+ // `params.sampleable` set.
+ pl_tex tex;
+
+ // This controls the coloring mode of this overlay.
+ enum pl_overlay_mode mode;
+
+ // Controls which coordinates this overlay is addressed relative to.
+ enum pl_overlay_coords coords;
+
+ // This controls the colorspace information for this overlay. The contents
+ // of the texture / the value of `color` are interpreted according to this.
+ struct pl_color_repr repr;
+ struct pl_color_space color;
+
+ // The number of parts for this overlay.
+ const struct pl_overlay_part *parts;
+ int num_parts;
+};
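+
+// Illustrative sketch (not part of the API): attaching a single monochrome
+// (alpha map) overlay to a target frame. `osd_tex` and its on-screen position
+// are assumed to be provided by the caller; `pl_color_repr_rgb` and
+// `pl_color_space_srgb` are assumed constants from <libplacebo/colorspace.h>:
+//
+//     struct pl_overlay_part part = {
+//         .src   = {0, 0, osd_tex->params.w, osd_tex->params.h},
+//         .dst   = {16, 16, 16 + osd_tex->params.w, 16 + osd_tex->params.h},
+//         .color = {1.0f, 1.0f, 1.0f, 1.0f},
+//     };
+//     struct pl_overlay osd = {
+//         .tex       = osd_tex,
+//         .mode      = PL_OVERLAY_MONOCHROME,
+//         .coords    = PL_OVERLAY_COORDS_DST_CROP,
+//         .repr      = pl_color_repr_rgb,
+//         .color     = pl_color_space_srgb,
+//         .parts     = &part,
+//         .num_parts = 1,
+//     };
+//     target.overlays     = &osd;
+//     target.num_overlays = 1;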
+
+// High-level description of a complete frame, including metadata and planes
+struct pl_frame {
+ // Each frame is split up into some number of planes, each of which may
+ // carry several components and be of any size / offset.
+ int num_planes;
+ struct pl_plane planes[PL_MAX_PLANES];
+
+ // For interlaced frames. If set, this `pl_frame` corresponds to a single
+ // field of the underlying source textures. `first_field` indicates which
+ // of these fields is ordered first in time. `prev` and `next` should point
+ // to the previous/next frames in the file, or NULL if there are none.
+ //
+ // Note: Setting these fields on the render target has no meaning and will
+ // be ignored.
+ enum pl_field field;
+ enum pl_field first_field;
+ const struct pl_frame *prev, *next;
+
+ // If set, will be called immediately before GPU access to this frame. This
+ // function *may* be used to, for example, perform synchronization with
+ // external APIs (e.g. `pl_vulkan_hold/release`). If your mapping requires
+ // a memcpy of some sort (e.g. pl_tex_transfer), users *should* instead do
+ // the memcpy up-front and avoid the use of these callbacks - because they
+ // might be called multiple times on the same frame.
+ //
+ // This function *may* arbitrarily mutate the `pl_frame`, but it *should*
+ // ideally only update `planes` - in particular, color metadata and so
+ // forth should be provided up-front as best as possible. Note that changes
+ // here will not be reflected back to the structs provided in the original
+ // `pl_render_*` call (e.g. via `pl_frame_mix`).
+ //
+ // Note: Unless dealing with interlaced frames, only one frame will ever be
+ // acquired at a time per `pl_render_*` call. So users *can* safely use
+ // this with, for example, hwdec mappers that can only map a single frame
+ // at a time. When using this with, for example, `pl_render_image_mix`,
+ // each frame to be blended is acquired and released in succession, before
+ // moving on to the next frame. For interlaced frames, the previous and
+ // next frames must also be acquired simultaneously.
+ bool (*acquire)(pl_gpu gpu, struct pl_frame *frame);
+
+ // If set, will be called after a plane is done being used by the GPU,
+ // *including* after any errors (e.g. `acquire` returning false).
+ void (*release)(pl_gpu gpu, struct pl_frame *frame);
+
+ // Color representation / encoding / semantics of this frame.
+ struct pl_color_repr repr;
+ struct pl_color_space color;
+
+ // Optional ICC profile associated with this frame.
+ pl_icc_object icc;
+
+ // Alternative to `icc`, this can be used in cases where allocating and
+ // tracking a pl_icc_object externally may be inconvenient. The resulting
+ // profile will be managed internally by the pl_renderer.
+ struct pl_icc_profile profile;
+
+ // Optional LUT associated with this frame.
+ const struct pl_custom_lut *lut;
+ enum pl_lut_type lut_type;
+
+ // The logical crop / rectangle containing the valid information, relative
+ // to the reference plane's dimensions (e.g. luma). Pixels outside of this
+ // rectangle will ostensibly be ignored, but note that this is not a hard
+ // guarantee. In particular, scaler filters may end up sampling outside of
+ // this crop. This rect may be flipped, and may be partially or wholly
+ // outside the bounds of the underlying textures. (Optional)
+ //
+ // Note that `pl_render_image` will map the input crop directly to the
+ // output crop, stretching and scaling as needed. If you wish to preserve
+ // the aspect ratio, use a dedicated function like pl_rect2df_aspect_copy.
+ pl_rect2df crop;
+
+ // Logical rotation of the image, with respect to the underlying planes.
+ // For example, if this is PL_ROTATION_90, then the image will be rotated
+ // to the right by 90° when mapping to `crop`. The actual position on-screen
+ // is unaffected, so users should ensure that the (rotated) aspect ratio
+ // matches the source. (Or use a helper like `pl_rect2df_aspect_set_rot`)
+ //
+ // Note: For `target` frames, this corresponds to a rotation of the
+ // display, for `image` frames, this corresponds to a rotation of the
+ // camera.
+ //
+ // So, as an example, target->rotation = PL_ROTATE_90 means the end user
+ // has rotated the display to the right by 90° (meaning rendering will be
+ // rotated 90° to the *left* to compensate), and image->rotation =
+ // PL_ROTATE_90 means the video provider has rotated the camera to the
+ // right by 90° (so rendering will be rotated 90° to the *right* to
+ // compensate).
+ pl_rotation rotation;
+
+ // A list of additional overlays associated with this frame. Note that these
+ // will be rendered directly onto intermediate/cache frames, so changing any
+ // of these overlays may require flushing the renderer cache.
+ const struct pl_overlay *overlays;
+ int num_overlays;
+
+ // Note on subsampling and plane correspondence: All planes belonging to
+ // the same frame will only be stretched by an integer multiple (or inverse
+ // thereof) in order to match the reference dimensions of this image. For
+ // example, suppose you have an 8x4 image. A valid plane scaling would be
+ // 4x2 -> 8x4 or 4x4 -> 4x4, but not 6x4 -> 8x4. So if a 6x4 plane is
+ // given, then it would be treated like a cropped 8x4 plane (since 1.0 is
+ // the closest scaling ratio to the actual ratio of ~1.33).
+ //
+ // For an explanation of why this makes sense, consider the relatively
+ // common example of a subsampled, oddly sized (e.g. jpeg) image. In such
+ // cases, for example a 35x23 image, the 4:2:0 subsampled chroma plane
+ // would have to end up as 17.5x11.5, which gets rounded up to 18x12 by
+ // implementations. So in this example, the 18x12 chroma plane would get
+ // treated by libplacebo as an oversized chroma plane - i.e. the plane
+ // would get sampled as if it were 17.5 pixels wide and 11.5 pixels tall.
+
+ // Associated film grain data (see <libplacebo/shaders/film_grain.h>).
+ //
+ // Note: This is ignored for the `target` of `pl_render_image`, since
+ // un-applying grain makes little sense.
+ struct pl_film_grain_data film_grain;
+
+ // Ignored by libplacebo. May be useful for users.
+ void *user_data;
+};
+
+// Helper function to infer the chroma location offset for each plane in a
+// frame. This is equivalent to calling `pl_chroma_location_offset` on all
+// subsampled planes' shift_x/shift_y variables.
+PL_API void pl_frame_set_chroma_location(struct pl_frame *frame,
+ enum pl_chroma_location chroma_loc);
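+
+// Illustrative sketch (not part of the API): assembling a `pl_frame` from two
+// planes. `luma`, `chroma`, `repr`, `csp`, `width` and `height` are assumed
+// to be filled in by the caller (e.g. from decoder metadata):
+//
+//     struct pl_frame image = {
+//         .num_planes = 2,
+//         .planes     = { luma, chroma },
+//         .repr       = repr,
+//         .color      = csp,
+//         .crop       = {0, 0, width, height},
+//     };
+//     pl_frame_set_chroma_location(&image, PL_CHROMA_LEFT);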
+
+// Fills in a `pl_frame` based on a swapchain frame's FBO and metadata.
+PL_API void pl_frame_from_swapchain(struct pl_frame *out_frame,
+ const struct pl_swapchain_frame *frame);
+
+// Helper function to determine if a frame is logically cropped or not. In
+// particular, this is useful for determining whether an output frame needs
+// to be cleared before rendering.
+PL_API bool pl_frame_is_cropped(const struct pl_frame *frame);
+
+// Helper function to reset a frame to a given RGB color. If the frame's
+// color representation is something other than RGB, the clear color will
+// be adjusted accordingly. `clear_color` should be non-premultiplied.
+PL_API void pl_frame_clear_rgba(pl_gpu gpu, const struct pl_frame *frame,
+ const float clear_color[4]);
+
+// Like `pl_frame_clear_rgba` but without an alpha channel.
+static inline void pl_frame_clear(pl_gpu gpu, const struct pl_frame *frame,
+ const float clear_color[3])
+{
+ const float clear_color_rgba[4] = { clear_color[0], clear_color[1], clear_color[2], 1.0 };
+ pl_frame_clear_rgba(gpu, frame, clear_color_rgba);
+}
+
+// Helper functions to return the fixed/inferred pl_frame parameters used
+// for rendering internally. Mutates `image` and `target` in-place to hold
+// the modified values, which are what will actually be used for rendering.
+//
+// This currently includes:
+// - Defaulting all missing pl_color_space/repr parameters
+// - Coalescing all rotation to the target
+// - Rounding and clamping the target crop to pixel boundaries and adjusting the
+// image crop correspondingly
+//
+// Note: This is idempotent and does not generally alter the effects of a
+// subsequent `pl_render_image` on the same pl_frame pair. (But see the
+// following warning)
+//
+// Warning: This does *not* call pl_frame.acquire/release, and so the returned
+// metadata *may* be incorrect if the acquire callback mutates the pl_frame in
+// nontrivial ways, in particular the crop and color space fields.
+PL_API void pl_frames_infer(pl_renderer rr, struct pl_frame *image,
+ struct pl_frame *target);
+
+
+// Render a single image to a target using the given parameters. This is
+// fully dynamic, i.e. the params can change at any time. libplacebo will
+// internally detect and flush whatever caches are invalidated as a result of
+// changing colorspace, size etc.
+//
+// Required plane capabilities:
+// - Planes in `image` must be `sampleable`
+// - Planes in `target` must be `renderable`
+//
+// Recommended plane capabilities: (Optional, but good for performance)
+// - Planes in `image` should have `sample_mode` PL_TEX_SAMPLE_LINEAR
+// - Planes in `target` should be `storable`
+// - Planes in `target` should have `blit_dst`
+//
+// Note on lifetime: Once this call returns, the passed structures may be
+// freely overwritten or discarded by the caller, even the referenced
+// `pl_tex` objects may be freely reused.
+//
+// Note: `image` may be NULL, in which case `target.overlays` will still be
+// rendered, but nothing else.
+PL_API bool pl_render_image(pl_renderer rr, const struct pl_frame *image,
+ const struct pl_frame *target,
+ const struct pl_render_params *params);
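+
+// Illustrative sketch (not part of the API): a minimal render loop body,
+// drawing `image` onto a swapchain frame. `rr` and `sw_frame` are assumed to
+// come from pl_renderer_create and pl_swapchain_start_frame (both assumed
+// here, see <libplacebo/swapchain.h>):
+//
+//     struct pl_frame target;
+//     pl_frame_from_swapchain(&target, &sw_frame);
+//     if (!pl_render_image(rr, &image, &target, &pl_render_default_params)) {
+//         // handle rendering failure
+//     }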
+
+// Flushes the internal state of this renderer. This is normally not needed,
+// even if the image parameters, colorspace or target configuration change,
+// since libplacebo will internally detect such circumstances and recreate
+// outdated resources automatically. Doing this explicitly *may* be useful to
+// purge some state related to things like HDR peak detection or frame mixing,
+// so calling it is a good idea if the content source is expected to change
+// dramatically (e.g. when switching to a different file).
+PL_API void pl_renderer_flush_cache(pl_renderer rr);
+
+// Mirrors `pl_get_detected_hdr_metadata`, giving you the current internal peak
+// detection HDR metadata (when peak detection is active). Returns false if no
+// information is available (e.g. not HDR source, peak detection disabled).
+PL_API bool pl_renderer_get_hdr_metadata(pl_renderer rr,
+ struct pl_hdr_metadata *metadata);
+
+// Represents a mixture of input frames, distributed temporally.
+//
+// NOTE: Frames must be sorted by timestamp, i.e. `timestamps` must be
+// monotonically increasing.
+struct pl_frame_mix {
+ // The number of frames in this mixture. The number of frames should be
+ // sufficient to meet the needs of the configured frame mixer. See the
+ // section below for more information.
+ //
+ // If the number of frames is 0, this call will be equivalent to
+ // `pl_render_image` with `image == NULL`.
+ int num_frames;
+
+ // A list of the frames themselves. The frames can have different
+ // colorspaces, configurations of planes, or even sizes.
+ //
+ // Note: This is a list of pointers, to avoid users having to copy
+ // around `pl_frame` structs when re-organizing this array.
+ const struct pl_frame **frames;
+
+ // A list of unique signatures, one for each frame. These are used to
+ // identify frames across calls to this function, so it's crucial that they
+ // be both unique per-frame but also stable across invocations of
+ // `pl_render_image_mix`.
+ const uint64_t *signatures;
+
+ // A list of relative timestamps for each frame. These are relative to the
+ // time of the vsync being drawn, i.e. this function will render the frame
+ // that will be made visible at timestamp 0.0. The values are expected to
+ // be normalized such that a separation of 1.0 corresponds to roughly one
+ // nominal source frame duration. So a constant framerate video file will
+ // always have timestamps like e.g. {-2.3, -1.3, -0.3, 0.7, 1.7, 2.7},
+ // using an example radius of 3.
+ //
+ // In cases where the framerate is variable (e.g. VFR video), the choice of
+ // what scale to use can be difficult. A typical choice would
+ // be either to use the canonical (container-tagged) framerate, or the
+ // highest momentary framerate, as a reference. If all else fails, you
+ // could also use the display's framerate.
+ //
+ // Note: This function assumes zero-order-hold semantics, i.e. the frame at
+ // timestamp 0.7 is intended to remain visible until timestamp 1.7, when
+ // the next frame replaces it.
+ const float *timestamps;
+
+ // The duration for which the vsync being drawn will be held, using the
+ // same scale as `timestamps`. If the display has an unknown or variable
+ // frame-rate (e.g. Adaptive Sync), then you're probably better off not
+ // using this function and instead just painting the frames directly using
+ // `pl_render_image` at the correct PTS.
+ //
+ // As an example, if `vsync_duration` is 0.4, then it's assumed that the
+ // vsync being painted is visible for the period [0.0, 0.4].
+ float vsync_duration;
+
+ // Explanation of the frame mixing radius: The algorithm chosen in
+ // `pl_render_params.frame_mixer` has a canonical radius equal to
+ // `pl_filter_config.kernel->radius`. This means that the frame mixing
+ // algorithm will (only) need to consult all of the frames that have a
+ // distance within the interval [-radius, radius]. As such, the user should
+ // include all such frames in `frames`, but may prune or omit frames that
+ // lie outside it.
+ //
+ // The built-in frame mixing (`pl_render_params.frame_mixer == NULL`) has
+ // no concept of radius, it just always needs access to the "current" and
+ // "next" frames.
+};
+
+// Helper function to calculate the base frame mixing radius.
+//
+// Note: When the source FPS exceeds the display FPS, this radius must be
+// increased by the corresponding ratio.
+static inline float pl_frame_mix_radius(const struct pl_render_params *params)
+{
+ // For backwards compatibility, allow !frame_mixer->kernel
+ if (!params->frame_mixer || !params->frame_mixer->kernel)
+ return 0.0;
+
+ return params->frame_mixer->kernel->radius;
+}
+
+// Find closest frame to current PTS by zero-order hold semantics, or NULL.
+PL_API const struct pl_frame *pl_frame_mix_current(const struct pl_frame_mix *mix);
+
+// Find closest frame to current PTS by nearest neighbour semantics, or NULL.
+PL_API const struct pl_frame *pl_frame_mix_nearest(const struct pl_frame_mix *mix);
+
+// Render a mixture of images to the target using the given parameters. This
+// functions much like a generalization of `pl_render_image`, for when the API
+// user has more control over the frame queue / vsync loop, and can provide a
+// few frames from the past and future + timestamp information.
+//
+// This allows libplacebo to perform rudimentary frame mixing / interpolation,
+// in order to eliminate judder artifacts typically associated with
+// source/display frame rate mismatch.
+PL_API bool pl_render_image_mix(pl_renderer rr, const struct pl_frame_mix *images,
+ const struct pl_frame *target,
+ const struct pl_render_params *params);
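+
+// Illustrative sketch (not part of the API): submitting a small window of
+// frames for mixing. All frames, the renderer `rr` and the `target` are
+// assumed to be set up by the caller; timestamps are relative to the vsync
+// being drawn, in units of the nominal source frame duration:
+//
+//     const struct pl_frame *window[] = { &prev, &cur, &next };
+//     uint64_t sigs[]  = { 41, 42, 43 };          // stable per-frame IDs
+//     float    times[] = { -0.3f, 0.7f, 1.7f };
+//     struct pl_frame_mix mix = {
+//         .num_frames     = 3,
+//         .frames         = window,
+//         .signatures     = sigs,
+//         .timestamps     = times,
+//         .vsync_duration = 0.4f,
+//     };
+//     pl_render_image_mix(rr, &mix, &target, &pl_render_default_params);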
+
+// Analog of `pl_frames_infer` corresponding to `pl_render_image_mix`. This
+// function will *not* mutate the frames contained in `mix`, and instead
+// return an adjusted copy of the "reference" frame for that image mix in
+// `out_ref`, or {0} if the mix is empty.
+PL_API void pl_frames_infer_mix(pl_renderer rr, const struct pl_frame_mix *mix,
+ struct pl_frame *target, struct pl_frame *out_ref);
+
+// Backwards compatibility with old filters API, may be deprecated.
+// Redundant with pl_filter_configs and masking `allowed` for
+// PL_FILTER_SCALING and PL_FILTER_FRAME_MIXING respectively.
+
+// A list of recommended frame mixer presets, terminated by {0}
+PL_API extern const struct pl_filter_preset pl_frame_mixers[];
+PL_API extern const int pl_num_frame_mixers; // excluding trailing {0}
+
+// A list of recommended scaler presets, terminated by {0}. This is almost
+// equivalent to `pl_filter_presets` with the exception of including extra
+// built-in filters that don't map to the `pl_filter` architecture.
+PL_API extern const struct pl_filter_preset pl_scale_filters[];
+PL_API extern const int pl_num_scale_filters; // excluding trailing {0}
+
+// Deprecated in favor of `pl_cache_save/pl_cache_load` on the `pl_cache`
+// associated with the `pl_gpu` this renderer is using.
+PL_DEPRECATED PL_API size_t pl_renderer_save(pl_renderer rr, uint8_t *out_cache);
+PL_DEPRECATED PL_API void pl_renderer_load(pl_renderer rr, const uint8_t *cache);
+
+PL_API_END
+
+#endif // LIBPLACEBO_RENDERER_H_
diff --git a/src/include/libplacebo/shaders.h b/src/include/libplacebo/shaders.h
new file mode 100644
index 0000000..b8046be
--- /dev/null
+++ b/src/include/libplacebo/shaders.h
@@ -0,0 +1,273 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_H_
+#define LIBPLACEBO_SHADERS_H_
+
+// This function defines the "direct" interface to libplacebo's GLSL shaders,
+// suitable for use in contexts where the user controls GLSL shader compilation
+// but wishes to include functions generated by libplacebo as part of their
+// own rendering process. This API is normally not used for operation with
+// libplacebo's higher-level constructs such as `pl_dispatch` or `pl_renderer`.
+
+#include <libplacebo/gpu.h>
+
+PL_API_BEGIN
+
+// Thread-safety: Unsafe
+typedef struct pl_shader_t *pl_shader;
+
+struct pl_shader_params {
+ // The `id` represents an abstract identifier for the shader, to avoid
+ // collisions with other shaders being used as part of the same larger,
+ // overarching shader. This is relevant for users which want to combine
+ // multiple `pl_shader` objects together, in which case all `pl_shader`
+ // objects should have a unique `id`.
+ uint8_t id;
+
+ // If `gpu` is non-NULL, then this `gpu` will be used to create objects
+ // such as textures and buffers, or check for required capabilities, for
+ // operations which depend on either of those. This is fully optional, i.e.
+ // these GLSL primitives are designed to be used without a dependency on
+ // `gpu` wherever possible - however, some features may not work, and will
+ // be disabled even if requested.
+ pl_gpu gpu;
+
+ // The `index` represents an abstract frame index, which shaders may use
+ // internally to do things like temporal dithering or seeding PRNGs. If the
+ // user does not care about temporal dithering/debanding, or wants
+ // deterministic rendering, this may safely be left as 0. Otherwise, it
+ // should be incremented by 1 on successive frames.
+ uint8_t index;
+
+ // If `glsl.version` is nonzero, then this structure will be used to
+ // determine the effective GLSL mode and capabilities. If `gpu` is also
+ // set, then this overrides `gpu->glsl`.
+ struct pl_glsl_version glsl;
+
+ // If this is true, all constants in the shader will be replaced by
+ // dynamic variables. This is mainly useful to avoid recompilation for
+ // shaders which expect to have their values change constantly.
+ bool dynamic_constants;
+};
+
+#define pl_shader_params(...) (&(struct pl_shader_params) { __VA_ARGS__ })
+
+// Creates a new, blank, mutable pl_shader object.
+//
+// Note: Rather than allocating and destroying many shaders, users are
+// encouraged to reuse them (using `pl_shader_reset`) for efficiency.
+PL_API pl_shader pl_shader_alloc(pl_log log, const struct pl_shader_params *params);
+
+// Frees a pl_shader and all resources associated with it.
+PL_API void pl_shader_free(pl_shader *sh);
+
+// Resets a pl_shader to a blank slate, without releasing internal memory.
+// If you're going to be re-generating shaders often, this function will let
+// you skip the re-allocation overhead.
+PL_API void pl_shader_reset(pl_shader sh, const struct pl_shader_params *params);
+
+// Returns whether or not a shader is in a "failed" state. Trying to modify a
+// shader in illegal ways (e.g. signature mismatch) will result in the shader
+// being marked as "failed". Since most pl_shader_ operations have a void
+// return type, the user can use this function to figure out whether a specific
+// shader operation has failed or not. This function is somewhat redundant
+// since `pl_shader_finalize` will also return NULL in this case.
+PL_API bool pl_shader_is_failed(const pl_shader sh);
+
+// Returns whether or not a pl_shader needs to be run as a compute shader. This
+// will never be the case unless the `pl_glsl_version` this `pl_shader` was
+// created using has `compute` support enabled.
+PL_API bool pl_shader_is_compute(const pl_shader sh);
+
+// Returns whether or not the shader has any particular output size
+// requirements. Some shaders, in particular those that sample from other
+// textures, have specific output size requirements which need to be respected
+// by the caller. If this is false, then the shader is compatible with every
+// output size. If true, the size requirements are stored into *w and *h.
+PL_API bool pl_shader_output_size(const pl_shader sh, int *w, int *h);
+
+// Indicates the type of signature that is associated with a shader result.
+// Every shader result defines a function that may be called by the user, and
+// this enum indicates the type of value that this function takes and/or
+// returns.
+//
+// Which signature a shader ends up with depends on the type of operation being
+// performed by a shader fragment, as determined by the user's calls. See below
+// for more information.
+enum pl_shader_sig {
+ PL_SHADER_SIG_NONE = 0, // no input / void output
+ PL_SHADER_SIG_COLOR, // vec4 color (normalized so that 1.0 is the ref white)
+
+ // The following are only valid as input signatures:
+ PL_SHADER_SIG_SAMPLER, // (gsampler* src_tex, vecN tex_coord) pair,
+ // specifics depend on how the shader was generated
+};
+
+// Structure encapsulating information about a shader. This is internally
+// refcounted, to allow moving it around without having to create deep copies.
+typedef const struct pl_shader_info_t {
+ // A copy of the parameters used to create the shader.
+ struct pl_shader_params params;
+
+ // A list of friendly names for the semantic operations being performed by
+ // this shader, e.g. "color decoding" or "debanding".
+ const char **steps;
+ int num_steps;
+
+ // As a convenience, this contains a pretty-printed version of the
+ // above list, with entries tallied and separated by commas
+ const char *description;
+} *pl_shader_info;
+
+PL_API pl_shader_info pl_shader_info_ref(pl_shader_info info);
+PL_API void pl_shader_info_deref(pl_shader_info *info);
+
+// Represents a finalized shader fragment. This is not a complete shader, but a
+// collection of raw shader text together with description of the input
+// attributes, variables and vertices it expects to be available.
+struct pl_shader_res {
+ // Descriptive information about the shader. Note that this reference is
+ // attached to the shader itself - the user does not need to manually ref
+ // or deref `info` unless they wish to move it elsewhere.
+ pl_shader_info info;
+
+ // The shader text, as literal GLSL. This will always be a function
+ // definition, such that the function with the indicated name and
+ // signature may be called by the user.
+ const char *glsl;
+ const char *name;
+ enum pl_shader_sig input; // what the function expects
+ enum pl_shader_sig output; // what the function returns
+
+ // For compute shaders (pl_shader_is_compute), this indicates the requested
+ // work group size. Otherwise, both fields are 0. The interpretation of
+ // these work groups is that they're tiled across the output image.
+ int compute_group_size[2];
+
+ // If this pass is a compute shader, this field indicates the shared memory
+ // size requirements for this shader pass.
+ size_t compute_shmem;
+
+ // A set of input vertex attributes needed by this shader fragment.
+ const struct pl_shader_va *vertex_attribs;
+ int num_vertex_attribs;
+
+ // A set of input variables needed by this shader fragment.
+ const struct pl_shader_var *variables;
+ int num_variables;
+
+ // A list of input descriptors needed by this shader fragment.
+ const struct pl_shader_desc *descriptors;
+ int num_descriptors;
+
+ // A list of compile-time constants used by this shader fragment.
+ const struct pl_shader_const *constants;
+ int num_constants;
+
+ // --- Deprecated fields (see `info`)
+ struct pl_shader_params params PL_DEPRECATED;
+ const char **steps PL_DEPRECATED;
+ int num_steps PL_DEPRECATED;
+ const char *description PL_DEPRECATED;
+};
+
+// Represents a vertex attribute. The four values will be bound to the four
+// corner vertices respectively, in row-wise order starting from the top left:
+// data[0] data[1]
+// data[2] data[3]
+struct pl_shader_va {
+ struct pl_vertex_attrib attr; // VA type, excluding `offset` and `location`
+ const void *data[4];
+};
+
+// Represents a bound shared variable / descriptor
+struct pl_shader_var {
+ struct pl_var var; // the underlying variable description
+ const void *data; // the raw data (as per `pl_var_host_layout`)
+ bool dynamic; // if true, the value is expected to change frequently
+};
+
+struct pl_buffer_var {
+ struct pl_var var;
+ struct pl_var_layout layout;
+};
+
+typedef uint16_t pl_memory_qualifiers;
+enum {
+ PL_MEMORY_COHERENT = 1 << 0, // supports synchronization across shader invocations
+ PL_MEMORY_VOLATILE = 1 << 1, // all writes are synchronized automatically
+
+ // Note: All descriptors are also implicitly assumed to have the 'restrict'
+ // memory qualifier. There is currently no way to override this behavior.
+};
+
+struct pl_shader_desc {
+ struct pl_desc desc; // descriptor type, excluding `int binding`
+ struct pl_desc_binding binding; // contents of the descriptor binding
+
+ // For PL_DESC_BUF_UNIFORM/STORAGE, this specifies the layout of the
+ // variables contained by a buffer. Ignored for the other descriptor types
+ struct pl_buffer_var *buffer_vars;
+ int num_buffer_vars;
+
+ // For storage images and buffers, this specifies additional memory
+ // qualifiers on the descriptor. Note that 'restrict' is always implied
+ // (see the note above). Ignored for other descriptor types.
+ pl_memory_qualifiers memory;
+};
+
+// Represents a compile-time constant. This can be lowered to a specialization
+// constant to support cheaper recompilations.
+struct pl_shader_const {
+ enum pl_var_type type;
+ const char *name;
+ const void *data;
+
+ // If true, this constant *must* be a compile-time constant, which
+ // basically just overrides `pl_shader_params.dynamic_constants`. Useful
+ // for constants which will serve as inputs to e.g. array sizes.
+ bool compile_time;
+};
+
+// Finalize a pl_shader. It is no longer mutable at this point, and any further
+// attempts to modify it result in an error. (Functions which take a `const
+// pl_shader` argument do not modify the shader and may be freely
+// called on an already-finalized shader)
+//
+// The returned pl_shader_res is bound to the lifetime of the pl_shader - and
+// will only remain valid until the pl_shader is freed or reset. This function
+// may be called multiple times, and will produce the same result each time.
+//
+// This function will return NULL if the shader is considered to be in a
+// "failed" state (see pl_shader_is_failed).
+PL_API const struct pl_shader_res *pl_shader_finalize(pl_shader sh);
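+
+// Illustrative sketch (not part of the API): generating and finalizing a
+// shader fragment. `log`, `gpu` and `repr` are assumed to exist;
+// pl_shader_decode_color (from <libplacebo/shaders/colorspace.h>) stands in
+// for any shader-generating operation:
+//
+//     pl_shader sh = pl_shader_alloc(log, pl_shader_params( .gpu = gpu ));
+//     pl_shader_decode_color(sh, &repr, NULL);
+//     const struct pl_shader_res *res = pl_shader_finalize(sh);
+//     if (!res) {
+//         // shader ended up in a failed state (see pl_shader_is_failed)
+//     } else {
+//         // embed res->glsl into the host shader, bind res->variables,
+//         // res->descriptors and res->vertex_attribs as required
+//     }
+//     pl_shader_free(&sh);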
+
+// Shader objects represent abstract resources that shaders need to manage in
+// order to ensure their operation. This could include shader storage buffers,
+// generated lookup textures, or other sorts of configured state. The body
+// of a shader object is fully opaque; but the user is in charge of cleaning up
+// after them and passing them to the right shader passes.
+//
+// Note: pl_shader_obj objects must be initialized to NULL by the caller.
+typedef struct pl_shader_obj_t *pl_shader_obj;
+
+PL_API void pl_shader_obj_destroy(pl_shader_obj *obj);
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_H_
diff --git a/src/include/libplacebo/shaders/colorspace.h b/src/include/libplacebo/shaders/colorspace.h
new file mode 100644
index 0000000..ead0958
--- /dev/null
+++ b/src/include/libplacebo/shaders/colorspace.h
@@ -0,0 +1,381 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_COLORSPACE_H_
+#define LIBPLACEBO_SHADERS_COLORSPACE_H_
+
+// Color space transformation shaders. These all input and output a color
+// value (PL_SHADER_SIG_COLOR).
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/gamut_mapping.h>
+#include <libplacebo/tone_mapping.h>
+#include <libplacebo/shaders.h>
+
+// For backwards compatibility
+#include <libplacebo/shaders/dithering.h>
+
+PL_API_BEGIN
+
+// Transform the input color, in its given representation, to ensure
+// compatibility with the indicated alpha mode. Mutates `repr` to reflect the
+// change. Note that this is a no-op if the input is PL_ALPHA_UNKNOWN.
+PL_API void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr,
+ enum pl_alpha_mode mode);
+
+// Colorspace reshaping for PL_COLOR_SYSTEM_DOLBYVISION. Note that this is done
+// automatically by `pl_shader_decode_color` for PL_COLOR_SYSTEM_DOLBYVISION.
+PL_API void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data);
+
+// Decode the color into normalized RGB, given a specified color_repr. This
+// also takes care of additional pre- and post-conversions required for the
+// "special" color systems (XYZ, BT.2020-C, etc.). If `params` is left as NULL,
+// it defaults to &pl_color_adjustment_neutral.
+//
+// Note: This function always returns PC-range RGB with independent alpha.
+// It mutates the pl_color_repr to reflect the change.
+//
+// Note: For DCDM XYZ decoding, the output is linear.
+PL_API void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr,
+ const struct pl_color_adjustment *params);
+
+// Encodes a color from normalized, PC-range, independent alpha RGB into a
+// given representation. That is, this performs the inverse operation of
+// `pl_shader_decode_color` (sans color adjustments).
+//
+// Note: For DCDM XYZ encoding, the input is expected to be linear.
+PL_API void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr);
+
+// Linearize (expand) `vec4 color`, given a specified color space. In essence,
+// this corresponds to the ITU-R EOTF.
+//
+// Note: Unlike the ITU-R EOTF, it never includes the OOTF - even for systems
+// where the EOTF includes the OOTF (such as HLG).
+PL_API void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp);
+
+// Delinearize (compress), given a color space as output. This loosely
+// corresponds to the inverse EOTF (not the OETF) in ITU-R terminology, again
+// assuming a reference monitor.
+PL_API void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp);
+
+struct pl_sigmoid_params {
+ // The center (bias) of the sigmoid curve. Must be between 0.0 and 1.0.
+ // If left as 0.0, defaults to 0.75.
+ float center;
+
+ // The slope (steepness) of the sigmoid curve. Must be between 1.0 and 20.0.
+ // If left as 0.0, defaults to 6.5.
+ float slope;
+};
+
+#define PL_SIGMOID_DEFAULTS \
+ .center = 0.75, \
+ .slope = 6.50,
+
+#define pl_sigmoid_params(...) (&(struct pl_sigmoid_params) { PL_SIGMOID_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_sigmoid_params pl_sigmoid_default_params;
+
+// Applies a sigmoidal color transform to all channels. This helps avoid
+// ringing artifacts during upscaling by bringing the color information closer
+// to neutral and away from the extremes. If `params` is NULL, it defaults to
+// &pl_sigmoid_default_params.
+//
+// Warning: This function clamps the input to the interval [0,1]; and as such
+// it should *NOT* be used on already-decoded high-dynamic range content.
+PL_API void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params);
+
+// This performs the inverse operation to `pl_shader_sigmoidize`.
+PL_API void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params);
+
+struct pl_peak_detect_params {
+ // Smoothing coefficient for the detected values. This controls the time
+ // parameter (tau) of an IIR low pass filter. In other words, it represent
+ // the cutoff period (= 1 / cutoff frequency) in frames. Frequencies below
+ // this length will be suppressed. This helps block out annoying
+ // "sparkling" or "flickering" due to small variations in frame-to-frame
+ // brightness. If left as 0.0, this smoothing is completely disabled.
+ float smoothing_period;
+
+ // In order to avoid reacting sluggishly on scene changes as a result of
+ // the low-pass filter, we disable it when the difference between the
+ // current frame brightness and the average frame brightness exceeds a
+ // given threshold difference. But rather than a single hard cutoff, which
+ // would lead to weird discontinuities on fades, we gradually disable it
+ // over a small window of brightness ranges. These parameters control the
+ // lower and upper bounds of this window, in units of 1% PQ.
+ //
+ // Setting either one of these to 0.0 disables this logic.
+ float scene_threshold_low;
+ float scene_threshold_high;
+
+ // Which percentile of the input image brightness histogram to consider as
+ // the true peak of the scene. If this is set to 100 (or 0), the brightest
+ // pixel is measured. Otherwise, the top of the frequency distribution is
+ // progressively cut off. Setting this too low will cause clipping of very
+ // bright details, but can improve the dynamic brightness range of scenes
+ // with very bright isolated highlights.
+ //
+ // A recommended value is 99.995%, which is very conservative and should
+ // cause no major issues in typical content.
+ float percentile;
+
+ // Allows the peak detection result to be delayed by up to a single frame,
+ // which can sometimes improve throughput, at the cost of introducing the
+ // possibility of 1-frame flickers on transitions. Disabled by default.
+ bool allow_delayed;
+
+ // --- Deprecated / removed fields
+ float overshoot_margin PL_DEPRECATED;
+ float minimum_peak PL_DEPRECATED;
+};
+
+#define PL_PEAK_DETECT_DEFAULTS \
+ .smoothing_period = 20.0f, \
+ .scene_threshold_low = 1.0f, \
+ .scene_threshold_high = 3.0f, \
+ .percentile = 100.0f,
+
+#define PL_PEAK_DETECT_HQ_DEFAULTS \
+ PL_PEAK_DETECT_DEFAULTS \
+ .percentile = 99.995f,
+
+#define pl_peak_detect_params(...) (&(struct pl_peak_detect_params) { PL_PEAK_DETECT_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_peak_detect_params pl_peak_detect_default_params;
+PL_API extern const struct pl_peak_detect_params pl_peak_detect_high_quality_params;
+
+// This function can be used to measure the CLL and FALL of a video
+// source automatically, using a compute shader. The measured values are
+// smoothed automatically (depending on the parameters), so to keep track of
+// the measured results over time, a tone mapping shader state object is used
+// to hold the state. Returns false on failure initializing the tone mapping
+// object, or if compute shaders are not supported.
+//
+// It's important that the same shader object is used for successive frames
+// belonging to the same source. If the source changes (e.g. due to a file
+// change or seek), the user should reset it with `pl_reset_detected_peak` (or
+// destroy it and use a new state object).
+//
+// The parameter `csp` holds the representation of the color values that are
+// the input to this function. (They must already be in decoded RGB form, i.e.
+// alternate color representations are not supported)
+PL_API bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp,
+ pl_shader_obj *state,
+ const struct pl_peak_detect_params *params);
+
+// After dispatching the above shader, this function can be used to retrieve
+// the detected dynamic HDR10+ metadata parameters. The other fields of
+// `metadata` are not written to. Returns whether or not any values were
+// written. If not, the values are left untouched, so this can be used to
+// safely update `pl_hdr_metadata` values in-place. This function may or may
+// not block, depending on the previous setting of `allow_delayed`.
+PL_API bool pl_get_detected_hdr_metadata(const pl_shader_obj state,
+ struct pl_hdr_metadata *metadata);
+
+// After dispatching the above shader, this function *may* be used to read out
+// the detected CLL and FALL directly (in PL_HDR_NORM units). If the shader
+// has never been dispatched yet, i.e. no information is available, this will
+// return false.
+//
+// Deprecated in favor of `pl_get_detected_hdr_metadata`
+PL_DEPRECATED PL_API bool pl_get_detected_peak(const pl_shader_obj state,
+ float *out_cll, float *out_fall);
+
+// Resets the peak detection state in a given tone mapping state object. This
+// is not equal to `pl_shader_obj_destroy`, because it does not destroy any
+// state used by `pl_shader_tone_map`.
+PL_API void pl_reset_detected_peak(pl_shader_obj state);
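+
+// Illustrative sketch (not part of the API): running peak detection on every
+// frame of a source and reading back the smoothed result. `sh` and `csp` are
+// assumed to be provided by the caller; `state` must start out as NULL and is
+// reused across frames of the same source:
+//
+//     pl_shader_obj state = NULL;          // persists across frames
+//     if (pl_shader_detect_peak(sh, csp, &state, pl_peak_detect_params())) {
+//         // shader generated successfully; dispatch it as part of the frame
+//     }
+//     struct pl_hdr_metadata hdr = {0};
+//     if (pl_get_detected_hdr_metadata(state, &hdr)) {
+//         // `hdr` now holds the detected dynamic metadata
+//     }
+//     pl_reset_detected_peak(state);       // e.g. after a seek / file change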
+
+// Feature map extraction (for pl_color_map_args.feature_map). The result
+// of this shader should be downscaled / low-passed to the indicated kernel
+// size before use. (This does not happen automatically)
+PL_API void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp);
+
+// Deprecated and unused. Libplacebo now always performs a variant of the old
+// hybrid tone-mapping, mixing together the intensity (I) and per-channel (LMS)
+// results.
+enum pl_tone_map_mode {
+ PL_TONE_MAP_AUTO PL_DEPRECATED_ENUMERATOR,
+ PL_TONE_MAP_RGB PL_DEPRECATED_ENUMERATOR,
+ PL_TONE_MAP_MAX PL_DEPRECATED_ENUMERATOR,
+ PL_TONE_MAP_HYBRID PL_DEPRECATED_ENUMERATOR,
+ PL_TONE_MAP_LUMA PL_DEPRECATED_ENUMERATOR,
+ PL_TONE_MAP_MODE_COUNT,
+};
+
+// Deprecated by <libplacebo/gamut_mapping.h>
+enum pl_gamut_mode {
+ PL_GAMUT_CLIP PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_clip
+ PL_GAMUT_WARN PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_highlight
+ PL_GAMUT_DARKEN PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_darken
+ PL_GAMUT_DESATURATE PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_desaturate
+ PL_GAMUT_MODE_COUNT,
+};
+
+struct pl_color_map_params {
+ // --- Gamut mapping options
+
+ // Gamut mapping function to use to handle out-of-gamut colors, including
+ // colors which are out-of-gamut as a consequence of tone mapping.
+ const struct pl_gamut_map_function *gamut_mapping;
+
+ // Gamut mapping constants, for expert tuning. Leave as default otherwise.
+ struct pl_gamut_map_constants gamut_constants;
+
+ // Gamut mapping 3DLUT size, for channels ICh. Defaults to {48, 32, 256}
+ int lut3d_size[3];
+
+ // Use higher quality, but slower, tricubic interpolation for gamut mapping
+ // 3DLUTs. May substantially improve the 3DLUT gamut mapping accuracy, in
+ // particular at smaller 3DLUT sizes. Shouldn't have much effect at the
+ // default size.
+ bool lut3d_tricubic;
+
+ // If true, allows the gamut mapping function to expand the gamut, in
+ // cases where the target gamut exceeds that of the source. If false,
+ // the source gamut will never be enlarged, even when using a gamut
+ // mapping function capable of bidirectional mapping.
+ bool gamut_expansion;
+
+ // --- Tone mapping options
+
+ // Tone mapping function to use to handle out-of-range colors.
+ const struct pl_tone_map_function *tone_mapping_function;
+
+ // Tone mapping constants, for expert tuning. Leave as default otherwise.
+ struct pl_tone_map_constants tone_constants;
+
+ // If true, and supported by the given tone mapping function, libplacebo
+ // will perform inverse tone mapping to expand the dynamic range of a
+ // signal. libplacebo is not liable for any HDR-induced eye damage.
+ bool inverse_tone_mapping;
+
+ // Data source to use when tone-mapping. Setting this to a specific
+ // value allows overriding the default metadata preference logic.
+ enum pl_hdr_metadata_type metadata;
+
+ // Tone mapping LUT size. Defaults to 256.
+ int lut_size;
+
+ // HDR contrast recovery strength. If set to a value above 0.0, the source
+ // image will be divided into high-frequency and low-frequency components,
+ // and a portion of the high-frequency image is added back onto the
+ // tone-mapped output. May cause excessive ringing artifacts for some HDR
+ // sources, but can improve the subjective sharpness and detail left over
+ // in the image after tone-mapping.
+ float contrast_recovery;
+
+ // Contrast recovery lowpass kernel size. Defaults to 3.5. Increasing
+ // or decreasing this will affect the visual appearance substantially.
+ float contrast_smoothness;
+
+ // --- Debugging options
+
+ // Force the use of a full tone-mapping LUT even for functions that have
+ // faster pure GLSL replacements (e.g. clip, linear, saturation).
+ bool force_tone_mapping_lut;
+
+ // Visualize the tone-mapping LUT and gamut mapping 3DLUT, in IPT space.
+ bool visualize_lut;
+
+ // Controls where to draw the visualization, relative to the rendered
+ // video (dimensions 0-1). Optional, defaults to the full picture.
+ pl_rect2df visualize_rect;
+
+ // Controls the rotation of the 3DLUT visualization.
+ float visualize_hue; // useful range [-pi, pi]
+ float visualize_theta; // useful range [0, pi/2]
+
+ // Graphically highlight hard-clipped pixels during tone-mapping (i.e.
+ // pixels that exceed the claimed source luminance range).
+ bool show_clipping;
+
+ // --- Deprecated fields
+ enum pl_tone_map_mode tone_mapping_mode PL_DEPRECATED; // removed
+ float tone_mapping_param PL_DEPRECATED; // see `tone_constants`
+ float tone_mapping_crosstalk PL_DEPRECATED; // now hard-coded as 0.04
+ enum pl_rendering_intent intent PL_DEPRECATED; // see `gamut_mapping`
+ enum pl_gamut_mode gamut_mode PL_DEPRECATED; // see `gamut_mapping`
+ float hybrid_mix PL_DEPRECATED; // removed
+};
+
+#define PL_COLOR_MAP_DEFAULTS \
+ .gamut_mapping = &pl_gamut_map_perceptual, \
+ .tone_mapping_function = &pl_tone_map_spline, \
+ .gamut_constants = { PL_GAMUT_MAP_CONSTANTS }, \
+ .tone_constants = { PL_TONE_MAP_CONSTANTS }, \
+ .metadata = PL_HDR_METADATA_ANY, \
+ .lut3d_size = {48, 32, 256}, \
+ .lut_size = 256, \
+ .visualize_rect = {0, 0, 1, 1}, \
+ .contrast_smoothness = 3.5f,
+
+#define PL_COLOR_MAP_HQ_DEFAULTS \
+ PL_COLOR_MAP_DEFAULTS \
+ .contrast_recovery = 0.30f,
+
+#define pl_color_map_params(...) (&(struct pl_color_map_params) { PL_COLOR_MAP_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_color_map_params pl_color_map_default_params;
+PL_API extern const struct pl_color_map_params pl_color_map_high_quality_params;
+
+// Execution arguments for the `pl_shader_color_map_ex` call. Distinct from
+// `pl_color_map_params` because it is filled by internally-provided execution
+// metadata, instead of user-tunable aesthetic parameters.
+struct pl_color_map_args {
+ // Input/output color space for the mapping.
+ struct pl_color_space src;
+ struct pl_color_space dst;
+
+ // If true, the logic will assume the input has already been linearized by
+ // the caller (e.g. as part of a previous linear light scaling operation).
+ bool prelinearized;
+
+ // Object to be used to store generated LUTs. Note that this is the same
+ // state object used by `pl_shader_detect_peak`, and if that function has
+ // been called on `state` prior to `pl_shader_color_map`, the detected
+ // values will be used to guide the tone mapping algorithm. If this is not
+ // provided, tone/gamut mapping are disabled.
+ pl_shader_obj *state;
+
+ // Low-resolution intensity feature map, as generated by
+ // `pl_shader_extract_features`. Optional. No effect if
+ // `params->contrast_recovery` is disabled.
+ pl_tex feature_map;
+};
+
+#define pl_color_map_args(...) (&(struct pl_color_map_args) { __VA_ARGS__ })
+
+// Maps `vec4 color` from one color space to another color space according
+// to the parameters (described in greater depth above). If `params` is left
+// as NULL, it defaults to `&pl_color_map_default_params`
+PL_API void pl_shader_color_map_ex(pl_shader sh,
+ const struct pl_color_map_params *params,
+ const struct pl_color_map_args *args);
+
+// Backwards compatibility wrapper around `pl_shader_color_map_ex`
+PL_API void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params,
+ struct pl_color_space src, struct pl_color_space dst,
+ pl_shader_obj *state, bool prelinearized);
+
+// Applies a set of cone distortion parameters to `vec4 color` in a given color
+// space. This can be used to simulate color blindness. See `pl_cone_params`
+// for more information.
+PL_API void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp,
+ const struct pl_cone_params *params);
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_COLORSPACE_H_
diff --git a/src/include/libplacebo/shaders/custom.h b/src/include/libplacebo/shaders/custom.h
new file mode 100644
index 0000000..a4eec69
--- /dev/null
+++ b/src/include/libplacebo/shaders/custom.h
@@ -0,0 +1,341 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_CUSTOM_H_
+#define LIBPLACEBO_SHADERS_CUSTOM_H_
+
+#include <stdlib.h>
+
+// Functions for writing custom shaders and hooking them into the `pl_renderer`
+// pipeline, as well as compatibility functions for parsing shaders in mpv
+// format.
+
+#include <libplacebo/shaders.h>
+#include <libplacebo/dispatch.h>
+#include <libplacebo/colorspace.h>
+
+PL_API_BEGIN
+
+// Parameters describing custom shader text to be embedded into a `pl_shader`
+// object. All of the strings are optional and can be left as NULL, but without
+// a `body` in particular, the shader will do nothing useful on its own.
+struct pl_custom_shader {
+ // The prelude contains text such as extra #defines, #extension pragmas,
+ // or other parts of the shader that must be placed at the very
+ // beginning (before input layout declarations etc.)
+ //
+ // Note: #extension pragmas do not need to be emitted to enable support for
+ // resource types already attached to the shader (e.g. SSBOs), compute
+ // shaders, or GPU capabilities known to libplacebo (e.g. subgroups).
+ const char *prelude;
+
+ // The header contains text such as helper function definitions, extra
+ // uniforms, shared memory variables or buffer descriptions.
+ const char *header;
+
+ // A friendly name for the shader. (Optional)
+ const char *description;
+
+ // The "primary" GLSL code. This will be effectively appended to the "main"
+ // function. It lives in an environment given by the `input` signature, and
+ // is expected to return results in a way given by the `output` signature.
+ //
+ // Note: In the case of PL_SHADER_SIG_COLOR, the output `vec4 color` is
+    // allocated by `pl_shader_custom`; the user merely needs to assign to it.
+ //
+ // Note: For ease of development it can be useful to have the main logic
+ // live inside a helper function defined as part of `header`, and specify
+ // the `body` as a single line that simply calls the helper function.
+ const char *body;
+ enum pl_shader_sig input;
+ enum pl_shader_sig output;
+
+ // Extra descriptors, variables and vertex attributes to attach to the
+ // resulting `pl_shader_res`.
+ //
+ // Note: The names inside these will possibly be replaced by fresh
+ // identifiers internally, so users should avoid looking for exact string
+ // matches for the given names inside the `pl_shader_res`.
+ const struct pl_shader_desc *descriptors;
+ int num_descriptors;
+ const struct pl_shader_var *variables;
+ int num_variables;
+ const struct pl_shader_va *vertex_attribs;
+ int num_vertex_attribs;
+ const struct pl_shader_const *constants;
+ int num_constants;
+
+ // If true, this shader must be a compute shader. The desired workgroup
+ // size and shared memory usage can be optionally specified, or 0 if no
+ // specific work group size or shared memory size restrictions apply.
+ //
+ // See also: `pl_shader_res.compute_group_size`
+ bool compute;
+ size_t compute_shmem;
+ int compute_group_size[2];
+
+ // Fixes the output size requirements of the shader to exact dimensions.
+    // Optional; if left as 0, the shader can be dispatched at any size.
+ int output_w;
+ int output_h;
+};
+
+// Append custom shader code, including extra descriptors and variables, to an
+// existing `pl_shader` object. Returns whether successful. This function may
+// fail in the event that e.g. the custom shader requires compute shaders on
+// an unsupported GPU, or exceeds the GPU's shared memory capabilities.
+PL_API bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params);
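+
+// Illustrative usage sketch (not part of the API): append a trivial color
+// inversion to a shader whose current output is `vec4 color`. Assumes `sh`
+// is a valid pl_shader carrying a color signature.
+//
+//     bool ok = pl_shader_custom(sh, &(struct pl_custom_shader) {
+//         .description = "invert colors",
+//         .body        = "color.rgb = vec3(1.0) - color.rgb;",
+//         .input       = PL_SHADER_SIG_COLOR,
+//         .output      = PL_SHADER_SIG_COLOR,
+//     });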
+
+// Which "rendering stages" are available for user shader hooking purposes.
+// Except where otherwise noted, all stages are "non-resizable", i.e. the
+// shaders already have specific output size requirements.
+enum pl_hook_stage {
+ // Hook stages for the untouched planes, as made available by the source.
+ // These are all resizable, i.e. there are no specific output stage
+ // requirements.
+ PL_HOOK_RGB_INPUT = 1 << 0,
+ PL_HOOK_LUMA_INPUT = 1 << 1,
+ PL_HOOK_CHROMA_INPUT = 1 << 2,
+ PL_HOOK_ALPHA_INPUT = 1 << 3,
+ PL_HOOK_XYZ_INPUT = 1 << 4,
+
+ // Hook stages for the scaled/aligned planes
+ PL_HOOK_CHROMA_SCALED = 1 << 5,
+ PL_HOOK_ALPHA_SCALED = 1 << 6,
+
+ PL_HOOK_NATIVE = 1 << 7, // Combined image in its native color space
+ PL_HOOK_RGB = 1 << 8, // After conversion to RGB (resizable)
+ PL_HOOK_LINEAR = 1 << 9, // After linearization but before scaling
+ PL_HOOK_SIGMOID = 1 << 10, // After sigmoidization
+ PL_HOOK_PRE_KERNEL = 1 << 11, // Immediately before the main scaler kernel
+ PL_HOOK_POST_KERNEL = 1 << 12, // Immediately after the main scaler kernel
+ PL_HOOK_SCALED = 1 << 13, // After scaling, before color management
+ PL_HOOK_PRE_OUTPUT = 1 << 14, // After color management, before blending/rotation
+ PL_HOOK_OUTPUT = 1 << 15, // After blending/rotation, before dithering
+};
+
+// Returns true if a given hook stage is resizable
+static inline bool pl_hook_stage_resizable(enum pl_hook_stage stage) {
+ switch (stage) {
+ case PL_HOOK_RGB_INPUT:
+ case PL_HOOK_LUMA_INPUT:
+ case PL_HOOK_CHROMA_INPUT:
+ case PL_HOOK_ALPHA_INPUT:
+ case PL_HOOK_XYZ_INPUT:
+ case PL_HOOK_NATIVE:
+ case PL_HOOK_RGB:
+ return true;
+
+ case PL_HOOK_CHROMA_SCALED:
+ case PL_HOOK_ALPHA_SCALED:
+ case PL_HOOK_LINEAR:
+ case PL_HOOK_SIGMOID:
+ case PL_HOOK_PRE_KERNEL:
+ case PL_HOOK_POST_KERNEL:
+ case PL_HOOK_SCALED:
+ case PL_HOOK_PRE_OUTPUT:
+ case PL_HOOK_OUTPUT:
+ return false;
+ }
+
+ abort();
+}
+
+// The different forms of communicating image data between the renderer and
+// the hooks
+enum pl_hook_sig {
+ PL_HOOK_SIG_NONE, // No data is passed, no data is received/returned
+ PL_HOOK_SIG_COLOR, // `vec4 color` already pre-sampled in a `pl_shader`
+ PL_HOOK_SIG_TEX, // `pl_tex` containing the image data
+ PL_HOOK_SIG_COUNT,
+};
+
+struct pl_hook_params {
+ // GPU objects associated with the `pl_renderer`, which the user may
+ // use for their own purposes.
+ pl_gpu gpu;
+ pl_dispatch dispatch;
+
+ // Helper function to fetch a new temporary texture, using renderer-backed
+ // storage. This is guaranteed to have sane image usage requirements and a
+ // 16-bit or floating point format. The user does not need to free/destroy
+ // this texture in any way. May return NULL.
+ pl_tex (*get_tex)(void *priv, int width, int height);
+ void *priv;
+
+ // Which stage triggered the hook to run.
+ enum pl_hook_stage stage;
+
+ // For `PL_HOOK_SIG_COLOR`, this contains the existing shader object with
+ // the color already pre-sampled into `vec4 color`. The user may modify
+ // this as much as they want, as long as they don't dispatch/finalize/reset
+ // it.
+ //
+ // Note that this shader might have specific output size requirements,
+ // depending on the exact shader stage hooked by the user, and may already
+ // be a compute shader.
+ pl_shader sh;
+
+ // For `PL_HOOK_SIG_TEX`, this contains the texture that the user should
+ // sample from.
+ //
+ // Note: This texture object is owned by the renderer, and users must not
+ // modify its contents. It will not be touched for the duration of a frame,
+ // but the contents are lost in between frames.
+ pl_tex tex;
+
+ // The effective current rectangle of the image we're rendering in this
+ // shader, i.e. the effective rect of the content we're interested in,
+ // as a crop of either `sh` or `tex` (depending on the signature).
+ //
+ // Note: This is still set even for `PL_HOOK_SIG_NONE`!
+ pl_rect2df rect;
+
+ // The current effective colorspace and representation, of either the
+ // pre-sampled color (in `sh`), or the contents of `tex`, respectively.
+ //
+ // Note: This is still set even for `PL_HOOK_SIG_NONE`!
+ struct pl_color_repr repr;
+ struct pl_color_space color;
+ int components;
+
+ // The representation and colorspace of the original image, for reference.
+ const struct pl_color_repr *orig_repr;
+ const struct pl_color_space *orig_color;
+
+ // The (cropped) source and destination rectangles of the overall
+    // rendering. These are functionally equivalent to `image.crop` and
+    // `target.crop`, respectively, but `src_rect` in particular may change as
+    // a result of previous hooks being executed (e.g. prescalers).
+ pl_rect2df src_rect;
+ pl_rect2d dst_rect;
+};
+
+struct pl_hook_res {
+ // If true, the hook is assumed to have "failed" or errored in some way,
+ // and all other fields are ignored.
+ bool failed;
+
+ // What type of output this hook is returning.
+ // Note: If this is `PL_HOOK_SIG_NONE`, all other fields are ignored.
+ enum pl_hook_sig output;
+
+ // For `PL_HOOK_SIG_COLOR`, this *must* be set to a valid `pl_shader`
+ // object containing the sampled color value (i.e. with an output signature
+ // of `PL_SHADER_SIG_COLOR`), and *should* be allocated from the given
+ // `pl_dispatch` object. Ignored otherwise.
+ pl_shader sh;
+
+ // For `PL_HOOK_SIG_TEX`, this *must* contain the texture object containing
+ // the result of rendering the hook. This *should* be a texture allocated
+ // using the given `get_tex` callback, to ensure the format and texture
+ // usage flags are compatible with what the renderer expects.
+ pl_tex tex;
+
+ // For shaders that return some sort of output, this contains the
+ // new/altered versions of the existing "current texture" metadata.
+ struct pl_color_repr repr;
+ struct pl_color_space color;
+ int components;
+
+ // This contains the new effective rect of the contents. This may be
+ // different from the original `rect` for resizable passes. Ignored for
+ // non-resizable passes.
+ pl_rect2df rect;
+};
+
+enum pl_hook_par_mode {
+ PL_HOOK_PAR_VARIABLE, // normal shader variable
+ PL_HOOK_PAR_DYNAMIC, // dynamic shader variable, e.g. per-frame changing
+ PL_HOOK_PAR_CONSTANT, // fixed at compile time (e.g. for array sizes),
+ // must be scalar (non-vector/matrix)
+ PL_HOOK_PAR_DEFINE, // defined in the preprocessor, must be `int`
+ PL_HOOK_PAR_MODE_COUNT,
+};
+
+typedef union pl_var_data {
+ int i;
+ unsigned u;
+ float f;
+} pl_var_data;
+
+struct pl_hook_par {
+ // Name as used in the shader.
+ const char *name;
+
+ // Type of this shader parameter, and how it's manifested in the shader.
+ enum pl_var_type type;
+ enum pl_hook_par_mode mode;
+
+ // Human-readable explanation of this parameter. (Optional)
+ const char *description;
+
+ // Mutable data pointer to current value of variable.
+ pl_var_data *data;
+
+ // Default/initial value, and lower/upper bounds.
+ pl_var_data initial;
+ pl_var_data minimum;
+ pl_var_data maximum;
+
+ // Human-readable names for the variants of an integer option. This array
+ // can be indexed directly by integer values, ranging from `minimum.i` to
+ // `maximum.i`. May be NULL, in which case options are unnamed.
+ const char * const *names;
+};
+
+// Struct describing a hook.
+//
+// Note: Users may freely create their own instances of this struct, there is
+// nothing particularly special about `pl_mpv_user_shader_parse`.
+struct pl_hook {
+ enum pl_hook_stage stages; // Which stages to hook on
+ enum pl_hook_sig input; // Which input signature this hook expects
+ void *priv; // Arbitrary user context
+
+ // Custom tunable shader parameters exported by this hook. These may be
+ // updated at any time by the user, to influence the behavior of the hook.
+ // Contents are arbitrary and subject to the method of hook construction.
+ const struct pl_hook_par *parameters;
+ int num_parameters;
+
+ // Called at the beginning of passes, to reset/initialize the hook. (Optional)
+ void (*reset)(void *priv);
+
+ // The hook function itself. Called by the renderer at any of the indicated
+ // hook stages. See `pl_hook_res` for more info on the return values.
+ struct pl_hook_res (*hook)(void *priv, const struct pl_hook_params *params);
+
+ // Unique signature identifying this hook, used to disable misbehaving hooks.
+ // All hooks with the same signature will be disabled, should they fail to
+ // execute during run-time.
+ uint64_t signature;
+};
+
+// Compatibility layer with `mpv` user shaders. See the mpv man page for more
+// information on the format. Will return `NULL` if the shader fails parsing.
+//
+// The resulting `pl_hook` objects should be destroyed with the corresponding
+// destructor when no longer needed.
+PL_API const struct pl_hook *
+pl_mpv_user_shader_parse(pl_gpu gpu, const char *shader_text, size_t shader_len);
+
+PL_API void pl_mpv_user_shader_destroy(const struct pl_hook **hook);
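+
+// Illustrative usage sketch (not part of the API): parse an mpv-style user
+// shader and free it once no longer needed. `gpu` and `text` are assumed to
+// be provided by the caller; attaching the resulting hook to a renderer goes
+// through the renderer's own hook list (see renderer.h).
+//
+//     const struct pl_hook *hook = pl_mpv_user_shader_parse(gpu, text, strlen(text));
+//     if (hook) {
+//         /* ... use the hook for rendering ... */
+//         pl_mpv_user_shader_destroy(&hook);
+//     }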
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_CUSTOM_H_
diff --git a/src/include/libplacebo/shaders/deinterlacing.h b/src/include/libplacebo/shaders/deinterlacing.h
new file mode 100644
index 0000000..40e74e8
--- /dev/null
+++ b/src/include/libplacebo/shaders/deinterlacing.h
@@ -0,0 +1,137 @@
+
+/*
+ * This file is part of libplacebo, which is normally licensed under the terms
+ * of the LGPL v2.1+. However, this file (deinterlacing.h) is also available under
+ * the terms of the more permissive MIT license:
+ *
+ * Copyright (c) 2018-2019 Niklas Haas
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_DEINTERLACING_H_
+#define LIBPLACEBO_SHADERS_DEINTERLACING_H_
+
+#include <libplacebo/shaders.h>
+
+PL_API_BEGIN
+
+enum pl_field {
+ PL_FIELD_NONE = 0, // no deinterlacing
+ PL_FIELD_EVEN, // "top" fields, with even y coordinates
+ PL_FIELD_ODD, // "bottom" fields, with odd y coordinates
+
+ // Convenience aliases
+ PL_FIELD_TOP = PL_FIELD_EVEN,
+ PL_FIELD_BOTTOM = PL_FIELD_ODD,
+};
+
+static inline enum pl_field pl_field_other(enum pl_field field)
+{
+ switch (field) {
+ case PL_FIELD_EVEN: return PL_FIELD_ODD;
+ case PL_FIELD_ODD: return PL_FIELD_EVEN;
+ default: return field;
+ }
+}
+
+struct pl_field_pair {
+ // Top texture. If only this is specified, it's assumed to contain both
+ // fields in an interleaved fashion (MBAFF).
+ //
+    // Note: Support for separate fields (PAFF) is currently pending, so this
+ // is the only way to provide interlaced frames at the moment.
+ pl_tex top;
+};
+
+#define pl_field_pair(...) ((struct pl_field_pair) { __VA_ARGS__ })
+
+struct pl_deinterlace_source {
+ // Previous, current and next source (interlaced) frames. `prev` and `next`
+ // may be NULL, but `cur` is required. If present, they must all have the
+ // exact same texture dimensions.
+ //
+ // Note: `prev` and `next` are only required for PL_DEINTERLACE_YADIF.
+ struct pl_field_pair prev, cur, next;
+
+ // The parity of the current field to output. This field will be unmodified
+ // from `cur`, with the corresponding other field interpolated.
+ //
+ // If this is `PL_FIELD_NONE`, no deinterlacing is performed, and the
+ // texture is merely sampled as-is.
+ enum pl_field field;
+
+    // The parity of the first frame in a stream. Set this to the field that is
+ // (conceptually) ordered first in time.
+ //
+ // If this is `PL_FIELD_NONE`, it will instead default to `PL_FIELD_TOP`.
+ enum pl_field first_field;
+
+ // Components to deinterlace. Components not specified will be ignored.
+ // Optional, if left as 0, all components will be deinterlaced.
+ uint8_t component_mask;
+};
+
+#define pl_deinterlace_source(...) (&(struct pl_deinterlace_source) { __VA_ARGS__ })
+
+enum pl_deinterlace_algorithm {
+    // No-op deinterlacing; just samples the weaved frame untouched.
+ PL_DEINTERLACE_WEAVE = 0,
+
+ // Naive bob deinterlacing. Doubles the field lines vertically.
+ PL_DEINTERLACE_BOB,
+
+ // "Yet another deinterlacing filter". Deinterlacer with temporal and
+ // spatial information. Based on FFmpeg's Yadif filter algorithm, but
+ // adapted slightly for the GPU.
+ PL_DEINTERLACE_YADIF,
+
+ PL_DEINTERLACE_ALGORITHM_COUNT,
+};
+
+// Returns whether or not an algorithm requires `prev`/`next` refs to be set.
+static inline bool pl_deinterlace_needs_refs(enum pl_deinterlace_algorithm algo)
+{
+ return algo == PL_DEINTERLACE_YADIF;
+}
+
+struct pl_deinterlace_params {
+ // Algorithm to use. The recommended default is PL_DEINTERLACE_YADIF, which
+ // provides a good trade-off of quality and speed.
+ enum pl_deinterlace_algorithm algo;
+
+ // Skip the spatial interlacing check. (PL_DEINTERLACE_YADIF only)
+ bool skip_spatial_check;
+};
+
+#define PL_DEINTERLACE_DEFAULTS \
+ .algo = PL_DEINTERLACE_YADIF,
+
+#define pl_deinterlace_params(...) (&(struct pl_deinterlace_params) { PL_DEINTERLACE_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_deinterlace_params pl_deinterlace_default_params;
+
+// Deinterlaces a set of interleaved source frames and outputs the result into
+// `vec4 color`. If `params` is left as NULL, it defaults to
+// `&pl_deinterlace_default_params`.
+PL_API void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src,
+ const struct pl_deinterlace_params *params);
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_DEINTERLACING_H_
diff --git a/src/include/libplacebo/shaders/dithering.h b/src/include/libplacebo/shaders/dithering.h
new file mode 100644
index 0000000..9146c81
--- /dev/null
+++ b/src/include/libplacebo/shaders/dithering.h
@@ -0,0 +1,140 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_DITHERING_H_
+#define LIBPLACEBO_SHADERS_DITHERING_H_
+
+// Dithering shaders
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/dither.h>
+#include <libplacebo/shaders.h>
+
+PL_API_BEGIN
+
+enum pl_dither_method {
+ // Dither with blue noise. Very high quality, but requires the use of a
+ // LUT. Warning: Computing a blue noise texture with a large size can be
+    // very slow; however, this only needs to be performed once. Even so, using
+ // this with a `lut_size` greater than 6 is generally ill-advised. This is
+ // the preferred/default dither method.
+ PL_DITHER_BLUE_NOISE,
+
+ // Dither with an ordered (bayer) dither matrix, using a LUT. Low quality,
+ // and since this also uses a LUT, there's generally no advantage to picking
+ // this instead of `PL_DITHER_BLUE_NOISE`. It's mainly there for testing.
+ PL_DITHER_ORDERED_LUT,
+
+ // The same as `PL_DITHER_ORDERED_LUT`, but uses fixed function math instead
+ // of a LUT. This is faster, but only supports a fixed dither matrix size
+ // of 16x16 (equal to a `lut_size` of 4).
+ PL_DITHER_ORDERED_FIXED,
+
+ // Dither with white noise. This does not require a LUT and is fairly cheap
+ // to compute. Unlike the other modes it doesn't show any repeating
+ // patterns either spatially or temporally, but the downside is that this
+ // is visually fairly jarring due to the presence of low frequencies in the
+ // noise spectrum.
+ PL_DITHER_WHITE_NOISE,
+
+ PL_DITHER_METHOD_COUNT,
+};
+
+struct pl_dither_params {
+ // The source of the dither noise to use.
+ enum pl_dither_method method;
+
+ // For the dither methods which require the use of a LUT, this controls
+    // the size of the LUT (base 2). If left as 0, this defaults to 6, which
+ // is equivalent to a 64x64 dither matrix. Must not be larger than 8.
+ int lut_size;
+
+ // Enables temporal dithering. This reduces the persistence of dithering
+ // artifacts by perturbing the dithering matrix per frame.
+ // Warning: This can cause nasty aliasing artifacts on some LCD screens.
+ bool temporal;
+
+ // Gamma function to use for dither gamma correction. This will only have
+ // an effect when dithering to low bit depths (<= 4).
+ enum pl_color_transfer transfer;
+};
+
+#define PL_DITHER_DEFAULTS \
+ .method = PL_DITHER_BLUE_NOISE, \
+ .lut_size = 6, \
+ /* temporal dithering commonly flickers on LCDs */ \
+ .temporal = false,
+
+#define pl_dither_params(...) (&(struct pl_dither_params) { PL_DITHER_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_dither_params pl_dither_default_params;
+
+// Dither the colors to a lower depth, given in bits. This can be used on input
+// colors of any precision. Basically, this rounds the colors to only linear
+// multiples of the stated bit depth. The average intensity of the result
+// will not change (i.e., the dither noise is balanced in both directions).
+// If `params` is NULL, it defaults to &pl_dither_default_params.
+//
+// For the dither methods which require the use of a LUT, `dither_state` must
+// be set to a valid pointer. To avoid thrashing the resource, users should
+// avoid trying to re-use the same LUT for different dither configurations. If
+// passed as NULL, libplacebo will automatically fall back to dither algorithms
+// that don't require the use of a LUT.
+//
+// Warning: This dithering algorithm is not gamma-invariant; so using it for
+// very low bit depths (below 4 or so) will noticeably increase the brightness
+// of the resulting image. When doing low bit depth dithering for aesthetic
+// purposes, it's recommended that the user explicitly (de)linearize the colors
+// before and after this algorithm.
+PL_API void pl_shader_dither(pl_shader sh, int new_depth,
+ pl_shader_obj *dither_state,
+ const struct pl_dither_params *params);
+
+struct pl_error_diffusion_params {
+ // Both the input and output texture must be provided up-front, with the
+ // same size. The output texture must be storable, and the input texture
+ // must be sampleable.
+ pl_tex input_tex;
+ pl_tex output_tex;
+
+ // Depth to dither to. Required.
+ int new_depth;
+
+ // Error diffusion kernel to use. Optional. If unspecified, defaults to
+ // `&pl_error_diffusion_sierra_lite`.
+ const struct pl_error_diffusion_kernel *kernel;
+};
+
+#define pl_error_diffusion_params(...) (&(struct pl_error_diffusion_params) { __VA_ARGS__ })
+
+// Computes the shared memory requirements for a given error diffusion kernel.
+// This can be used to test up-front whether or not error diffusion would be
+// supported or not, before having to initialize textures.
+PL_API size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel,
+ int height);
+
+// Apply an error diffusion dithering kernel. This is a much more expensive and
+// heavy dithering method, and is not generally recommended for realtime usage
+// where performance is critical.
+//
+// Requires compute shader support. Returns false if dithering fails, e.g. as a
+// result of shader memory limits being exceeded. The resulting shader must be
+// dispatched with a work group count of exactly 1.
+PL_API bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params);
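+
+// Illustrative usage sketch (not part of the API): check the shared memory
+// requirement before attempting error diffusion. `gpu`, `in_tex` and
+// `out_tex` are assumed to be provided by the caller, and the shared memory
+// limit is assumed to be exposed as `gpu->glsl.max_shmem_size`.
+//
+//     size_t shmem = pl_error_diffusion_shmem_req(&pl_error_diffusion_sierra_lite,
+//                                                 out_tex->params.h);
+//     if (shmem <= gpu->glsl.max_shmem_size) {
+//         bool ok = pl_shader_error_diffusion(sh, pl_error_diffusion_params(
+//             .input_tex  = in_tex,
+//             .output_tex = out_tex,
+//             .new_depth  = 8,
+//         ));
+//     }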
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_DITHERING_H_
diff --git a/src/include/libplacebo/shaders/film_grain.h b/src/include/libplacebo/shaders/film_grain.h
new file mode 100644
index 0000000..8a9c78b
--- /dev/null
+++ b/src/include/libplacebo/shaders/film_grain.h
@@ -0,0 +1,137 @@
+/*
+ * This file is part of libplacebo, which is normally licensed under the terms
+ * of the LGPL v2.1+. However, this file (film_grain.h) is also available under
+ * the terms of the more permissive MIT license:
+ *
+ * Copyright (c) 2018-2019 Niklas Haas
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_FILM_GRAIN_H_
+#define LIBPLACEBO_SHADERS_FILM_GRAIN_H_
+
+// Film grain synthesis shaders for AV1 / H.274.
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders.h>
+
+PL_API_BEGIN
+
+enum pl_film_grain_type {
+ PL_FILM_GRAIN_NONE = 0,
+ PL_FILM_GRAIN_AV1,
+ PL_FILM_GRAIN_H274,
+ PL_FILM_GRAIN_COUNT,
+};
+
+// AV1 film grain parameters. For the exact meaning of these, see the AV1
+// specification (section 6.8.20).
+struct pl_av1_grain_data {
+ int num_points_y;
+ uint8_t points_y[14][2]; // [n][0] = value, [n][1] = scaling
+ bool chroma_scaling_from_luma;
+ int num_points_uv[2]; // should be {0} for grayscale images
+ uint8_t points_uv[2][10][2]; // like points_y for points_uv[0, 1] = u, v
+ int scaling_shift;
+ int ar_coeff_lag;
+ int8_t ar_coeffs_y[24];
+ int8_t ar_coeffs_uv[2][25];
+ int ar_coeff_shift;
+ int grain_scale_shift;
+ int8_t uv_mult[2];
+ int8_t uv_mult_luma[2];
+ int16_t uv_offset[2]; // 9-bit value, range [-256, 255]
+ bool overlap;
+};
+
+// H.274 film grain parameters. For the exact meaning of these, see the H.274
+// specification (section 8.5).
+struct pl_h274_grain_data {
+ int model_id;
+ int blending_mode_id;
+ int log2_scale_factor;
+ bool component_model_present[3];
+ uint16_t num_intensity_intervals[3];
+ uint8_t num_model_values[3];
+ const uint8_t *intensity_interval_lower_bound[3];
+ const uint8_t *intensity_interval_upper_bound[3];
+ const int16_t (*comp_model_value[3])[6];
+};
+
+// Tagged union for film grain data
+struct pl_film_grain_data {
+ enum pl_film_grain_type type; // film grain type
+ uint64_t seed; // shared seed value
+
+ union {
+        // Warning: These values are not sanity-checked at all. Invalid grain
+ // data results in undefined behavior!
+ struct pl_av1_grain_data av1;
+ struct pl_h274_grain_data h274;
+ } params;
+};
+
+// Options for the `pl_shader_film_grain` call.
+struct pl_film_grain_params {
+ // Required for all film grain types:
+ struct pl_film_grain_data data; // film grain data
+ pl_tex tex; // texture to sample from
+ struct pl_color_repr *repr; // underlying color representation (see notes)
+ int components;
+ int component_mapping[4]; // same as `struct pl_plane`
+
+ // Notes for `repr`:
+ // - repr->bits affects the rounding for grain generation
+    //  - repr->levels affects whether we clip to full range or not
+ // - repr->sys affects the interpretation of channels
+ // - *repr gets normalized by this shader, which is why it's a pointer
+
+ // Required for PL_FILM_GRAIN_AV1 only:
+ pl_tex luma_tex; // "luma" texture (see notes)
+ int luma_comp; // index of luma in `luma_tex`
+
+ // Notes for `luma_tex`:
+ // - `luma_tex` must be specified if the `tex` does not itself contain the
+ // "luma-like" component. For XYZ systems, the Y channel is the luma
+ // component. For RGB systems, the G channel is.
+};
+
+#define pl_film_grain_params(...) (&(struct pl_film_grain_params) { __VA_ARGS__ })
+
+// Test if film grain needs to be applied. This is a helper function that users
+// can use to decide whether or not `pl_shader_film_grain` needs to be called,
+// based on the given grain metadata.
+PL_API bool pl_needs_film_grain(const struct pl_film_grain_params *params);
+
+// Sample from a texture while applying film grain at the same time.
+// `grain_state` must be unique for every plane configuration, as it may
+// contain plane-dependent state.
+//
+// Returns false on any error, or if film grain generation is not supported
+// due to GLSL limitations.
+PL_API bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state,
+ const struct pl_film_grain_params *params);
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_FILM_GRAIN_H_
diff --git a/src/include/libplacebo/shaders/icc.h b/src/include/libplacebo/shaders/icc.h
new file mode 100644
index 0000000..a4003f4
--- /dev/null
+++ b/src/include/libplacebo/shaders/icc.h
@@ -0,0 +1,135 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_ICC_H_
+#define LIBPLACEBO_SHADERS_ICC_H_
+
+// Functions for generating and applying ICC-derived (3D)LUTs
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders.h>
+
+PL_API_BEGIN
+
+struct pl_icc_params {
+ // The rendering intent to use, for profiles with multiple intents. A
+ // recommended value is PL_INTENT_RELATIVE_COLORIMETRIC for color-accurate
+ // video reproduction, or PL_INTENT_PERCEPTUAL for profiles containing
+ // meaningful perceptual mapping tables for some more suitable color space
+ // like BT.709.
+ //
+    // If this is set to the special value PL_INTENT_AUTO, the preferred
+    // intent provided by the profile header will be used.
+ enum pl_rendering_intent intent;
+
+    // The size of the 3DLUT to generate. If left as 0, these individually
+ // default to values appropriate for the profile. (Based on internal
+ // precision heuristics)
+ //
+ // Note: Setting this manually is strongly discouraged, as it can result
+ // in excessively high 3DLUT sizes where a much smaller LUT would have
+ // sufficed.
+ int size_r, size_g, size_b;
+
+ // This field can be used to override the detected brightness level of the
+ // ICC profile. If you set this to the special value 0 (or a negative
+ // number), libplacebo will attempt reading the brightness value from the
+ // ICC profile's tagging (if available), falling back to PL_COLOR_SDR_WHITE
+ // if unavailable.
+ float max_luma;
+
+ // Force black point compensation. May help avoid crushed or raised black
+ // points on "improper" profiles containing e.g. colorimetric tables that
+ // do not round-trip. Should not be required on well-behaved profiles,
+ // or when using PL_INTENT_PERCEPTUAL, but YMMV.
+ bool force_bpc;
+
+ // If provided, this pl_cache instance will be used, instead of the
+ // GPU-internal cache, to cache the generated 3DLUTs. Note that these can
+ // get large, especially for large values of size_{r,g,b}, so the user may
+ // wish to split this cache off from the main shader cache. (Optional)
+ pl_cache cache;
+
+ // Deprecated legacy caching API. Replaced by `cache`.
+ PL_DEPRECATED void *cache_priv;
+ PL_DEPRECATED void (*cache_save)(void *priv, uint64_t sig, const uint8_t *cache, size_t size);
+ PL_DEPRECATED bool (*cache_load)(void *priv, uint64_t sig, uint8_t *cache, size_t size);
+};
+
+#define PL_ICC_DEFAULTS \
+ .intent = PL_INTENT_RELATIVE_COLORIMETRIC, \
+ .max_luma = PL_COLOR_SDR_WHITE,
+
+#define pl_icc_params(...) (&(struct pl_icc_params) { PL_ICC_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_icc_params pl_icc_default_params;
+
+// This object represents a "parsed" ICC profile.
+typedef const struct pl_icc_object_t {
+ // Provided params, with the `intent` and `size` fields set (as described)
+ struct pl_icc_params params;
+
+ // Signature of the corresponding ICC profile.
+ uint64_t signature;
+
+ // Detected color space (or UNKNOWN for profiles which don't contain an
+    // exact match), with HDR metadata set to the detected gamut and
+ // white/black value ranges.
+ struct pl_color_space csp;
+
+ // Best estimate of profile gamma. This only serves as a rough guideline.
+ float gamma;
+
+ // Smallest containing primary set, always set.
+ enum pl_color_primaries containing_primaries;
+} *pl_icc_object;
+
+// Attempts opening/parsing the contents of an ICC profile. The resulting
+// object is memory managed and may outlive the original profile - access
+// to the underlying profile is no longer needed once this returns.
+PL_API pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile,
+ const struct pl_icc_params *params);
+PL_API void pl_icc_close(pl_icc_object *icc);
+
+// Update an existing pl_icc_object, which may be NULL, replacing it by the
+// new profile and parameters (if incompatible).
+//
+// Returns success. `obj` is set to the created profile, or NULL on error.
+//
+// Note: If `profile->signature` matches `(*obj)->signature`, or if `profile` is
+// NULL, then the existing profile is directly reused, with only the effective
+// parameters changing. In this case, `profile->data` is also *not* read from,
+// and may safely be NULL.
+PL_API bool pl_icc_update(pl_log log, pl_icc_object *obj,
+ const struct pl_icc_profile *profile,
+ const struct pl_icc_params *params);
+
+// Decode the input from the colorspace determined by the attached ICC profile
+// to linear light RGB (in the profile's containing primary set). `lut` must be
+// set to a shader object that will store the GPU resources associated with the
+// generated LUT. The resulting color space will be written to `out_csp`.
+PL_API void pl_icc_decode(pl_shader sh, pl_icc_object profile, pl_shader_obj *lut,
+ struct pl_color_space *out_csp);
+
+// Encode the input from linear light RGB (in the profile's containing primary
+// set) into the colorspace determined by the attached ICC profile. `lut` must
+// be set to a shader object that will store the GPU resources associated with
+// the generated LUT.
+PL_API void pl_icc_encode(pl_shader sh, pl_icc_object profile, pl_shader_obj *lut);
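+
+// Illustrative usage sketch (not part of the API): open a display profile and
+// decode into linear light before further processing. `log`, `profile` and
+// `icc_lut` (a NULL-initialized `pl_shader_obj`) are assumed to be provided
+// by the caller.
+//
+//     pl_icc_object icc = pl_icc_open(log, &profile, pl_icc_params(
+//         .intent = PL_INTENT_PERCEPTUAL,
+//     ));
+//
+//     if (icc) {
+//         struct pl_color_space linear_csp;
+//         pl_icc_decode(sh, icc, &icc_lut, &linear_csp);
+//         /* ... process in linear light, then pl_icc_encode() or map ... */
+//         pl_icc_close(&icc); // once no longer needed
+//     }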
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_ICC_H_
diff --git a/src/include/libplacebo/shaders/lut.h b/src/include/libplacebo/shaders/lut.h
new file mode 100644
index 0000000..6e30ddc
--- /dev/null
+++ b/src/include/libplacebo/shaders/lut.h
@@ -0,0 +1,78 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_LUT_H_
+#define LIBPLACEBO_SHADERS_LUT_H_
+
+// Shaders for loading and applying arbitrary custom 1D/3DLUTs
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders.h>
+
+PL_API_BEGIN
+
+// Struct defining custom LUTs
+//
+// Note: Users may freely create their own instances of this struct, there is
+// nothing particularly special about `pl_lut_parse_cube`.
+struct pl_custom_lut {
+ // Some unique signature identifying this LUT, needed to detect state
+ // changes (for cache invalidation). This should ideally be a hash of the
+ // file contents. (Which is what `pl_lut_parse_*` will set it to.)
+ uint64_t signature;
+
+ // Size of each dimension, in the order R, G, B. For 1D LUTs, only the R
+ // dimension should be specified (the others left as 0).
+ int size[3];
+
+ // Raw LUT data itself, in properly scaled floating point format. For 3D
+ // LUTs, the innermost dimension is the first dimension (R), and the
+ // outermost dimension is the last dimension (B). Individual color samples
+ // are in the order R, G, B.
+ const float *data;
+
+ // Extra input/output shaper matrices. Ignored if equal to {0}. This is
+ // mostly useful for 1D LUTs, since 3D LUTs can bake the shaper matrix into
+ // the LUT itself - but it can still help optimize LUT precision.
+ pl_matrix3x3 shaper_in, shaper_out;
+
+ // Nominal metadata for the input/output of a LUT. Left as {0} if unknown.
+ // Note: This is purely informative, `pl_shader_custom_lut` ignores it.
+ struct pl_color_repr repr_in, repr_out;
+ struct pl_color_space color_in, color_out;
+};
+
+// Parse a 3DLUT in .cube format. Returns NULL if the file fails parsing.
+PL_API struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *str, size_t str_len);
+
+// Frees a LUT created by `pl_lut_parse_*`.
+PL_API void pl_lut_free(struct pl_custom_lut **lut);
+
+// Apply a `pl_custom_lut`. The user is responsible for ensuring colors going
+// into the LUT are in the expected format as informed by the LUT metadata.
+//
+// `lut_state` must be a pointer to a NULL-initialized shader state object that
+// will be used to encapsulate any required GPU state.
+//
+// Note: `lut` does not have to be allocated by `pl_lut_parse_*`. It can be a
+// struct filled out by the user.
+PL_API void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut,
+ pl_shader_obj *lut_state);
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_LUT_H_
diff --git a/src/include/libplacebo/shaders/sampling.h b/src/include/libplacebo/shaders/sampling.h
new file mode 100644
index 0000000..5221e44
--- /dev/null
+++ b/src/include/libplacebo/shaders/sampling.h
@@ -0,0 +1,257 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SHADERS_SAMPLING_H_
+#define LIBPLACEBO_SHADERS_SAMPLING_H_
+
+// Sampling operations. These shaders perform some form of sampling operation
+// from a given pl_tex. In order to use these, the pl_shader *must* have been
+// created using the same `gpu` as the originating `pl_tex`. Otherwise, this
+// is undefined behavior. They require nothing (PL_SHADER_SIG_NONE) and return
+// a color (PL_SHADER_SIG_COLOR).
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/filters.h>
+#include <libplacebo/shaders.h>
+
+PL_API_BEGIN
+
+// Common parameters for sampling operations
+struct pl_sample_src {
+ // There are two mutually exclusive ways of providing the source to sample
+ // from:
+ //
+ // 1. Provide the texture and sampled region directly. This generates
+ // a shader with input signature `PL_SHADER_SIG_NONE`, which binds the
+ // texture as a descriptor (and the coordinates as a vertex attribute)
+ pl_tex tex; // texture to sample
+ pl_rect2df rect; // sub-rect to sample from (optional)
+ enum pl_tex_address_mode address_mode; // preferred texture address mode
+
+ // 2. Have the shader take it as an argument. Doing this requires
+ // specifying the missing metadata of the texture backing the sampler, so
+ // that the shader generation can generate the correct code.
+ int tex_w, tex_h; // dimensions of the actual texture
+ enum pl_fmt_type format; // format of the sampler being accepted
+ enum pl_sampler_type sampler; // type of the sampler being accepted
+ enum pl_tex_sample_mode mode; // sample mode of the sampler being accepted
+ float sampled_w, sampled_h; // dimensions of the sampled region (optional)
+
+ // Common metadata for both sampler input types:
+ int components; // number of components to sample (optional)
+ uint8_t component_mask; // bitmask of components to sample (optional)
+ int new_w, new_h; // dimensions of the resulting output (optional)
+ float scale; // factor to multiply into sampled signal (optional)
+
+    // Note: `component_mask` and `components` are mutually exclusive; the
+ // former is preferred if both are specified.
+};
+
+#define pl_sample_src(...) (&(struct pl_sample_src) { __VA_ARGS__ })
+
+struct pl_deband_params {
+ // The number of debanding steps to perform per sample. Each step reduces a
+ // bit more banding, but takes time to compute. Note that the strength of
+ // each step falls off very quickly, so high numbers (>4) are practically
+ // useless. Defaults to 1.
+ int iterations;
+
+ // The debanding filter's cut-off threshold. Higher numbers increase the
+ // debanding strength dramatically, but progressively diminish image
+ // details. Defaults to 3.0.
+ float threshold;
+
+ // The debanding filter's initial radius. The radius increases linearly
+ // for each iteration. A higher radius will find more gradients, but a
+ // lower radius will smooth more aggressively. Defaults to 16.0.
+ float radius;
+
+ // Add some extra noise to the image. This significantly helps cover up
+ // remaining quantization artifacts. Higher numbers add more noise.
+ // Note: When debanding HDR sources, even a small amount of grain can
+ // result in a very big change to the brightness level. It's recommended to
+ // either scale this value down or disable it entirely for HDR.
+ //
+ // Defaults to 4.0, which is very mild.
+ float grain;
+
+ // 'Neutral' grain value for each channel being debanded (sorted in order
+ // from low to high index). Grain application will be modulated to avoid
+ // disturbing colors close to this value. Set this to a value corresponding
+ // to black in the relevant colorspace.
+ float grain_neutral[3];
+};
+
+#define PL_DEBAND_DEFAULTS \
+ .iterations = 1, \
+ .threshold = 3.0, \
+ .radius = 16.0, \
+ .grain = 4.0,
+
+#define pl_deband_params(...) (&(struct pl_deband_params) {PL_DEBAND_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_deband_params pl_deband_default_params;
+
+// Debands a given texture and returns the sampled color in `vec4 color`. If
+// `params` is left as NULL, it defaults to &pl_deband_default_params. Note
+// that `tex->params.format` must have PL_FMT_CAP_LINEAR. When the given
+// `pl_sample_src` implies scaling, this effectively performs bilinear
+// sampling on the input (but not the output).
+//
+// Note: This can also be used as a pure grain function, by setting the number
+// of iterations to 0.
+PL_API void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_deband_params *params);
+
+// Performs direct / native texture sampling, using whatever texture filter is
+// available (linear for linearly sampleable sources, nearest otherwise).
+//
+// Note: This is generally very low quality and should be avoided if possible,
+// for both upscaling and downscaling.
+PL_API bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src);
+
+// Performs hardware-accelerated nearest neighbour sampling. This is similar to
+// `pl_shader_sample_direct`, but forces nearest neighbour interpolation.
+PL_API bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src);
+
+// Performs hardware-accelerated bilinear sampling. This is similar to
+// `pl_shader_sample_direct`, but forces bilinear interpolation.
+PL_API bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src);
+
+// Optimized versions of specific, strictly positive scaler kernels that take
+// advantage of linear texture sampling to reduce the number of fetches needed
+// by a factor of four. This family of functions performs radius-2 scaling
+// with only four texture fetches, which is far more efficient than using
+// the generalized 1D scaling method. Only works well for upscaling.
+PL_API bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src);
+PL_API bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src);
+PL_API bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src);
+
+// A sampler that is similar to nearest neighbour sampling, but tries to
+// preserve pixel aspect ratios. This is mathematically equivalent to taking an
+// idealized image with square pixels, sampling it at an infinite resolution,
+// and then downscaling that to the desired resolution. (Hence it being called
+// "oversample"). Good for pixel art.
+//
+// The threshold provides a cutoff threshold below which the contribution of
+// pixels should be ignored, trading some amount of aspect ratio distortion for
+// a slightly crisper image. A value of `threshold == 0.5` makes this filter
+// equivalent to regular nearest neighbour sampling.
+PL_API bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
+ float threshold);
+
+struct pl_sample_filter_params {
+ // The filter to use for sampling.
+ struct pl_filter_config filter;
+
+ // Antiringing strength. A value of 0.0 disables antiringing, and a value
+ // of 1.0 enables full-strength antiringing. Defaults to 0.0 if
+ // unspecified.
+ //
+ // Note: Ignored if `filter.antiring` is already set to something nonzero.
+ float antiring;
+
+ // Disable the use of compute shaders (e.g. if rendering to non-storable tex)
+ bool no_compute;
+ // Disable the use of filter widening / anti-aliasing (for downscaling)
+ bool no_widening;
+
+ // This shader object is used to store the LUT, and will be recreated
+ // if necessary. To avoid thrashing the resource, users should avoid trying
+ // to re-use the same LUT for different filter configurations or scaling
+ // ratios. Must be set to a valid pointer, and the target NULL-initialized.
+ pl_shader_obj *lut;
+
+ // Deprecated / removed fields
+ int lut_entries PL_DEPRECATED; // hard-coded as 256
+ float cutoff PL_DEPRECATED; // hard-coded as 1e-3
+};
+
+#define pl_sample_filter_params(...) (&(struct pl_sample_filter_params) { __VA_ARGS__ })
+
+// Performs polar sampling. This internally chooses between an optimized compute
+// shader, and various fragment shaders, depending on the supported GLSL version
+// and GPU features. Returns whether or not it was successful.
+//
+// Note: `params->filter.polar` must be true to use this function.
+PL_API bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_sample_filter_params *params);
+
+// Performs orthogonal (1D) sampling. Using this twice in a row (once vertical
+// and once horizontal) effectively performs a 2D upscale. This is lower
+// quality than polar sampling, but significantly faster, and therefore the
+// recommended default. Returns whether or not it was successful.
+//
+// `src` must represent a scaling operation that only scales in one direction,
+// i.e. either only X or only Y. The other direction must be left unscaled.
+//
+// Note: Due to internal limitations, this may currently only be used on 2D
+// textures - even though the basic principle would work for 1D and 3D textures
+// as well.
+PL_API bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_sample_filter_params *params);
+
+struct pl_distort_params {
+ // An arbitrary 2x2 affine transformation to apply to the input image.
+ // For simplicity, the input image is explicitly centered and scaled such
+ // that the longer dimension is in [-1,1], before applying this.
+ pl_transform2x2 transform;
+
+ // If true, the texture is placed inside the center of the canvas without
+ // scaling. If false, it is effectively stretched to the canvas size.
+ bool unscaled;
+
+ // If true, the transformation is automatically scaled down and shifted to
+ // ensure that the resulting image fits inside the output canvas.
+ bool constrain;
+
+ // If true, use bicubic interpolation rather than faster bilinear
+ // interpolation. Higher quality but slower.
+ bool bicubic;
+
+ // Specifies the texture address mode to use when sampling out of bounds.
+ enum pl_tex_address_mode address_mode;
+
+ // If set, all out-of-bounds accesses will instead be treated as
+ // transparent, according to the given alpha mode. (Which should match the
+ // alpha mode of the texture)
+ //
+ // Note: `address_mode` has no effect when this is specified.
+ enum pl_alpha_mode alpha_mode;
+};
+
+#define PL_DISTORT_DEFAULTS \
+ .transform.mat.m = {{ 1, 0 }, {0, 1}},
+
+#define pl_distort_params(...) (&(struct pl_distort_params) {PL_DISTORT_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_distort_params pl_distort_default_params;
+
+// Distorts the input image using a given set of transformation parameters.
+// `out_w` and `out_h` determine the size of the effective canvas inside which
+// the distorted result may be rendered. Areas outside of this canvas will
+// be implicitly cut off.
+PL_API void pl_shader_distort(pl_shader sh, pl_tex tex, int out_w, int out_h,
+ const struct pl_distort_params *params);
+
+enum PL_DEPRECATED { // for `int pass`
+ PL_SEP_VERT = 0,
+ PL_SEP_HORIZ,
+ PL_SEP_PASSES
+};
+
+PL_API_END
+
+#endif // LIBPLACEBO_SHADERS_SAMPLING_H_
diff --git a/src/include/libplacebo/swapchain.h b/src/include/libplacebo/swapchain.h
new file mode 100644
index 0000000..b53aa5c
--- /dev/null
+++ b/src/include/libplacebo/swapchain.h
@@ -0,0 +1,171 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_SWAPCHAIN_H_
+#define LIBPLACEBO_SWAPCHAIN_H_
+
+#include <libplacebo/common.h>
+#include <libplacebo/colorspace.h>
+#include <libplacebo/gpu.h>
+
+PL_API_BEGIN
+
+// This abstraction represents a low-level interface to visible surfaces
+// exposed by a graphics API (and accompanying GPU instance), allowing users to
+// directly present frames to the screen (or window, typically). This is a
+// sister API to gpu.h and follows the same convention w.r.t undefined behavior.
+//
+// Thread-safety: Safe
+typedef const struct pl_swapchain_t {
+ pl_log log;
+ pl_gpu gpu;
+} *pl_swapchain;
+
+// Destroys this swapchain. May be used at any time, and may block until the
+// completion of all outstanding rendering commands. The swapchain and any
+// resources retrieved from it must not be used afterwards.
+PL_API void pl_swapchain_destroy(pl_swapchain *sw);
+
+// Returns the approximate current swapchain latency in vsyncs, or 0 if
+// unknown. A latency of 1 means that `submit_frame` followed by `swap_buffers`
+// will block until the just-submitted frame has finished rendering. Typical
+// values are 2 or 3, which enable better pipelining by allowing the GPU to be
+// processing one or two frames at the same time as the user is preparing the
+// next for submission.
+PL_API int pl_swapchain_latency(pl_swapchain sw);
+
+// Update/query the swapchain size. This function performs both roles: it tries
+// setting the swapchain size to the values requested by the user, and returns
+// in the same variables what width/height the swapchain was actually set to -
+// which may be (substantially) different from the values requested by the
+// user. A value of 0 means "unknown/none" (in which case, libplacebo won't try
+// updating the size - it will simply return the current state of the
+// swapchain). It's also possible for libplacebo to return values of 0, such as
+// in the case that the swapchain doesn't exist yet.
+//
+// Returns false on significant errors (e.g. dead surface). This function can
+// effectively be used to probe if creating a swapchain works.
+PL_API bool pl_swapchain_resize(pl_swapchain sw, int *width, int *height);
+
+// Backwards compatibility
+#define pl_swapchain_colors pl_color_space
+
+// Inform the swapchain about the input color space. This API deliberately
+// provides no feedback, because the swapchain can internally decide what to do
+// with this information, including ignoring it entirely, or applying it
+// asynchronously. Users must still base their rendering on the value of
+// `pl_swapchain_frame.color_space`.
+//
+// Note: Calling this function a second time completely overrides any
+// previously specified hint. So calling this on {0} or NULL resets the
+// swapchain back to its initial/preferred colorspace.
+//
+// Note: If `csp->transfer` is a HDR transfer curve but HDR metadata is left
+// unspecified, the HDR metadata defaults to `pl_hdr_metadata_hdr10`.
+// Conversely, if the HDR metadata is non-empty but `csp->transfer` is left as
+// PL_COLOR_TRC_UNKNOWN, then it instead defaults to PL_COLOR_TRC_PQ.
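+//
+// For example, to hint a PQ / BT.2020 (HDR10-style) output, leaving the HDR
+// metadata at its `pl_hdr_metadata_hdr10` default (sketch only):
+//
+//     pl_swapchain_colorspace_hint(sw, &(struct pl_color_space) {
+//         .primaries = PL_COLOR_PRIM_BT_2020,
+//         .transfer  = PL_COLOR_TRC_PQ,
+//     });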
+PL_API void pl_swapchain_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp);
+
+// The struct used to hold the results of `pl_swapchain_start_frame`
+struct pl_swapchain_frame {
+ // A texture representing the framebuffer users should use for rendering.
+ // It's guaranteed that `fbo->params.renderable` and `fbo->params.blit_dst`
+ // will be true, but no other guarantees are made - not even that
+ // `fbo->params.format` is a real format.
+ pl_tex fbo;
+
+ // If true, the user should assume that this framebuffer will be flipped
+ // as a result of presenting it on-screen. If false, nothing special needs
+ // to be done - but if true, users should flip the coordinate system of
+ // the `pl_pass` that is rendering to this framebuffer.
+ //
+ // Note: Normally, libplacebo follows the convention that (0,0) represents
+ // the top left of the image/screen. So when flipped is true, this means
+ // (0,0) on this framebuffer gets displayed as the bottom left of the image.
+ bool flipped;
+
+ // Indicates the color representation this framebuffer will be interpreted
+ // as by the host system / compositor / display, including the bit depth
+ // and alpha handling (where available).
+ struct pl_color_repr color_repr;
+ struct pl_color_space color_space;
+};
+
+// Retrieve a new frame from the swapchain. Returns whether successful. It's
+// worth noting that this function can fail sporadically for benign reasons,
+// for example the window being invisible or inaccessible. This function may
+// block until an image is available, which may be the case if the GPU is
+// rendering frames significantly faster than the display can output them. It
+// may also be non-blocking, so users shouldn't rely on this call alone in
+// order to meter rendering speed. (Specifics depend on the underlying graphics
+// API)
+PL_API bool pl_swapchain_start_frame(pl_swapchain sw, struct pl_swapchain_frame *out_frame);
+
+// Submits the previously started frame. Non-blocking. This must be issued in
+// lockstep with pl_swapchain_start_frame - there is no way to start multiple
+// frames and submit them out-of-order. The frames submitted this way will
+// generally be made visible in a first-in first-out fashion, although
+// specifics depend on the mechanism used to create the pl_swapchain. (See the
+// platform-specific APIs for more info).
+//
+// Returns whether successful. This should normally never fail, unless the
+// GPU/surface has been lost or some other critical error has occurred. The
+// "started" frame is consumed even in the event of failure.
+//
+// Note that `start_frame` and `submit_frame` form a lock pair, i.e. trying to
+// call e.g. `pl_swapchain_resize` from another thread will block until
+// `pl_swapchain_submit_frame` is finished.
+PL_API bool pl_swapchain_submit_frame(pl_swapchain sw);
+
+// Performs a "buffer swap", or some generalization of the concept. In layman's
+// terms, this blocks until the execution of the Nth previously submitted frame
+// has been "made complete" in some sense. (The N derives from the swapchain's
+// built-in latency. See `pl_swapchain_latency` for more information).
+//
+// Users should include this call in their rendering loops in order to make
+// sure they aren't submitting rendering commands faster than the GPU can
+// process them, which could otherwise lead to a queue overrun or memory
+// exhaustion.
+//
+// An example loop might look like this:
+//
+// while (rendering) {
+// struct pl_swapchain_frame frame;
+// bool ok = pl_swapchain_start_frame(swapchain, &frame);
+// if (!ok) {
+// /* wait some time, or decide to stop rendering */
+// continue;
+// }
+//
+// /* do some rendering with frame.fbo */
+//
+// ok = pl_swapchain_submit_frame(swapchain);
+// if (!ok)
+// break;
+//
+// pl_swapchain_swap_buffers(swapchain);
+// }
+//
+// The duration this function blocks for, if at all, may be very inconsistent
+// and should not be used as an authoritative source of vsync timing
+// information without sufficient smoothing/filtering (and if so, the time that
+// `start_frame` blocked for should also be included).
+PL_API void pl_swapchain_swap_buffers(pl_swapchain sw);
+
+PL_API_END
+
+#endif // LIBPLACEBO_SWAPCHAIN_H_
diff --git a/src/include/libplacebo/tone_mapping.h b/src/include/libplacebo/tone_mapping.h
new file mode 100644
index 0000000..48f1eb7
--- /dev/null
+++ b/src/include/libplacebo/tone_mapping.h
@@ -0,0 +1,268 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_TONE_MAPPING_H_
+#define LIBPLACEBO_TONE_MAPPING_H_
+
+#include <stddef.h>
+#include <stdbool.h>
+
+#include <libplacebo/common.h>
+#include <libplacebo/colorspace.h>
+
+PL_API_BEGIN
+
+struct pl_tone_map_params;
+struct pl_tone_map_function {
+ const char *name; // Identifier
+ const char *description; // Friendly / longer name
+
+ // This controls the type of values input/output to/from `map`
+ enum pl_hdr_scaling scaling;
+
+ // The tone-mapping function itself. Iterates over all values in `lut`, and
+ // adapts them as needed.
+ //
+ // Note that the `params` struct fed into this function is guaranteed to
+ // satisfy `params->input_scaling == params->output_scaling == scaling`,
+ // and also obeys `params->input_max >= params->output_max`.
+ void (*map)(float *lut, const struct pl_tone_map_params *params);
+
+ // Inverse tone mapping function. Optional. If absent, this tone mapping
+ // curve only works in the forwards direction.
+ //
+ // For this function, `params->input_max <= params->output_max`.
+ void (*map_inverse)(float *lut, const struct pl_tone_map_params *params);
+
+ // Private data. Unused by libplacebo, but may be accessed by `map`.
+ void *priv;
+
+ // --- Deprecated fields
+ const char *param_desc PL_DEPRECATED;
+ float param_min PL_DEPRECATED;
+ float param_def PL_DEPRECATED;
+ float param_max PL_DEPRECATED;
+};
+
+struct pl_tone_map_constants {
+ // Configures the knee point, as a ratio between the source average and
+ // target average (in PQ space). An adaptation of 1.0 always adapts the
+ // source scene average brightness to the (scaled) target average,
+ // while a value of 0.0 never modifies scene brightness. [0,1]
+ //
+ // Affects all methods that use the ST2094 knee point determination
+ // (currently ST2094-40, ST2094-10 and spline)
+ float knee_adaptation;
+
+ // Configures the knee point minimum and maximum, respectively, as
+ // a percentage of the PQ luminance range. Provides a hard limit on the
+ // knee point chosen by `knee_adaptation`.
+ float knee_minimum; // (0, 0.5)
+ float knee_maximum; // (0.5, 1.0)
+
+ // Default knee point to use in the absence of source scene average
+ // metadata. Normally, this is ignored in favor of picking the knee
+ // point as the (relative) source scene average brightness level.
+ float knee_default; // [knee_minimum, knee_maximum]
+
+ // Knee point offset (for BT.2390 only). Note that a value of 0.5 is
+ // the spec-defined default behavior, which differs from the libplacebo
+ // default of 1.0. [0.5, 2]
+ float knee_offset;
+
+ // For the single-pivot polynomial (spline) function, this controls the
+ // coefficients used to tune the slope of the curve. This tuning is designed
+ // to make the slope closer to 1.0 when the difference in peaks is low,
+ // and closer to linear when the difference between peaks is high.
+ float slope_tuning; // [0,10]
+ float slope_offset; // [0,1]
+
+ // Contrast setting for the spline function. Higher values make the curve
+ // steeper (closer to `clip`), preserving midtones at the cost of losing
+    // shadow/highlight details, while lower values make the curve shallower
+    // (closer to `linear`), preserving highlights at the cost of losing midtone
+ // contrast. Values above 1.0 are possible, resulting in an output with more
+ // contrast than the input.
+ float spline_contrast; // [0,1.5]
+
+ // For the reinhard function, this specifies the local contrast coefficient
+ // at the display peak. Essentially, a value of 0.5 implies that the
+ // reference white will be about half as bright as when clipping. (0,1)
+ float reinhard_contrast;
+
+ // For legacy functions (mobius, gamma) which operate on linear light, this
+ // directly sets the corresponding knee point. (0,1)
+ float linear_knee;
+
+ // For linear methods (linear, linearlight), this controls the linear
+ // exposure/gain applied to the image. (0,10]
+ float exposure;
+};
+
+#define PL_TONE_MAP_CONSTANTS \
+ .knee_adaptation = 0.4f, \
+ .knee_minimum = 0.1f, \
+ .knee_maximum = 0.8f, \
+ .knee_default = 0.4f, \
+ .knee_offset = 1.0f, \
+ .slope_tuning = 1.5f, \
+ .slope_offset = 0.2f, \
+ .spline_contrast = 0.5f, \
+ .reinhard_contrast = 0.5f, \
+ .linear_knee = 0.3f, \
+ .exposure = 1.0f,
+
+struct pl_tone_map_params {
+ // If `function` is NULL, defaults to `pl_tone_map_clip`.
+ const struct pl_tone_map_function *function;
+
+ // Common constants, should be initialized to PL_TONE_MAP_CONSTANTS if
+ // not intending to override them further.
+ struct pl_tone_map_constants constants;
+
+ // The desired input/output scaling of the tone map. If this differs from
+ // `function->scaling`, any required conversion will be performed.
+ //
+ // Note that to maximize LUT efficiency, it's *highly* recommended to use
+ // either PL_HDR_PQ or PL_HDR_SQRT as the input scaling, except when
+ // using `pl_tone_map_sample`.
+ enum pl_hdr_scaling input_scaling;
+ enum pl_hdr_scaling output_scaling;
+
+ // The size of the resulting LUT. (For `pl_tone_map_generate` only)
+ size_t lut_size;
+
+ // The characteristics of the input, in `input_scaling` units.
+ float input_min;
+ float input_max;
+ float input_avg; // or 0 if unknown
+
+ // The desired characteristics of the output, in `output_scaling` units.
+ float output_min;
+ float output_max;
+
+ // The input HDR metadata. Only used by a select few tone-mapping
+ // functions, currently only SMPTE ST2094. (Optional)
+ struct pl_hdr_metadata hdr;
+
+ // --- Deprecated fields
+ float param PL_DEPRECATED; // see `constants`
+};
+
+#define pl_tone_map_params(...) (&(struct pl_tone_map_params) { __VA_ARGS__ })
+
+// Note: Only does pointer equality testing on `function`
+PL_API bool pl_tone_map_params_equal(const struct pl_tone_map_params *a,
+ const struct pl_tone_map_params *b);
+
+// Clamps/defaults the parameters, including input/output maximum.
+PL_API void pl_tone_map_params_infer(struct pl_tone_map_params *params);
+
+// Returns true if the given tone mapping configuration effectively represents
+// a no-op configuration. Tone mapping can be skipped in this case (although
+// strictly speaking, the LUT would still clip illegal input values)
+PL_API bool pl_tone_map_params_noop(const struct pl_tone_map_params *params);
+
+// Generate a tone-mapping LUT for a given configuration. This will always
+// span the entire input range, as given by `input_min` and `input_max`.
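+//
+// For illustration only, a small LUT could be generated like this (the
+// numeric ranges are made up; real nit values can be converted to PQ with
+// the helpers from colorspace.h):
+//
+//     float lut[256];
+//     struct pl_tone_map_params params = {
+//         .function       = &pl_tone_map_bt2390,
+//         .constants      = { PL_TONE_MAP_CONSTANTS },
+//         .input_scaling  = PL_HDR_PQ,
+//         .output_scaling = PL_HDR_PQ,
+//         .lut_size       = 256,
+//         .input_max      = 0.75f, // approx. PQ-encoded 1000 nits
+//         .output_max     = 0.58f, // approx. PQ-encoded 203 nits
+//     };
+//     pl_tone_map_params_infer(&params);
+//     pl_tone_map_generate(lut, &params);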
+PL_API void pl_tone_map_generate(float *out, const struct pl_tone_map_params *params);
+
+// Samples a tone mapping function at a single position. Note that this is less
+// efficient than `pl_tone_map_generate` for generating multiple values.
+//
+// Ignores `params->lut_size`.
+PL_API float pl_tone_map_sample(float x, const struct pl_tone_map_params *params);
+
+// Performs no tone-mapping, just clips out-of-range colors. Retains perfect
+// color accuracy for in-range colors but completely destroys out-of-range
+// information. Does not perform any black point adaptation.
+PL_API extern const struct pl_tone_map_function pl_tone_map_clip;
+
+// EETF from SMPTE ST 2094-40 Annex B, which uses the provided OOTF based on
+// Bezier curves to perform tone-mapping. The OOTF used is adjusted based on
+// the ratio between the targeted and actual display peak luminances. In the
+// absence of HDR10+ metadata, falls back to a simple constant bezier curve.
+PL_API extern const struct pl_tone_map_function pl_tone_map_st2094_40;
+
+// EETF from SMPTE ST 2094-10 Annex B.2, which takes into account the input
+// signal average luminance in addition to the maximum/minimum.
+//
+// Note: This does *not* currently include the subjective gain/offset/gamma
+// controls defined in Annex B.3. (Open an issue with a valid sample file if
+// you want such parameters to be respected.)
+PL_API extern const struct pl_tone_map_function pl_tone_map_st2094_10;
+
+// EETF from the ITU-R Report BT.2390, a hermite spline roll-off with linear
+// segment.
+PL_API extern const struct pl_tone_map_function pl_tone_map_bt2390;
+
+// EETF from ITU-R Report BT.2446, method A. Can be used for both forward
+// and inverse tone mapping.
+PL_API extern const struct pl_tone_map_function pl_tone_map_bt2446a;
+
+// Simple spline consisting of two polynomials, joined by a single pivot point,
+// which is tuned based on the source scene average brightness (taking into
+// account dynamic metadata if available). This function can be used
+// for both forward and inverse tone mapping.
+PL_API extern const struct pl_tone_map_function pl_tone_map_spline;
+
+// Very simple non-linear curve. Named after Erik Reinhard.
+PL_API extern const struct pl_tone_map_function pl_tone_map_reinhard;
+
+// Generalization of the reinhard tone mapping algorithm to support an
+// additional linear slope near black. The name is derived from its function
+// shape (ax+b)/(cx+d), which is known as a Möbius transformation.
+PL_API extern const struct pl_tone_map_function pl_tone_map_mobius;
+
+// Piece-wise, filmic tone-mapping algorithm developed by John Hable for use in
+// Uncharted 2, inspired by a similar tone-mapping algorithm used by Kodak.
+// Popularized by its use in video games with HDR rendering. Preserves both
+// dark and bright details very well, but comes with the drawback of changing
+// the average brightness quite significantly. This is sort of similar to
+// pl_tone_map_reinhard with `reinhard_contrast=0.24`.
+PL_API extern const struct pl_tone_map_function pl_tone_map_hable;
+
+// Fits a gamma (power) function to transfer between the source and target
+// color spaces, effectively resulting in a perceptual hard-knee joining two
+// roughly linear sections. This preserves details at all scales, but can result
+// in an image with a muted or dull appearance.
+PL_API extern const struct pl_tone_map_function pl_tone_map_gamma;
+
+// Linearly stretches the input range to the output range, in PQ space. This
+// will preserve all details accurately, but results in a significantly
+// different average brightness. Can be used for inverse tone-mapping in
+// addition to regular tone-mapping.
+PL_API extern const struct pl_tone_map_function pl_tone_map_linear;
+
+// Like `pl_tone_map_linear`, but in linear light (instead of PQ). Works well
+// for small range adjustments but may cause severe darkening when
+// downconverting from e.g. 10k nits to SDR.
+PL_API extern const struct pl_tone_map_function pl_tone_map_linear_light;
+
+// A list of built-in tone mapping functions, terminated by NULL
+PL_API extern const struct pl_tone_map_function * const pl_tone_map_functions[];
+PL_API extern const int pl_num_tone_map_functions; // excluding trailing NULL
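+
+// For example, the built-in functions can be enumerated like this (sketch):
+//
+//     for (int i = 0; i < pl_num_tone_map_functions; i++)
+//         printf("%s: %s\n", pl_tone_map_functions[i]->name,
+//                pl_tone_map_functions[i]->description);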
+
+// Find the tone mapping function with the given name, or NULL on failure.
+PL_API const struct pl_tone_map_function *pl_find_tone_map_function(const char *name);
+
+// Deprecated alias, do not use
+#define pl_tone_map_auto pl_tone_map_spline
+
+PL_API_END
+
+#endif // LIBPLACEBO_TONE_MAPPING_H_
diff --git a/src/include/libplacebo/utils/dav1d.h b/src/include/libplacebo/utils/dav1d.h
new file mode 100644
index 0000000..ece97c5
--- /dev/null
+++ b/src/include/libplacebo/utils/dav1d.h
@@ -0,0 +1,129 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_DAV1D_H_
+#define LIBPLACEBO_DAV1D_H_
+
+#include <libplacebo/gpu.h>
+#include <libplacebo/utils/upload.h>
+#include <dav1d/dav1d.h>
+
+#if defined(__cplusplus) && !defined(PL_DAV1D_IMPLEMENTATION)
+# define PL_DAV1D_API
+# define PL_DAV1D_IMPLEMENTATION 0
+# warning Remember to include this file with PL_DAV1D_IMPLEMENTATION set to 1 in \
+  a C translation unit to provide the implementation. Suppress this warning by \
+  defining PL_DAV1D_IMPLEMENTATION to 0 in C++ files.
+#elif !defined(PL_DAV1D_IMPLEMENTATION)
+# define PL_DAV1D_API static inline
+# define PL_DAV1D_IMPLEMENTATION 1
+#else
+# define PL_DAV1D_API
+#endif
+
+PL_API_BEGIN
+
+// Fill in the details of a `pl_frame` from a Dav1dPicture. This function will
+// explicitly clear `out_frame`, setting all extra fields to 0. After this
+// function returns, the only missing data is information related to the plane
+// texture itself (`planes[N].texture`).
+//
+// Note: This will include all possible metadata, including HDR metadata and
+// AV1 film grain data. Users should explicitly clear this out if undesired.
+PL_DAV1D_API void pl_frame_from_dav1dpicture(struct pl_frame *out_frame,
+ const Dav1dPicture *picture);
+
+// Helper function to generate a `pl_color_space` struct from a Dav1dPicture.
+// Useful to update the swapchain colorspace mode dynamically (e.g. for HDR).
+PL_DAV1D_API void pl_swapchain_colors_from_dav1dpicture(struct pl_color_space *out_colors,
+ const Dav1dPicture *picture);
+
+struct pl_dav1d_upload_params {
+ // The picture to upload. Not modified unless `asynchronous` is true.
+ Dav1dPicture *picture;
+
+ // If true, film grain present in `picture` will be exported to the
+ // `pl_frame` as well. This should be set to false unless the user has
+ // disabled `Dav1dSettings.apply_grain`.
+ bool film_grain;
+
+ // If true, libplacebo will probe for the allocation metadata set by
+ // `pl_allocate_dav1dpicture`, and directly import the attached buffers
+ // (saving a memcpy in some cases). Has no effect if the Dav1dPicture was
+ // not allocated using `pl_allocate_dav1dpicture`.
+ //
+ // Note: When this is the case, `asynchronous` has no further effect -
+ // uploads from attached buffers are already asynchronous.
+ bool gpu_allocated;
+
+ // If true, `picture` will be asynchronously uploaded and unref'd
+ // internally by libplacebo, and the struct passed by the user cleared to
+ // {0}. This is needed to avoid `memcpy` in some cases, so setting it to
+ // true is highly recommended wherever possible.
+ //
+ // Note: If `pl_upload_dav1dpicture` returns false, `picture` does not get
+ // unref'd.
+ bool asynchronous;
+};
+
+#define pl_dav1d_upload_params(...) (&(struct pl_dav1d_upload_params) { __VA_ARGS__ })
+
+// Very high level helper function to take a `Dav1dPicture` and upload it to
+// the GPU. Similar in spirit to `pl_upload_plane`, and the same notes apply.
+// `tex` must be an array of 3 pointers of type `pl_tex`, each
+// either pointing to a valid texture, or NULL. Returns whether successful.
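+//
+// A rough usage sketch (`gpu` and a decoded `pic` are assumed to exist;
+// error handling omitted):
+//
+//     pl_tex tex[3] = {0};
+//     struct pl_frame frame;
+//     bool ok = pl_upload_dav1dpicture(gpu, &frame, tex, pl_dav1d_upload_params(
+//         .picture      = &pic,
+//         .asynchronous = true,
+//     ));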
+PL_DAV1D_API bool pl_upload_dav1dpicture(pl_gpu gpu,
+ struct pl_frame *out_frame, pl_tex tex[3],
+ const struct pl_dav1d_upload_params *params);
+
+// Allocate a Dav1dPicture from persistently mapped buffers. This can be more
+// efficient than regular Dav1dPictures, especially when using the synchronous
+// `pl_upload_dav1dpicture`, or on platforms that don't support importing
+// PL_HANDLE_HOST_PTR as buffers. Returns 0 or a negative DAV1D_ERR value.
+//
+// Note: These may only be used directly as a Dav1dPicAllocator if the `pl_gpu`
+// passed as the `cookie` has `pl_gpu.limits.thread_safe` set. Otherwise, the
+// user must manually synchronize these calls to ensure they run on the correct
+// thread.
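+//
+// For example, these can be hooked up to dav1d roughly as follows (sketch
+// only; assumes `gpu->limits.thread_safe` is set):
+//
+//     Dav1dSettings settings;
+//     dav1d_default_settings(&settings);
+//     settings.allocator = (Dav1dPicAllocator) {
+//         .cookie                   = (void *) gpu,
+//         .alloc_picture_callback   = pl_allocate_dav1dpicture,
+//         .release_picture_callback = pl_release_dav1dpicture,
+//     };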
+PL_DAV1D_API int pl_allocate_dav1dpicture(Dav1dPicture *picture, void *gpu);
+PL_DAV1D_API void pl_release_dav1dpicture(Dav1dPicture *picture, void *gpu);
+
+// Mapping functions for the various Dav1dColor* enums. Note that these are not
+// quite 1:1, and even for values that exist in both, the semantics sometimes
+// differ. Some special cases (e.g. ICtCp, or XYZ) are handled differently in
+// libplacebo and libdav1d, respectively.
+PL_DAV1D_API enum pl_color_system pl_system_from_dav1d(enum Dav1dMatrixCoefficients mc);
+PL_DAV1D_API enum Dav1dMatrixCoefficients pl_system_to_dav1d(enum pl_color_system sys);
+PL_DAV1D_API enum pl_color_levels pl_levels_from_dav1d(int color_range);
+PL_DAV1D_API int pl_levels_to_dav1d(enum pl_color_levels levels);
+PL_DAV1D_API enum pl_color_primaries pl_primaries_from_dav1d(enum Dav1dColorPrimaries prim);
+PL_DAV1D_API enum Dav1dColorPrimaries pl_primaries_to_dav1d(enum pl_color_primaries prim);
+PL_DAV1D_API enum pl_color_transfer pl_transfer_from_dav1d(enum Dav1dTransferCharacteristics trc);
+PL_DAV1D_API enum Dav1dTransferCharacteristics pl_transfer_to_dav1d(enum pl_color_transfer trc);
+PL_DAV1D_API enum pl_chroma_location pl_chroma_from_dav1d(enum Dav1dChromaSamplePosition loc);
+PL_DAV1D_API enum Dav1dChromaSamplePosition pl_chroma_to_dav1d(enum pl_chroma_location loc);
+
+
+// Actual implementation, included as part of this header to avoid having
+// a compile-time dependency on libdav1d.
+#if PL_DAV1D_IMPLEMENTATION
+# include <libplacebo/utils/dav1d_internal.h>
+#endif
+
+PL_API_END
+
+#endif // LIBPLACEBO_DAV1D_H_
diff --git a/src/include/libplacebo/utils/dav1d_internal.h b/src/include/libplacebo/utils/dav1d_internal.h
new file mode 100644
index 0000000..2e0512a
--- /dev/null
+++ b/src/include/libplacebo/utils/dav1d_internal.h
@@ -0,0 +1,613 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_DAV1D_H_
+#error This header should be included as part of <libplacebo/utils/dav1d.h>
+#elif defined(__cplusplus)
+#error This header cannot be included from C++; define PL_DAV1D_IMPLEMENTATION appropriately
+#else
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+PL_DAV1D_API enum pl_color_system pl_system_from_dav1d(enum Dav1dMatrixCoefficients mc)
+{
+ switch (mc) {
+ case DAV1D_MC_IDENTITY: return PL_COLOR_SYSTEM_RGB; // or XYZ (unlikely)
+ case DAV1D_MC_BT709: return PL_COLOR_SYSTEM_BT_709;
+ case DAV1D_MC_UNKNOWN: return PL_COLOR_SYSTEM_UNKNOWN;
+ case DAV1D_MC_FCC: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+ case DAV1D_MC_BT470BG: return PL_COLOR_SYSTEM_BT_601;
+ case DAV1D_MC_BT601: return PL_COLOR_SYSTEM_BT_601;
+ case DAV1D_MC_SMPTE240: return PL_COLOR_SYSTEM_SMPTE_240M;
+ case DAV1D_MC_SMPTE_YCGCO: return PL_COLOR_SYSTEM_YCGCO;
+ case DAV1D_MC_BT2020_NCL: return PL_COLOR_SYSTEM_BT_2020_NC;
+ case DAV1D_MC_BT2020_CL: return PL_COLOR_SYSTEM_BT_2020_C;
+ case DAV1D_MC_SMPTE2085: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+ case DAV1D_MC_CHROMAT_NCL: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+ case DAV1D_MC_CHROMAT_CL: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+ // Note: this colorspace is confused between PQ and HLG, which dav1d
+ // requires inferring from other sources, but libplacebo makes
+ // explicit. Default to PQ as it's the more common scenario.
+ case DAV1D_MC_ICTCP: return PL_COLOR_SYSTEM_BT_2100_PQ;
+ case DAV1D_MC_RESERVED: abort();
+ }
+
+ return PL_COLOR_SYSTEM_UNKNOWN;
+}
+
+PL_DAV1D_API enum Dav1dMatrixCoefficients pl_system_to_dav1d(enum pl_color_system sys)
+{
+ switch (sys) {
+ case PL_COLOR_SYSTEM_UNKNOWN: return DAV1D_MC_UNKNOWN;
+ case PL_COLOR_SYSTEM_BT_601: return DAV1D_MC_BT601;
+ case PL_COLOR_SYSTEM_BT_709: return DAV1D_MC_BT709;
+ case PL_COLOR_SYSTEM_SMPTE_240M: return DAV1D_MC_SMPTE240;
+ case PL_COLOR_SYSTEM_BT_2020_NC: return DAV1D_MC_BT2020_NCL;
+ case PL_COLOR_SYSTEM_BT_2020_C: return DAV1D_MC_BT2020_CL;
+ case PL_COLOR_SYSTEM_BT_2100_PQ: return DAV1D_MC_ICTCP;
+ case PL_COLOR_SYSTEM_BT_2100_HLG: return DAV1D_MC_ICTCP;
+ case PL_COLOR_SYSTEM_DOLBYVISION: return DAV1D_MC_UNKNOWN; // missing
+ case PL_COLOR_SYSTEM_YCGCO: return DAV1D_MC_SMPTE_YCGCO;
+ case PL_COLOR_SYSTEM_RGB: return DAV1D_MC_IDENTITY;
+ case PL_COLOR_SYSTEM_XYZ: return DAV1D_MC_IDENTITY;
+ case PL_COLOR_SYSTEM_COUNT: abort();
+ }
+
+ return DAV1D_MC_UNKNOWN;
+}
+
+PL_DAV1D_API enum pl_color_levels pl_levels_from_dav1d(int color_range)
+{
+ return color_range ? PL_COLOR_LEVELS_FULL : PL_COLOR_LEVELS_LIMITED;
+}
+
+PL_DAV1D_API int pl_levels_to_dav1d(enum pl_color_levels levels)
+{
+ return levels == PL_COLOR_LEVELS_FULL;
+}
+
+PL_DAV1D_API enum pl_color_primaries pl_primaries_from_dav1d(enum Dav1dColorPrimaries prim)
+{
+ switch (prim) {
+ case DAV1D_COLOR_PRI_BT709: return PL_COLOR_PRIM_BT_709;
+ case DAV1D_COLOR_PRI_UNKNOWN: return PL_COLOR_PRIM_UNKNOWN;
+ case DAV1D_COLOR_PRI_RESERVED: return PL_COLOR_PRIM_UNKNOWN;
+ case DAV1D_COLOR_PRI_BT470M: return PL_COLOR_PRIM_BT_470M;
+ case DAV1D_COLOR_PRI_BT470BG: return PL_COLOR_PRIM_BT_601_625;
+ case DAV1D_COLOR_PRI_BT601: return PL_COLOR_PRIM_BT_601_525;
+ case DAV1D_COLOR_PRI_SMPTE240: return PL_COLOR_PRIM_BT_601_525;
+ case DAV1D_COLOR_PRI_FILM: return PL_COLOR_PRIM_FILM_C;
+ case DAV1D_COLOR_PRI_BT2020: return PL_COLOR_PRIM_BT_2020;
+ case DAV1D_COLOR_PRI_XYZ: return PL_COLOR_PRIM_UNKNOWN;
+ case DAV1D_COLOR_PRI_SMPTE431: return PL_COLOR_PRIM_DCI_P3;
+ case DAV1D_COLOR_PRI_SMPTE432: return PL_COLOR_PRIM_DISPLAY_P3;
+ case DAV1D_COLOR_PRI_EBU3213: return PL_COLOR_PRIM_EBU_3213;
+ }
+
+ return PL_COLOR_PRIM_UNKNOWN;
+}
+
+PL_DAV1D_API enum Dav1dColorPrimaries pl_primaries_to_dav1d(enum pl_color_primaries prim)
+{
+ switch (prim) {
+ case PL_COLOR_PRIM_UNKNOWN: return DAV1D_COLOR_PRI_UNKNOWN;
+ case PL_COLOR_PRIM_BT_601_525: return DAV1D_COLOR_PRI_BT601;
+ case PL_COLOR_PRIM_BT_601_625: return DAV1D_COLOR_PRI_BT470BG;
+ case PL_COLOR_PRIM_BT_709: return DAV1D_COLOR_PRI_BT709;
+ case PL_COLOR_PRIM_BT_470M: return DAV1D_COLOR_PRI_BT470M;
+ case PL_COLOR_PRIM_EBU_3213: return DAV1D_COLOR_PRI_EBU3213;
+ case PL_COLOR_PRIM_BT_2020: return DAV1D_COLOR_PRI_BT2020;
+ case PL_COLOR_PRIM_APPLE: return DAV1D_COLOR_PRI_UNKNOWN; // missing
+ case PL_COLOR_PRIM_ADOBE: return DAV1D_COLOR_PRI_UNKNOWN; // missing
+ case PL_COLOR_PRIM_PRO_PHOTO: return DAV1D_COLOR_PRI_UNKNOWN; // missing
+ case PL_COLOR_PRIM_CIE_1931: return DAV1D_COLOR_PRI_UNKNOWN; // missing
+ case PL_COLOR_PRIM_DCI_P3: return DAV1D_COLOR_PRI_SMPTE431;
+ case PL_COLOR_PRIM_DISPLAY_P3: return DAV1D_COLOR_PRI_SMPTE432;
+ case PL_COLOR_PRIM_V_GAMUT: return DAV1D_COLOR_PRI_UNKNOWN; // missing
+ case PL_COLOR_PRIM_S_GAMUT: return DAV1D_COLOR_PRI_UNKNOWN; // missing
+ case PL_COLOR_PRIM_FILM_C: return DAV1D_COLOR_PRI_FILM;
+ case PL_COLOR_PRIM_ACES_AP0: return DAV1D_COLOR_PRI_UNKNOWN; // missing
+ case PL_COLOR_PRIM_ACES_AP1: return DAV1D_COLOR_PRI_UNKNOWN; // missing
+ case PL_COLOR_PRIM_COUNT: abort();
+ }
+
+ return DAV1D_COLOR_PRI_UNKNOWN;
+}
+
+PL_DAV1D_API enum pl_color_transfer pl_transfer_from_dav1d(enum Dav1dTransferCharacteristics trc)
+{
+ switch (trc) {
+ case DAV1D_TRC_BT709: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case DAV1D_TRC_UNKNOWN: return PL_COLOR_TRC_UNKNOWN;
+ case DAV1D_TRC_BT470M: return PL_COLOR_TRC_GAMMA22;
+ case DAV1D_TRC_BT470BG: return PL_COLOR_TRC_GAMMA28;
+ case DAV1D_TRC_BT601: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case DAV1D_TRC_SMPTE240: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case DAV1D_TRC_LINEAR: return PL_COLOR_TRC_LINEAR;
+ case DAV1D_TRC_LOG100: return PL_COLOR_TRC_UNKNOWN; // missing
+ case DAV1D_TRC_LOG100_SQRT10: return PL_COLOR_TRC_UNKNOWN; // missing
+ case DAV1D_TRC_IEC61966: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+    case DAV1D_TRC_BT1361:       return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case DAV1D_TRC_SRGB: return PL_COLOR_TRC_SRGB;
+ case DAV1D_TRC_BT2020_10BIT: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case DAV1D_TRC_BT2020_12BIT: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case DAV1D_TRC_SMPTE2084: return PL_COLOR_TRC_PQ;
+ case DAV1D_TRC_SMPTE428: return PL_COLOR_TRC_ST428;
+ case DAV1D_TRC_HLG: return PL_COLOR_TRC_HLG;
+ case DAV1D_TRC_RESERVED: abort();
+ }
+
+ return PL_COLOR_TRC_UNKNOWN;
+}
+
+PL_DAV1D_API enum Dav1dTransferCharacteristics pl_transfer_to_dav1d(enum pl_color_transfer trc)
+{
+ switch (trc) {
+ case PL_COLOR_TRC_UNKNOWN: return DAV1D_TRC_UNKNOWN;
+ case PL_COLOR_TRC_BT_1886: return DAV1D_TRC_BT709; // EOTF != OETF
+ case PL_COLOR_TRC_SRGB: return DAV1D_TRC_SRGB;
+ case PL_COLOR_TRC_LINEAR: return DAV1D_TRC_LINEAR;
+ case PL_COLOR_TRC_GAMMA18: return DAV1D_TRC_UNKNOWN; // missing
+ case PL_COLOR_TRC_GAMMA20: return DAV1D_TRC_UNKNOWN; // missing
+ case PL_COLOR_TRC_GAMMA22: return DAV1D_TRC_BT470M;
+ case PL_COLOR_TRC_GAMMA24: return DAV1D_TRC_UNKNOWN; // missing
+ case PL_COLOR_TRC_GAMMA26: return DAV1D_TRC_UNKNOWN; // missing
+ case PL_COLOR_TRC_GAMMA28: return DAV1D_TRC_BT470BG;
+ case PL_COLOR_TRC_ST428: return DAV1D_TRC_SMPTE428;
+ case PL_COLOR_TRC_PRO_PHOTO: return DAV1D_TRC_UNKNOWN; // missing
+ case PL_COLOR_TRC_PQ: return DAV1D_TRC_SMPTE2084;
+ case PL_COLOR_TRC_HLG: return DAV1D_TRC_HLG;
+ case PL_COLOR_TRC_V_LOG: return DAV1D_TRC_UNKNOWN; // missing
+ case PL_COLOR_TRC_S_LOG1: return DAV1D_TRC_UNKNOWN; // missing
+ case PL_COLOR_TRC_S_LOG2: return DAV1D_TRC_UNKNOWN; // missing
+ case PL_COLOR_TRC_COUNT: abort();
+ }
+
+ return DAV1D_TRC_UNKNOWN;
+}
+
+PL_DAV1D_API enum pl_chroma_location pl_chroma_from_dav1d(enum Dav1dChromaSamplePosition loc)
+{
+ switch (loc) {
+ case DAV1D_CHR_UNKNOWN: return PL_CHROMA_UNKNOWN;
+ case DAV1D_CHR_VERTICAL: return PL_CHROMA_LEFT;
+ case DAV1D_CHR_COLOCATED: return PL_CHROMA_TOP_LEFT;
+ }
+
+ return PL_CHROMA_UNKNOWN;
+}
+
+PL_DAV1D_API enum Dav1dChromaSamplePosition pl_chroma_to_dav1d(enum pl_chroma_location loc)
+{
+ switch (loc) {
+ case PL_CHROMA_UNKNOWN: return DAV1D_CHR_UNKNOWN;
+ case PL_CHROMA_LEFT: return DAV1D_CHR_VERTICAL;
+ case PL_CHROMA_CENTER: return DAV1D_CHR_UNKNOWN; // missing
+ case PL_CHROMA_TOP_LEFT: return DAV1D_CHR_COLOCATED;
+ case PL_CHROMA_TOP_CENTER: return DAV1D_CHR_UNKNOWN; // missing
+ case PL_CHROMA_BOTTOM_LEFT: return DAV1D_CHR_UNKNOWN; // missing
+ case PL_CHROMA_BOTTOM_CENTER: return DAV1D_CHR_UNKNOWN; // missing
+ case PL_CHROMA_COUNT: abort();
+ }
+
+ return DAV1D_CHR_UNKNOWN;
+}
+
+static inline float pl_fixed24_8(uint32_t n)
+{
+ return (float) n / (1 << 8);
+}
+
+static inline float pl_fixed18_14(uint32_t n)
+{
+ return (float) n / (1 << 14);
+}
+
+static inline float pl_fixed0_16(uint16_t n)
+{
+ return (float) n / (1 << 16);
+}
+
+// Align to a power of 2
+#define PL_ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
+
+PL_DAV1D_API void pl_frame_from_dav1dpicture(struct pl_frame *out,
+ const Dav1dPicture *picture)
+{
+ const Dav1dSequenceHeader *seq_hdr = picture->seq_hdr;
+ int num_planes;
+ switch (picture->p.layout) {
+ case DAV1D_PIXEL_LAYOUT_I400:
+ num_planes = 1;
+ break;
+ case DAV1D_PIXEL_LAYOUT_I420:
+ case DAV1D_PIXEL_LAYOUT_I422:
+ case DAV1D_PIXEL_LAYOUT_I444:
+ num_planes = 3;
+ break;
+ default: abort();
+ }
+
+ *out = (struct pl_frame) {
+ .num_planes = num_planes,
+ .planes = {
+ // Components are always in order, which makes things easy
+ {
+ .components = 1,
+ .component_mapping = {0},
+ }, {
+ .components = 1,
+ .component_mapping = {1},
+ }, {
+ .components = 1,
+ .component_mapping = {2},
+ },
+ },
+ .crop = {
+ 0, 0, picture->p.w, picture->p.h,
+ },
+ .color = {
+ .primaries = pl_primaries_from_dav1d(seq_hdr->pri),
+ .transfer = pl_transfer_from_dav1d(seq_hdr->trc),
+ },
+ .repr = {
+ .sys = pl_system_from_dav1d(seq_hdr->mtrx),
+ .levels = pl_levels_from_dav1d(seq_hdr->color_range),
+ .bits = {
+ .sample_depth = PL_ALIGN2(picture->p.bpc, 8),
+ .color_depth = picture->p.bpc,
+ },
+ },
+ };
+
+ if (seq_hdr->mtrx == DAV1D_MC_ICTCP && seq_hdr->trc == DAV1D_TRC_HLG) {
+
+ // dav1d makes no distinction between PQ and HLG ICtCp, so we need
+ // to manually fix it in the case that we have HLG ICtCp data.
+ out->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG;
+
+ } else if (seq_hdr->mtrx == DAV1D_MC_IDENTITY &&
+ seq_hdr->pri == DAV1D_COLOR_PRI_XYZ)
+ {
+
+ // dav1d handles this as a special case, but doesn't provide an
+ // explicit flag for it either, so we have to resort to this ugly hack,
+ // even though CIE 1931 RGB *is* a valid thing in principle!
+        out->repr.sys = PL_COLOR_SYSTEM_XYZ;
+
+ } else if (!out->repr.sys) {
+
+ // PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one
+ out->repr.sys = pl_color_system_guess_ycbcr(picture->p.w, picture->p.h);
+ }
+
+ const Dav1dContentLightLevel *cll = picture->content_light;
+ if (cll) {
+ out->color.hdr.max_cll = cll->max_content_light_level;
+ out->color.hdr.max_fall = cll->max_frame_average_light_level;
+ }
+
+ // This overrides the CLL values above, if both are present
+ const Dav1dMasteringDisplay *md = picture->mastering_display;
+ if (md) {
+ out->color.hdr.max_luma = pl_fixed24_8(md->max_luminance);
+ out->color.hdr.min_luma = pl_fixed18_14(md->min_luminance);
+ out->color.hdr.prim = (struct pl_raw_primaries) {
+ .red.x = pl_fixed0_16(md->primaries[0][0]),
+ .red.y = pl_fixed0_16(md->primaries[0][1]),
+ .green.x = pl_fixed0_16(md->primaries[1][0]),
+ .green.y = pl_fixed0_16(md->primaries[1][1]),
+ .blue.x = pl_fixed0_16(md->primaries[2][0]),
+ .blue.y = pl_fixed0_16(md->primaries[2][1]),
+ .white.x = pl_fixed0_16(md->white_point[0]),
+ .white.y = pl_fixed0_16(md->white_point[1]),
+ };
+ }
+
+ if (picture->frame_hdr->film_grain.present) {
+ const Dav1dFilmGrainData *fg = &picture->frame_hdr->film_grain.data;
+ out->film_grain = (struct pl_film_grain_data) {
+ .type = PL_FILM_GRAIN_AV1,
+ .seed = fg->seed,
+ .params.av1 = {
+ .num_points_y = fg->num_y_points,
+ .chroma_scaling_from_luma = fg->chroma_scaling_from_luma,
+ .num_points_uv = { fg->num_uv_points[0], fg->num_uv_points[1] },
+ .scaling_shift = fg->scaling_shift,
+ .ar_coeff_lag = fg->ar_coeff_lag,
+ .ar_coeff_shift = (int) fg->ar_coeff_shift,
+ .grain_scale_shift = fg->grain_scale_shift,
+ .uv_mult = { fg->uv_mult[0], fg->uv_mult[1] },
+ .uv_mult_luma = { fg->uv_luma_mult[0], fg->uv_luma_mult[1] },
+ .uv_offset = { fg->uv_offset[0], fg->uv_offset[1] },
+ .overlap = fg->overlap_flag,
+ },
+ };
+
+ struct pl_av1_grain_data *av1 = &out->film_grain.params.av1;
+ memcpy(av1->points_y, fg->y_points, sizeof(av1->points_y));
+ memcpy(av1->points_uv, fg->uv_points, sizeof(av1->points_uv));
+ memcpy(av1->ar_coeffs_y, fg->ar_coeffs_y, sizeof(av1->ar_coeffs_y));
+ memcpy(av1->ar_coeffs_uv[0], fg->ar_coeffs_uv[0], sizeof(av1->ar_coeffs_uv[0]));
+ memcpy(av1->ar_coeffs_uv[1], fg->ar_coeffs_uv[1], sizeof(av1->ar_coeffs_uv[1]));
+ }
+
+ switch (picture->p.layout) {
+ case DAV1D_PIXEL_LAYOUT_I400:
+ case DAV1D_PIXEL_LAYOUT_I444:
+ break;
+ case DAV1D_PIXEL_LAYOUT_I420:
+ case DAV1D_PIXEL_LAYOUT_I422:
+ // Only set the chroma location for definitely subsampled images
+ pl_frame_set_chroma_location(out, pl_chroma_from_dav1d(seq_hdr->chr));
+ break;
+ }
+}
+
+PL_DAV1D_API void pl_swapchain_colors_from_dav1dpicture(struct pl_swapchain_colors *out_colors,
+ const Dav1dPicture *picture)
+{
+ struct pl_frame frame;
+ pl_frame_from_dav1dpicture(&frame, picture);
+
+ *out_colors = (struct pl_swapchain_colors) {
+ .primaries = frame.color.primaries,
+ .transfer = frame.color.transfer,
+ };
+
+ const Dav1dContentLightLevel *cll = picture->content_light;
+ if (cll) {
+ out_colors->hdr.max_cll = cll->max_content_light_level;
+ out_colors->hdr.max_fall = cll->max_frame_average_light_level;
+ }
+
+ const Dav1dMasteringDisplay *md = picture->mastering_display;
+ if (md) {
+ out_colors->hdr.min_luma = pl_fixed18_14(md->min_luminance);
+ out_colors->hdr.max_luma = pl_fixed24_8(md->max_luminance);
+ out_colors->hdr.prim.red.x = pl_fixed0_16(md->primaries[0][0]);
+ out_colors->hdr.prim.red.y = pl_fixed0_16(md->primaries[0][1]);
+ out_colors->hdr.prim.green.x = pl_fixed0_16(md->primaries[1][0]);
+ out_colors->hdr.prim.green.y = pl_fixed0_16(md->primaries[1][1]);
+ out_colors->hdr.prim.blue.x = pl_fixed0_16(md->primaries[2][0]);
+ out_colors->hdr.prim.blue.y = pl_fixed0_16(md->primaries[2][1]);
+ out_colors->hdr.prim.white.x = pl_fixed0_16(md->white_point[0]);
+ out_colors->hdr.prim.white.y = pl_fixed0_16(md->white_point[1]);
+ }
+}
+
+#define PL_MAGIC0 0x2c2a1269
+#define PL_MAGIC1 0xc6d02577
+
+struct pl_dav1dalloc {
+ uint32_t magic[2];
+ pl_gpu gpu;
+ pl_buf buf;
+};
+
+struct pl_dav1dref {
+ Dav1dPicture pic;
+ uint8_t count;
+};
+
+static void pl_dav1dpicture_unref(void *priv)
+{
+ struct pl_dav1dref *ref = priv;
+ if (--ref->count == 0) {
+ dav1d_picture_unref(&ref->pic);
+ free(ref);
+ }
+}
+
+PL_DAV1D_API bool pl_upload_dav1dpicture(pl_gpu gpu,
+ struct pl_frame *out,
+ pl_tex tex[3],
+ const struct pl_dav1d_upload_params *params)
+{
+ Dav1dPicture *pic = params->picture;
+ pl_frame_from_dav1dpicture(out, pic);
+ if (!params->film_grain)
+ out->film_grain.type = PL_FILM_GRAIN_NONE;
+
+ const int bytes = (pic->p.bpc + 7) / 8; // rounded up
+ int sub_x = 0, sub_y = 0;
+ switch (pic->p.layout) {
+ case DAV1D_PIXEL_LAYOUT_I400:
+ case DAV1D_PIXEL_LAYOUT_I444:
+ break;
+ case DAV1D_PIXEL_LAYOUT_I420:
+ sub_x = sub_y = 1;
+ break;
+ case DAV1D_PIXEL_LAYOUT_I422:
+ sub_x = 1;
+ break;
+ }
+
+ struct pl_plane_data data[3] = {
+ {
+ // Y plane
+ .type = PL_FMT_UNORM,
+ .width = pic->p.w,
+ .height = pic->p.h,
+ .pixel_stride = bytes,
+ .component_size = {bytes * 8},
+ .component_map = {0},
+ }, {
+ // U plane
+ .type = PL_FMT_UNORM,
+ .width = pic->p.w >> sub_x,
+ .height = pic->p.h >> sub_y,
+ .pixel_stride = bytes,
+ .component_size = {bytes * 8},
+ .component_map = {1},
+ }, {
+ // V plane
+ .type = PL_FMT_UNORM,
+ .width = pic->p.w >> sub_x,
+ .height = pic->p.h >> sub_y,
+ .pixel_stride = bytes,
+ .component_size = {bytes * 8},
+ .component_map = {2},
+ },
+ };
+
+ pl_buf buf = NULL;
+ struct pl_dav1dalloc *alloc = params->gpu_allocated ? pic->allocator_data : NULL;
+ struct pl_dav1dref *ref = NULL;
+
+ if (alloc && alloc->magic[0] == PL_MAGIC0 && alloc->magic[1] == PL_MAGIC1) {
+ // Re-use pre-allocated buffers directly
+ assert(alloc->gpu == gpu);
+ buf = alloc->buf;
+ } else if (params->asynchronous && gpu->limits.callbacks) {
+ ref = malloc(sizeof(*ref));
+ if (!ref)
+ return false;
+ memcpy(&ref->pic, pic, sizeof(Dav1dPicture));
+ ref->count = out->num_planes;
+ }
+
+ for (int p = 0; p < out->num_planes; p++) {
+ ptrdiff_t stride = p > 0 ? pic->stride[1] : pic->stride[0];
+ if (stride < 0) {
+ data[p].pixels = (uint8_t *) pic->data[p] + stride * (data[p].height - 1);
+ data[p].row_stride = -stride;
+ out->planes[p].flipped = true;
+ } else {
+ data[p].pixels = pic->data[p];
+ data[p].row_stride = stride;
+ }
+
+ if (buf) {
+ data[p].buf = buf;
+ data[p].buf_offset = (uintptr_t) data[p].pixels - (uintptr_t) buf->data;
+ data[p].pixels = NULL;
+ } else if (ref) {
+ data[p].priv = ref;
+ data[p].callback = pl_dav1dpicture_unref;
+ }
+
+ if (!pl_upload_plane(gpu, &out->planes[p], &tex[p], &data[p])) {
+ free(ref);
+ return false;
+ }
+ }
+
+ if (params->asynchronous) {
+ if (ref) {
+ *pic = (Dav1dPicture) {0};
+ } else {
+ dav1d_picture_unref(pic);
+ }
+ }
+
+ return true;
+}
+
+PL_DAV1D_API int pl_allocate_dav1dpicture(Dav1dPicture *p, void *cookie)
+{
+ pl_gpu gpu = cookie;
+ if (!gpu->limits.max_mapped_size || !gpu->limits.host_cached ||
+ !gpu->limits.buf_transfer)
+ {
+ return DAV1D_ERR(ENOTSUP);
+ }
+
+ // Copied from dav1d_default_picture_alloc
+ const int hbd = p->p.bpc > 8;
+ const int aligned_w = PL_ALIGN2(p->p.w, 128);
+ const int aligned_h = PL_ALIGN2(p->p.h, 128);
+ const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ p->stride[0] = aligned_w << hbd;
+ p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
+
+ // Align strides up to multiples of the GPU performance hints
+ p->stride[0] = PL_ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_pitch);
+ p->stride[1] = PL_ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_pitch);
+
+ // Aligning offsets to 4 also implicitly aligns to the texel alignment (1 or 2)
+ size_t off_align = PL_ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
+ const size_t y_sz = PL_ALIGN2(p->stride[0] * aligned_h, off_align);
+ const size_t uv_sz = PL_ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
+
+ // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
+ // even in the case that the driver gives us insane alignments
+ const size_t pic_size = y_sz + 2 * uv_sz;
+ const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
+
+ // Validate size limitations
+ if (total_size > gpu->limits.max_mapped_size)
+ return DAV1D_ERR(ENOMEM);
+
+ pl_buf buf = pl_buf_create(gpu, pl_buf_params(
+ .size = total_size,
+ .host_mapped = true,
+ .memory_type = PL_BUF_MEM_HOST,
+ ));
+
+ if (!buf)
+ return DAV1D_ERR(ENOMEM);
+
+ struct pl_dav1dalloc *alloc = malloc(sizeof(struct pl_dav1dalloc));
+ if (!alloc) {
+ pl_buf_destroy(gpu, &buf);
+ return DAV1D_ERR(ENOMEM);
+ }
+
+ *alloc = (struct pl_dav1dalloc) {
+ .magic = { PL_MAGIC0, PL_MAGIC1 },
+ .gpu = gpu,
+ .buf = buf,
+ };
+
+ assert(buf->data);
+ uintptr_t base = (uintptr_t) buf->data, data[3];
+ data[0] = PL_ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
+ data[1] = PL_ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
+ data[2] = PL_ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
+
+ p->allocator_data = alloc;
+ p->data[0] = (void *) data[0];
+ p->data[1] = (void *) data[1];
+ p->data[2] = (void *) data[2];
+ return 0;
+}
+
+PL_DAV1D_API void pl_release_dav1dpicture(Dav1dPicture *p, void *cookie)
+{
+ struct pl_dav1dalloc *alloc = p->allocator_data;
+ if (!alloc)
+ return;
+
+ assert(alloc->magic[0] == PL_MAGIC0);
+ assert(alloc->magic[1] == PL_MAGIC1);
+ assert(alloc->gpu == cookie);
+ pl_buf_destroy(alloc->gpu, &alloc->buf);
+ free(alloc);
+
+ p->data[0] = p->data[1] = p->data[2] = p->allocator_data = NULL;
+}
+
+#undef PL_ALIGN2
+#undef PL_MAGIC0
+#undef PL_MAGIC1
+
+#endif // LIBPLACEBO_DAV1D_H_
diff --git a/src/include/libplacebo/utils/dolbyvision.h b/src/include/libplacebo/utils/dolbyvision.h
new file mode 100644
index 0000000..6d4d72e
--- /dev/null
+++ b/src/include/libplacebo/utils/dolbyvision.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_DOLBYVISION_H_
+#define LIBPLACEBO_DOLBYVISION_H_
+
+#include <libplacebo/colorspace.h>
+
+PL_API_BEGIN
+
+// Parses the Dolby Vision RPU, and sets the `pl_hdr_metadata` dynamic
+// brightness metadata fields accordingly.
+//
+// Note: requires `PL_HAVE_LIBDOVI` to be defined, no-op otherwise.
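+//
+// A minimal sketch (`rpu` and `rpu_size` are assumed to hold the raw RPU
+// payload of the current frame):
+//
+//     pl_hdr_metadata_from_dovi_rpu(&frame.color.hdr, rpu, rpu_size);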
+PL_API void pl_hdr_metadata_from_dovi_rpu(struct pl_hdr_metadata *out,
+ const uint8_t *buf, size_t size);
+
+PL_API_END
+
+#endif // LIBPLACEBO_DOLBYVISION_H_
diff --git a/src/include/libplacebo/utils/frame_queue.h b/src/include/libplacebo/utils/frame_queue.h
new file mode 100644
index 0000000..2a9c90c
--- /dev/null
+++ b/src/include/libplacebo/utils/frame_queue.h
@@ -0,0 +1,230 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_FRAME_QUEUE_H
+#define LIBPLACEBO_FRAME_QUEUE_H
+
+#include <libplacebo/renderer.h>
+#include <libplacebo/shaders/deinterlacing.h>
+
+PL_API_BEGIN
+
+// An abstraction layer for automatically turning a conceptual stream of
+// (frame, pts) pairs, as emitted by a decoder or filter graph, into a
+// `pl_frame_mix` suitable for `pl_render_image_mix`.
+//
+// This API ensures that minimal work is performed (e.g. only mapping frames
+// that are actually required), while also satisfying the requirements
+// of any configured frame mixer.
+//
+// Thread-safety: Safe
+typedef struct pl_queue_t *pl_queue;
+
+enum pl_queue_status {
+ PL_QUEUE_OK, // success
+ PL_QUEUE_EOF, // no more frames are available
+ PL_QUEUE_MORE, // more frames needed, but not (yet) available
+ PL_QUEUE_ERR = -1, // some unknown error occurred while retrieving frames
+};
+
+struct pl_source_frame {
+ // The frame's presentation timestamp, in seconds relative to the first
+ // frame. These must be monotonically increasing for subsequent frames.
+ // To implement a discontinuous jump, users must explicitly reset the
+ // frame queue with `pl_queue_reset` and restart from PTS 0.0.
+ double pts;
+
+ // The frame's duration. This is not needed in normal scenarios, as the
+ // FPS can be inferred from the `pts` values themselves. Providing it
+ // only helps initialize the value for initial frames, which can smooth
+ // out the interpolation weights. Its use is also highly recommended
+ // when displaying interlaced frames. (Optional)
+ float duration;
+
+ // If set to something other than PL_FIELD_NONE, this source frame is
+ // marked as interlaced. It will be split up into two separate frames
+ // internally, and exported to the resulting `pl_frame_mix` as a pair of
+ // fields, referencing the corresponding previous and next frames. The
+ // first field will have the same PTS as `pts`, and the second field will
+ // be inserted at the timestamp `pts + duration/2`.
+ //
+ // Note: As a result of FPS estimates being unreliable around streams with
+ // mixed FPS (or when mixing interlaced and progressive frames), it's
+ // highly recommended to always specify a valid `duration` for interlaced
+ // frames.
+ enum pl_field first_field;
+
+ // Abstract frame data itself. To allow mapping frames only when they're
+ // actually needed, frames use a lazy representation. The provided
+ // callbacks will be invoked to interface with it.
+ void *frame_data;
+
+ // This will be called to map the frame to the GPU, only if needed.
+ //
+ // `tex` is a pointer to an array of 4 texture objects (or NULL), which
+ // *may* serve as backing storage for the texture being mapped. These are
+ // intended to be recreated by `map`, e.g. using `pl_tex_recreate` or
+ // `pl_upload_plane` as appropriate. They will be managed internally by
+ // `pl_queue` and destroyed at some unspecified future point in time.
+ //
+ // Note: If `map` fails, it will not be retried, nor will `discard` be run.
+ // The user should clean up state in this case.
+ bool (*map)(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src,
+ struct pl_frame *out_frame);
+
+ // If present, this will be called on frames that are done being used by
+ // `pl_queue`. This may be useful to e.g. unmap textures backed by external
+ // APIs such as hardware decoders. (Optional)
+ void (*unmap)(pl_gpu gpu, struct pl_frame *frame, const struct pl_source_frame *src);
+
+ // This function will be called for frames that are deemed unnecessary
+ // (e.g. never became visible) and should instead be cleanly freed.
+ // (Optional)
+ void (*discard)(const struct pl_source_frame *src);
+};
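+
+// For illustration, a minimal `map` callback for CPU-resident planar data
+// might look like this (the `my_picture` type and its fields are purely
+// hypothetical):
+//
+//     static bool my_map(pl_gpu gpu, pl_tex *tex,
+//                        const struct pl_source_frame *src,
+//                        struct pl_frame *out_frame)
+//     {
+//         const struct my_picture *pic = src->frame_data;
+//         *out_frame = pic->frame; // pre-filled metadata (crop, color, repr)
+//         for (int i = 0; i < pic->num_planes; i++) {
+//             if (!pl_upload_plane(gpu, &out_frame->planes[i], &tex[i],
+//                                  &pic->plane_data[i]))
+//                 return false;
+//         }
+//         return true;
+//     }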
+
+// Create a new, empty frame queue.
+//
+// It's highly recommended to fully render a single frame with `pts == 0.0`,
+// and flush the GPU pipeline with `pl_gpu_finish`, prior to starting the timed
+// playback loop.
+PL_API pl_queue pl_queue_create(pl_gpu gpu);
+PL_API void pl_queue_destroy(pl_queue *queue);
+
+// Explicitly clear the queue. This is essentially equivalent to destroying
+// and recreating the queue, but preserves any internal memory allocations.
+//
+// Note: Calling `pl_queue_reset` may block, if another thread is currently
+// blocked on a different `pl_queue_*` call.
+PL_API void pl_queue_reset(pl_queue queue);
+
+// Explicitly push a frame. This is an alternative way to feed the frame queue
+// with incoming frames, the other method being the asynchronous callback
+// specified as `pl_queue_params.get_frame`. Both methods may be used
+// simultaneously, although providing `get_frame` is recommended since it
+// avoids the risk of the queue underrunning.
+//
+// When no more frames are available, call this function with `frame == NULL`
+// to indicate EOF and begin draining the frame queue.
+PL_API void pl_queue_push(pl_queue queue, const struct pl_source_frame *frame);
+
+// Variant of `pl_queue_push` that blocks while the queue is judged
+// (internally) to be "too full". This is useful for asynchronous decoder loops
+// in order to prevent the queue from exhausting available RAM if frames are
+// decoded significantly faster than they're displayed.
+//
+// The given `timeout` parameter specifies how long to wait before giving up,
+// in nanoseconds. Returns false if this timeout was reached.
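+//
+// For example, a decoder thread might feed the queue like this (sketch only;
+// `decode_next` and the 1-second timeout are illustrative):
+//
+//     struct pl_source_frame src;
+//     while (decode_next(&src)) {
+//         while (!pl_queue_push_block(queue, UINT64_C(1000000000), &src)) {
+//             // queue still full after 1 second; keep waiting (or drop `src`)
+//         }
+//     }
+//     pl_queue_push(queue, NULL); // signal EOF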
+PL_API bool pl_queue_push_block(pl_queue queue, uint64_t timeout,
+ const struct pl_source_frame *frame);
+
+struct pl_queue_params {
+ // The PTS of the frame that will be rendered. This should be set to the
+ // timestamp (in seconds) of the next vsync, relative to the initial frame.
+ //
+ // These must be monotonically increasing. To implement a discontinuous
+ // jump, users must explicitly reset the frame queue with `pl_queue_reset`
+ // and restart from PTS 0.0.
+ double pts;
+
+ // The radius of the configured mixer. This should be set to the value
+ // as returned by `pl_frame_mix_radius`.
+ float radius;
+
+ // The estimated duration of a vsync, in seconds. This will only be used as
+ // a hint, the true value will be estimated by comparing `pts` timestamps
+ // between calls to `pl_queue_update`. (Optional)
+ float vsync_duration;
+
+ // If the difference between the (estimated) vsync duration and the
+    // (measured) frame duration is smaller than this threshold, silently
+    // disable interpolation and switch to ZOH (zero-order hold) semantics
+    // instead.
+ //
+ // For example, a value of 0.01 allows the FPS to differ by up to 1%
+ // without being interpolated. Note that this will result in a continuous
+ // phase drift unless also compensated for by the user, which will
+    // eventually result in a dropped or duplicated frame. (Though this can
+ // be preferable to seeing that same phase drift result in a temporally
+ // smeared image)
+ float interpolation_threshold;
+
+ // Specifies how long `pl_queue_update` will wait for frames to become
+ // available, in nanoseconds, before giving up and returning with
+    // PL_QUEUE_MORE.
+ //
+ // If `get_frame` is provided, this value is ignored by `pl_queue` and
+ // should instead be interpreted by the provided callback.
+ uint64_t timeout;
+
+ // This callback will be used to pull new frames from the decoder. It may
+ // block if needed. The user is responsible for setting appropriate time
+    // limits and/or for returning and handling PL_QUEUE_MORE sensibly.
+ //
+ // Providing this callback is entirely optional. Users can instead choose
+ // to manually feed the frame queue with new frames using `pl_queue_push`.
+ enum pl_queue_status (*get_frame)(struct pl_source_frame *out_frame,
+ const struct pl_queue_params *params);
+ void *priv;
+};
+
+#define pl_queue_params(...) (&(struct pl_queue_params) { __VA_ARGS__ })
+
+// Advance the frame queue's internal state to the target timestamp. Any frames
+// which are no longer needed (i.e. too far in the past) are automatically
+// unmapped and evicted. Any future frames which are needed to fill the queue
+// must either have been pushed in advance, or will be requested using the
+// provided `get_frame` callback. If you call this on `out_mix == NULL`, the
+// queue state will advance, but no frames will be mapped.
+//
+// This function may return with PL_QUEUE_MORE, in which case the user may wish
+// to ensure more frames are available and then re-run this function with the
+// same parameters. In this case, `out_mix` is still written to, but it may be
+// incomplete (or even contain no frames at all). Additionally, when the source
+// contains interlaced frames (see `pl_source_frame.first_field`), this
+// function may return with PL_QUEUE_MORE if a frame is missing references to
+// a future frame.
+//
+// The resulting mix of frames in `out_mix` will represent the neighbourhood of
+// the target timestamp, and can be passed to `pl_render_image_mix` as-is.
+//
+// Note: `out_mix` will only remain valid until the next call to
+// `pl_queue_update` or `pl_queue_reset`.
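+//
+// A rough per-vsync usage sketch (`renderer`, `target`, `render_params` and
+// `next_vsync_pts` are assumed to be set up by the user):
+//
+//     struct pl_frame_mix mix;
+//     enum pl_queue_status st = pl_queue_update(queue, &mix, pl_queue_params(
+//         .pts    = next_vsync_pts,
+//         .radius = pl_frame_mix_radius(&render_params),
+//     ));
+//     if (st == PL_QUEUE_ERR) {
+//         // handle error / abort playback
+//     } else if (mix.num_frames) {
+//         pl_render_image_mix(renderer, &mix, &target, &render_params);
+//     }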
+PL_API enum pl_queue_status pl_queue_update(pl_queue queue, struct pl_frame_mix *out_mix,
+ const struct pl_queue_params *params);
+
+// Returns a pl_queue's internal estimates for FPS and VPS (vsyncs per second).
+// Returns 0.0 if no estimate is available.
+PL_API float pl_queue_estimate_fps(pl_queue queue);
+PL_API float pl_queue_estimate_vps(pl_queue queue);
+
+// Returns the number of frames currently contained in a pl_queue.
+PL_API int pl_queue_num_frames(pl_queue queue);
+
+// Inspect the contents of the Nth queued frame. Returns false if `idx` is
+// out of range.
+//
+// Warning: No guarantee is made to ensure validity of `out->frame_data`
+// after this call. In particular, pl_queue_* calls made from another thread
+// may call `discard()` on the frame in question. The user bears responsibility
+// to avoid accessing `out->frame_data` in a multi-threaded scenario unless
+// an external guarantee can be made that the frame won't be dequeued until
+// it is done being used by the user.
+PL_API bool pl_queue_peek(pl_queue queue, int idx, struct pl_source_frame *out);
+
+PL_API_END
+
+#endif // LIBPLACEBO_FRAME_QUEUE_H
diff --git a/src/include/libplacebo/utils/libav.h b/src/include/libplacebo/utils/libav.h
new file mode 100644
index 0000000..91f3dd8
--- /dev/null
+++ b/src/include/libplacebo/utils/libav.h
@@ -0,0 +1,284 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_LIBAV_H_
+#define LIBPLACEBO_LIBAV_H_
+
+#include <libplacebo/config.h>
+#include <libplacebo/gpu.h>
+#include <libplacebo/shaders/deinterlacing.h>
+#include <libplacebo/utils/upload.h>
+
+#if defined(__cplusplus) && !defined(PL_LIBAV_IMPLEMENTATION)
+# define PL_LIBAV_API
+# define PL_LIBAV_IMPLEMENTATION 0
+# warning Remember to include this file with PL_LIBAV_IMPLEMENTATION set to 1 in a \
+ C translation unit to provide the implementation. Suppress this warning by \
+ defining PL_LIBAV_IMPLEMENTATION to 0 in C++ files.
+#elif !defined(PL_LIBAV_IMPLEMENTATION)
+# define PL_LIBAV_API static inline
+# define PL_LIBAV_IMPLEMENTATION 1
+#else
+# define PL_LIBAV_API
+#endif
+
+PL_API_BEGIN
+
+#include <libavformat/avformat.h>
+#include <libavutil/frame.h>
+#include <libavutil/version.h>
+#include <libavcodec/avcodec.h>
+
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 16, 100) && defined(PL_HAVE_DOVI)
+# define PL_HAVE_LAV_DOLBY_VISION
+# include <libavutil/dovi_meta.h>
+#endif
+
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 61, 100)
+# define PL_HAVE_LAV_FILM_GRAIN
+# include <libavutil/film_grain_params.h>
+#endif
+
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 25, 100)
+# define PL_HAVE_LAV_HDR
+# include <libavutil/hdr_dynamic_metadata.h>
+# include <libavutil/mastering_display_metadata.h>
+#endif
+
+//------------------------------------------------------------------------
+// Important note: For AVVkFrame support, which depends on <vulkan/vulkan.h>,
+// users *SHOULD* include <vulkan/vulkan.h> manually before this header.
+//------------------------------------------------------------------------
+
+
+// Fill in the details of a `pl_frame` from an AVFrame. This function will
+// explicitly clear `out_frame`, setting all extra fields to 0. After this
+// function returns, the only missing data is information related to the plane
+// texture itself (`planes[N].texture`), as well as any overlays (e.g.
+// subtitles).
+//
+// Note: If the AVFrame contains an embedded ICC profile or H.274 film grain
+// metadata, the resulting `out_frame` will reference that data directly,
+// meaning that in general, the `pl_frame` is only guaranteed to be valid as
+// long as the AVFrame is not freed.
+//
+// Note: This will ignore Dolby Vision metadata by default (to avoid leaking
+// memory); either switch to `pl_map_avframe_ex` or map it manually using
+// `pl_map_dovi_metadata`.
+PL_LIBAV_API void pl_frame_from_avframe(struct pl_frame *out_frame, const AVFrame *frame);
+
+// Deprecated aliases for backwards compatibility
+#define pl_image_from_avframe pl_frame_from_avframe
+#define pl_target_from_avframe pl_frame_from_avframe
+
+// Copy extra metadata from an AVStream to a pl_frame. This should be called
+// after `pl_frame_from_avframe` or `pl_map_avframe`, and sets
+// metadata associated with stream-level side data. This is needed because
+// FFmpeg rather annoyingly does not propagate stream-level metadata to frames.
+PL_LIBAV_API void pl_frame_copy_stream_props(struct pl_frame *out_frame,
+ const AVStream *stream);
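+
+// Example (sketch; `avframe` and `stream` are assumed to come from the
+// caller's demuxing/decoding loop):
+//
+//   struct pl_frame frame;
+//   pl_frame_from_avframe(&frame, avframe);
+//   pl_frame_copy_stream_props(&frame, stream);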
+
+#ifdef PL_HAVE_LAV_HDR
+struct pl_av_hdr_metadata {
+ // All fields are optional and may be passed as `NULL`.
+ const AVMasteringDisplayMetadata *mdm;
+ const AVContentLightMetadata *clm;
+ const AVDynamicHDRPlus *dhp;
+};
+
+// Helper function to update a `pl_hdr_metadata` struct from HDR10/HDR10+
+// metadata in the FFmpeg format. Unspecified/invalid elements will be left
+// uninitialized in `out`.
+PL_LIBAV_API void pl_map_hdr_metadata(struct pl_hdr_metadata *out,
+ const struct pl_av_hdr_metadata *metadata);
+#endif
+
+#ifdef PL_HAVE_LAV_DOLBY_VISION
+// Helper function to map Dolby Vision metadata from the FFmpeg format.
+PL_LIBAV_API void pl_map_dovi_metadata(struct pl_dovi_metadata *out,
+ const AVDOVIMetadata *metadata);
+
+// Helper function to map Dolby Vision metadata from the FFmpeg format
+// to `pl_dovi_metadata`, and add it to the `pl_frame`.
+// The `pl_frame` colorspace fields and HDR struct are also updated with
+// values from the `AVDOVIMetadata`.
+//
+// Note: The `pl_dovi_metadata` must be allocated externally.
+// Also, the metadata is currently only used if the `AVDOVIRpuDataHeader`
+// field `disable_residual_flag` is nonzero; this can be checked before allocating.
+PL_LIBAV_API void pl_frame_map_avdovi_metadata(struct pl_frame *out_frame,
+ struct pl_dovi_metadata *dovi,
+ const AVDOVIMetadata *metadata);
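+
+// Example (sketch; `dovi` must remain valid for as long as `frame` is used):
+//
+//   static struct pl_dovi_metadata dovi;
+//   const AVFrameSideData *sd = av_frame_get_side_data(avframe, AV_FRAME_DATA_DOVI_METADATA);
+//   if (sd)
+//       pl_frame_map_avdovi_metadata(&frame, &dovi, (const AVDOVIMetadata *) sd->data);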
+#endif
+
+// Helper function to test if a pixfmt would be supported by the GPU.
+// Essentially, this can be used to check if `pl_map_avframe` would work for a
+// given AVPixelFormat, without actually uploading or allocating anything.
+PL_LIBAV_API bool pl_test_pixfmt(pl_gpu gpu, enum AVPixelFormat pixfmt);
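+
+// Example (sketch): skip formats that cannot be uploaded to this GPU:
+//
+//   if (!pl_test_pixfmt(gpu, avframe->format)) {
+//       // fall back to converting the frame to a supported format first,
+//       // e.g. with libswscale
+//   }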
+
+// Variant of `pl_test_pixfmt` that also tests for the given capabilities
+// being present. Note that in the presence of hardware accelerated frames,
+// this cannot be tested without frame-specific information (i.e. swformat),
+// but in practice this should be a non-issue as GPU-native hwformats will
+// probably be fully supported.
+PL_LIBAV_API bool pl_test_pixfmt_caps(pl_gpu gpu, enum AVPixelFormat pixfmt,
+ enum pl_fmt_caps caps);
+
+// Like `pl_frame_from_avframe`, but the texture pointers are also initialized
+// to ensure they have the correct size and format to match the AVFrame.
+// Similar in spirit to `pl_recreate_plane`, and the same notes apply. `tex`
+// must be an array of 4 pointers of type `pl_tex`, each either
+// pointing to a valid texture, or NULL. Returns whether successful.
+PL_LIBAV_API bool pl_frame_recreate_from_avframe(pl_gpu gpu, struct pl_frame *out_frame,
+ pl_tex tex[4], const AVFrame *frame);
+
+struct pl_avframe_params {
+ // The AVFrame to map. Required.
+ const AVFrame *frame;
+
+ // Backing textures for frame data. Required for all non-hwdec formats.
+ // This must point to an array of four valid textures (or NULL entries).
+ //
+ // Note: Not cleaned up by `pl_unmap_avframe`. The intent is for users to
+ // re-use this texture array for subsequent frames, to avoid texture
+ // creation/destruction overhead.
+ pl_tex *tex;
+
+ // Also map Dolby Vision metadata (if supported). Note that this also
+ // overrides the colorimetry metadata (forces BT.2020+PQ).
+ bool map_dovi;
+};
+
+#define PL_AVFRAME_DEFAULTS \
+ .map_dovi = true,
+
+#define pl_avframe_params(...) (&(struct pl_avframe_params) { PL_AVFRAME_DEFAULTS __VA_ARGS__ })
+
+// Very high level helper function to take an `AVFrame` and map it to the GPU.
+// The resulting `pl_frame` remains valid until `pl_unmap_avframe` is called,
+// which must be called at some point to clean up state. The `AVFrame` is
+// automatically ref'd and unref'd if needed. Returns whether successful.
+//
+// Note: `out_frame->user_data` points to a privately managed opaque struct
+// and must not be touched by the user.
+PL_LIBAV_API bool pl_map_avframe_ex(pl_gpu gpu, struct pl_frame *out_frame,
+ const struct pl_avframe_params *params);
+PL_LIBAV_API void pl_unmap_avframe(pl_gpu gpu, struct pl_frame *frame);
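+
+// Example usage (an illustrative sketch; `gpu`, `avframe` and the rendering
+// step are assumed to be provided by the caller):
+//
+//   pl_tex tex[4] = {0}; // re-used across frames to avoid re-creation
+//   struct pl_frame frame;
+//   if (pl_map_avframe_ex(gpu, &frame, pl_avframe_params(
+//           .frame = avframe,
+//           .tex   = tex,
+//       )))
+//   {
+//       // ... render or otherwise consume `frame` here ...
+//       pl_unmap_avframe(gpu, &frame);
+//   }
+//
+//   // Once done with all frames, destroy the backing textures:
+//   for (int i = 0; i < 4; i++)
+//       pl_tex_destroy(gpu, &tex[i]);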
+
+// Backwards compatibility with previous versions of this API.
+PL_LIBAV_API bool pl_map_avframe(pl_gpu gpu, struct pl_frame *out_frame,
+ pl_tex tex[4], const AVFrame *avframe);
+
+// Return the AVFrame* that a pl_frame was mapped from (via pl_map_avframe_ex).
+// Note: This reference is attached to the `pl_frame` and will get freed by
+// pl_unmap_avframe.
+PL_LIBAV_API AVFrame *pl_get_mapped_avframe(const struct pl_frame *frame);
+
+// Download the texture contents of a `pl_frame` back to a corresponding
+// AVFrame. Blocks until completion.
+//
+// Note: This function performs minimal verification, so incorrect usage will
+// likely result in broken frames. Use `pl_frame_recreate_from_avframe` to
+// ensure matching formats.
+PL_LIBAV_API bool pl_download_avframe(pl_gpu gpu,
+ const struct pl_frame *frame,
+ AVFrame *out_frame);
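+
+// Example (sketch; `width`, `height` and the pixel format are assumed to
+// match the `pl_frame`'s plane layout, error handling omitted):
+//
+//   AVFrame *out = av_frame_alloc();
+//   out->format = AV_PIX_FMT_YUV420P; // must correspond to `frame`'s layout
+//   out->width  = width;
+//   out->height = height;
+//   bool ok = av_frame_get_buffer(out, 0) == 0 &&
+//             pl_download_avframe(gpu, frame, out);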
+
+// Helper functions to update the colorimetry data in an AVFrame based on
+// the values specified in the given color space / color repr / profile.
+//
+// Note: These functions can and will allocate AVFrame side data if needed,
+// in particular to encode HDR metadata in `space.hdr`.
+PL_LIBAV_API void pl_avframe_set_color(AVFrame *frame, struct pl_color_space space);
+PL_LIBAV_API void pl_avframe_set_repr(AVFrame *frame, struct pl_color_repr repr);
+PL_LIBAV_API void pl_avframe_set_profile(AVFrame *frame, struct pl_icc_profile profile);
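+
+// Example (sketch; `out` is an AVFrame about to be encoded, and `target` is
+// the `pl_frame` it was rendered into):
+//
+//   pl_avframe_set_color(out, target.color);
+//   pl_avframe_set_repr(out, target.repr);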
+
+// Map an AVPixelFormat to an array of pl_plane_data structs. The array must
+// have at least `av_pix_fmt_count_planes(fmt)` elements, but never more than
+// 4. This function leaves `width`, `height` and `row_stride`, as well as the
+// data pointers, uninitialized.
+//
+// If `bits` is non-NULL, this function will attempt aligning the resulting
+// `pl_plane_data` struct for optimal compatibility, placing the resulting
+// `pl_bit_depth` metadata into `bits`.
+//
+// Returns the number of plane structs written to, or 0 on error.
+//
+// Note: This function is usually clumsier to use than the higher-level
+// functions above, but it might have some fringe use cases, for example if
+// the user wants to replace the data buffers by `pl_buf` references in the
+// `pl_plane_data` before uploading it to the GPU.
+PL_LIBAV_API int pl_plane_data_from_pixfmt(struct pl_plane_data data[4],
+ struct pl_bit_encoding *bits,
+ enum AVPixelFormat pix_fmt);
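+
+// Example (sketch; `avframe` is an assumed software AVFrame, chroma
+// subsampling and error handling are omitted for brevity):
+//
+//   struct pl_plane_data data[4];
+//   struct pl_bit_encoding bits;
+//   int planes = pl_plane_data_from_pixfmt(data, &bits, avframe->format);
+//   for (int p = 0; p < planes; p++) {
+//       data[p].width      = avframe->width;  // adjust for chroma subsampling
+//       data[p].height     = avframe->height; // where applicable
+//       data[p].row_stride = avframe->linesize[p];
+//       data[p].pixels     = avframe->data[p];
+//       pl_upload_plane(gpu, &frame.planes[p], &tex[p], &data[p]);
+//   }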
+
+// Callback for AVCodecContext.get_buffer2 that allocates memory from
+// persistently mapped buffers. This can be more efficient than regular
+// system memory, especially on platforms that don't support importing
+// PL_HANDLE_HOST_PTR as buffers.
+//
+// Note: `avctx->opaque` must be a pointer that *points* to the GPU instance.
+// That is, it should have type `pl_gpu *`.
+PL_LIBAV_API int pl_get_buffer2(AVCodecContext *avctx, AVFrame *pic, int flags);
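+
+// Example (sketch; `gpu` must outlive the codec context):
+//
+//   static pl_gpu gpu;         // or any other storage that outlives `avctx`
+//   ...
+//   avctx->opaque      = &gpu;
+//   avctx->get_buffer2 = pl_get_buffer2;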
+
+// Mapping functions for the various libavutil enums. Note that these are not
+// quite 1:1, and even for values that exist in both, the semantics sometimes
+// differ. Some special cases (e.g. ICtCp, or XYZ) are handled differently in
+// libplacebo and libavutil, respectively.
+//
+// Because of this, it's generally recommended to avoid these and instead use
+// helpers like `pl_frame_from_avframe`, which contain extra logic to patch
+// through all of the special cases.
+PL_LIBAV_API enum pl_color_system pl_system_from_av(enum AVColorSpace spc);
+PL_LIBAV_API enum AVColorSpace pl_system_to_av(enum pl_color_system sys);
+PL_LIBAV_API enum pl_color_levels pl_levels_from_av(enum AVColorRange range);
+PL_LIBAV_API enum AVColorRange pl_levels_to_av(enum pl_color_levels levels);
+PL_LIBAV_API enum pl_color_primaries pl_primaries_from_av(enum AVColorPrimaries prim);
+PL_LIBAV_API enum AVColorPrimaries pl_primaries_to_av(enum pl_color_primaries prim);
+PL_LIBAV_API enum pl_color_transfer pl_transfer_from_av(enum AVColorTransferCharacteristic trc);
+PL_LIBAV_API enum AVColorTransferCharacteristic pl_transfer_to_av(enum pl_color_transfer trc);
+PL_LIBAV_API enum pl_chroma_location pl_chroma_from_av(enum AVChromaLocation loc);
+PL_LIBAV_API enum AVChromaLocation pl_chroma_to_av(enum pl_chroma_location loc);
+
+// Helper function to generate a `pl_color_space` struct from an AVFrame.
+PL_LIBAV_API void pl_color_space_from_avframe(struct pl_color_space *out_csp,
+ const AVFrame *frame);
+
+// Helper function to pick the right `pl_field` value for an AVFrame.
+PL_LIBAV_API enum pl_field pl_field_from_avframe(const AVFrame *frame);
+
+#ifdef PL_HAVE_LAV_FILM_GRAIN
+// Fill in film grain parameters from an AVFilmGrainParams.
+//
+// Note: The resulting struct will only remain valid as long as the
+// `AVFilmGrainParams` remains valid.
+PL_LIBAV_API void pl_film_grain_from_av(struct pl_film_grain_data *out_data,
+ const AVFilmGrainParams *fgp);
+#endif
+
+// Deprecated alias for backwards compatibility
+#define pl_swapchain_colors_from_avframe pl_color_space_from_avframe
+
+// Actual implementation, included as part of this header to avoid having
+// a compile-time dependency on libavutil.
+#if PL_LIBAV_IMPLEMENTATION
+# include <libplacebo/utils/libav_internal.h>
+#endif
+
+PL_API_END
+
+#endif // LIBPLACEBO_LIBAV_H_
diff --git a/src/include/libplacebo/utils/libav_internal.h b/src/include/libplacebo/utils/libav_internal.h
new file mode 100644
index 0000000..4c269e5
--- /dev/null
+++ b/src/include/libplacebo/utils/libav_internal.h
@@ -0,0 +1,1482 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_LIBAV_H_
+#error This header should be included as part of <libplacebo/utils/libav.h>
+#elif defined(__cplusplus)
+#error This header cannot be included from C++; define PL_LIBAV_IMPLEMENTATION appropriately
+#else
+
+#include <assert.h>
+
+#include <libplacebo/utils/dolbyvision.h>
+
+#include <libavutil/hwcontext.h>
+#include <libavutil/hwcontext_drm.h>
+#include <libavutil/imgutils.h>
+#include <libavutil/pixdesc.h>
+#include <libavutil/display.h>
+#include <libavcodec/version.h>
+
+// Try including <vulkan/vulkan.h> if it wasn't already included
+#if !defined(VK_API_VERSION_1_2) && defined(__has_include)
+# if __has_include(<vulkan/vulkan.h>)
+# include <vulkan/vulkan.h>
+# endif
+#endif
+
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 8, 100) && \
+ defined(PL_HAVE_VULKAN) && defined(VK_API_VERSION_1_2)
+# define PL_HAVE_LAV_VULKAN
+# include <libavutil/hwcontext_vulkan.h>
+# include <libplacebo/vulkan.h>
+# if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 11, 100)
+# define PL_HAVE_LAV_VULKAN_V2
+# endif
+#endif
+
+PL_LIBAV_API enum pl_color_system pl_system_from_av(enum AVColorSpace spc)
+{
+ switch (spc) {
+ case AVCOL_SPC_RGB: return PL_COLOR_SYSTEM_RGB;
+ case AVCOL_SPC_BT709: return PL_COLOR_SYSTEM_BT_709;
+ case AVCOL_SPC_UNSPECIFIED: return PL_COLOR_SYSTEM_UNKNOWN;
+ case AVCOL_SPC_RESERVED: return PL_COLOR_SYSTEM_UNKNOWN;
+ case AVCOL_SPC_FCC: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+ case AVCOL_SPC_BT470BG: return PL_COLOR_SYSTEM_BT_601;
+ case AVCOL_SPC_SMPTE170M: return PL_COLOR_SYSTEM_BT_601;
+ case AVCOL_SPC_SMPTE240M: return PL_COLOR_SYSTEM_SMPTE_240M;
+ case AVCOL_SPC_YCGCO: return PL_COLOR_SYSTEM_YCGCO;
+ case AVCOL_SPC_BT2020_NCL: return PL_COLOR_SYSTEM_BT_2020_NC;
+ case AVCOL_SPC_BT2020_CL: return PL_COLOR_SYSTEM_BT_2020_C;
+ case AVCOL_SPC_SMPTE2085: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+ case AVCOL_SPC_CHROMA_DERIVED_NCL: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+ case AVCOL_SPC_CHROMA_DERIVED_CL: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+ // Note: this colorspace is ambiguous between PQ and HLG, a distinction
+ // which libav* leaves to be inferred from other sources but which
+ // libplacebo makes explicit. Default to PQ as it's the more common scenario.
+ case AVCOL_SPC_ICTCP: return PL_COLOR_SYSTEM_BT_2100_PQ;
+ case AVCOL_SPC_NB: return PL_COLOR_SYSTEM_COUNT;
+ }
+
+ return PL_COLOR_SYSTEM_UNKNOWN;
+}
+
+PL_LIBAV_API enum AVColorSpace pl_system_to_av(enum pl_color_system sys)
+{
+ switch (sys) {
+ case PL_COLOR_SYSTEM_UNKNOWN: return AVCOL_SPC_UNSPECIFIED;
+ case PL_COLOR_SYSTEM_BT_601: return AVCOL_SPC_SMPTE170M;
+ case PL_COLOR_SYSTEM_BT_709: return AVCOL_SPC_BT709;
+ case PL_COLOR_SYSTEM_SMPTE_240M: return AVCOL_SPC_SMPTE240M;
+ case PL_COLOR_SYSTEM_BT_2020_NC: return AVCOL_SPC_BT2020_NCL;
+ case PL_COLOR_SYSTEM_BT_2020_C: return AVCOL_SPC_BT2020_CL;
+ case PL_COLOR_SYSTEM_BT_2100_PQ: return AVCOL_SPC_ICTCP;
+ case PL_COLOR_SYSTEM_BT_2100_HLG: return AVCOL_SPC_ICTCP;
+ case PL_COLOR_SYSTEM_DOLBYVISION: return AVCOL_SPC_UNSPECIFIED; // missing
+ case PL_COLOR_SYSTEM_YCGCO: return AVCOL_SPC_YCGCO;
+ case PL_COLOR_SYSTEM_RGB: return AVCOL_SPC_RGB;
+ case PL_COLOR_SYSTEM_XYZ: return AVCOL_SPC_UNSPECIFIED; // handled differently
+ case PL_COLOR_SYSTEM_COUNT: return AVCOL_SPC_NB;
+ }
+
+ return AVCOL_SPC_UNSPECIFIED;
+}
+
+PL_LIBAV_API enum pl_color_levels pl_levels_from_av(enum AVColorRange range)
+{
+ switch (range) {
+ case AVCOL_RANGE_UNSPECIFIED: return PL_COLOR_LEVELS_UNKNOWN;
+ case AVCOL_RANGE_MPEG: return PL_COLOR_LEVELS_LIMITED;
+ case AVCOL_RANGE_JPEG: return PL_COLOR_LEVELS_FULL;
+ case AVCOL_RANGE_NB: return PL_COLOR_LEVELS_COUNT;
+ }
+
+ return PL_COLOR_LEVELS_UNKNOWN;
+}
+
+PL_LIBAV_API enum AVColorRange pl_levels_to_av(enum pl_color_levels levels)
+{
+ switch (levels) {
+ case PL_COLOR_LEVELS_UNKNOWN: return AVCOL_RANGE_UNSPECIFIED;
+ case PL_COLOR_LEVELS_LIMITED: return AVCOL_RANGE_MPEG;
+ case PL_COLOR_LEVELS_FULL: return AVCOL_RANGE_JPEG;
+ case PL_COLOR_LEVELS_COUNT: return AVCOL_RANGE_NB;
+ }
+
+ return AVCOL_RANGE_UNSPECIFIED;
+}
+
+PL_LIBAV_API enum pl_color_primaries pl_primaries_from_av(enum AVColorPrimaries prim)
+{
+ switch (prim) {
+ case AVCOL_PRI_RESERVED0: return PL_COLOR_PRIM_UNKNOWN;
+ case AVCOL_PRI_BT709: return PL_COLOR_PRIM_BT_709;
+ case AVCOL_PRI_UNSPECIFIED: return PL_COLOR_PRIM_UNKNOWN;
+ case AVCOL_PRI_RESERVED: return PL_COLOR_PRIM_UNKNOWN;
+ case AVCOL_PRI_BT470M: return PL_COLOR_PRIM_BT_470M;
+ case AVCOL_PRI_BT470BG: return PL_COLOR_PRIM_BT_601_625;
+ case AVCOL_PRI_SMPTE170M: return PL_COLOR_PRIM_BT_601_525;
+ case AVCOL_PRI_SMPTE240M: return PL_COLOR_PRIM_BT_601_525;
+ case AVCOL_PRI_FILM: return PL_COLOR_PRIM_FILM_C;
+ case AVCOL_PRI_BT2020: return PL_COLOR_PRIM_BT_2020;
+ case AVCOL_PRI_SMPTE428: return PL_COLOR_PRIM_CIE_1931;
+ case AVCOL_PRI_SMPTE431: return PL_COLOR_PRIM_DCI_P3;
+ case AVCOL_PRI_SMPTE432: return PL_COLOR_PRIM_DISPLAY_P3;
+ case AVCOL_PRI_JEDEC_P22: return PL_COLOR_PRIM_EBU_3213;
+ case AVCOL_PRI_NB: return PL_COLOR_PRIM_COUNT;
+ }
+
+ return PL_COLOR_PRIM_UNKNOWN;
+}
+
+PL_LIBAV_API enum AVColorPrimaries pl_primaries_to_av(enum pl_color_primaries prim)
+{
+ switch (prim) {
+ case PL_COLOR_PRIM_UNKNOWN: return AVCOL_PRI_UNSPECIFIED;
+ case PL_COLOR_PRIM_BT_601_525: return AVCOL_PRI_SMPTE170M;
+ case PL_COLOR_PRIM_BT_601_625: return AVCOL_PRI_BT470BG;
+ case PL_COLOR_PRIM_BT_709: return AVCOL_PRI_BT709;
+ case PL_COLOR_PRIM_BT_470M: return AVCOL_PRI_BT470M;
+ case PL_COLOR_PRIM_EBU_3213: return AVCOL_PRI_JEDEC_P22;
+ case PL_COLOR_PRIM_BT_2020: return AVCOL_PRI_BT2020;
+ case PL_COLOR_PRIM_APPLE: return AVCOL_PRI_UNSPECIFIED; // missing
+ case PL_COLOR_PRIM_ADOBE: return AVCOL_PRI_UNSPECIFIED; // missing
+ case PL_COLOR_PRIM_PRO_PHOTO: return AVCOL_PRI_UNSPECIFIED; // missing
+ case PL_COLOR_PRIM_CIE_1931: return AVCOL_PRI_SMPTE428;
+ case PL_COLOR_PRIM_DCI_P3: return AVCOL_PRI_SMPTE431;
+ case PL_COLOR_PRIM_DISPLAY_P3: return AVCOL_PRI_SMPTE432;
+ case PL_COLOR_PRIM_V_GAMUT: return AVCOL_PRI_UNSPECIFIED; // missing
+ case PL_COLOR_PRIM_S_GAMUT: return AVCOL_PRI_UNSPECIFIED; // missing
+ case PL_COLOR_PRIM_FILM_C: return AVCOL_PRI_FILM;
+ case PL_COLOR_PRIM_ACES_AP0: return AVCOL_PRI_UNSPECIFIED; // missing
+ case PL_COLOR_PRIM_ACES_AP1: return AVCOL_PRI_UNSPECIFIED; // missing
+ case PL_COLOR_PRIM_COUNT: return AVCOL_PRI_NB;
+ }
+
+ return AVCOL_PRI_UNSPECIFIED;
+}
+
+PL_LIBAV_API enum pl_color_transfer pl_transfer_from_av(enum AVColorTransferCharacteristic trc)
+{
+ switch (trc) {
+ case AVCOL_TRC_RESERVED0: return PL_COLOR_TRC_UNKNOWN;
+ case AVCOL_TRC_BT709: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case AVCOL_TRC_UNSPECIFIED: return PL_COLOR_TRC_UNKNOWN;
+ case AVCOL_TRC_RESERVED: return PL_COLOR_TRC_UNKNOWN;
+ case AVCOL_TRC_GAMMA22: return PL_COLOR_TRC_GAMMA22;
+ case AVCOL_TRC_GAMMA28: return PL_COLOR_TRC_GAMMA28;
+ case AVCOL_TRC_SMPTE170M: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case AVCOL_TRC_SMPTE240M: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case AVCOL_TRC_LINEAR: return PL_COLOR_TRC_LINEAR;
+ case AVCOL_TRC_LOG: return PL_COLOR_TRC_UNKNOWN; // missing
+ case AVCOL_TRC_LOG_SQRT: return PL_COLOR_TRC_UNKNOWN; // missing
+ case AVCOL_TRC_IEC61966_2_4: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case AVCOL_TRC_BT1361_ECG: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case AVCOL_TRC_IEC61966_2_1: return PL_COLOR_TRC_SRGB;
+ case AVCOL_TRC_BT2020_10: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case AVCOL_TRC_BT2020_12: return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+ case AVCOL_TRC_SMPTE2084: return PL_COLOR_TRC_PQ;
+ case AVCOL_TRC_SMPTE428: return PL_COLOR_TRC_ST428;
+ case AVCOL_TRC_ARIB_STD_B67: return PL_COLOR_TRC_HLG;
+ case AVCOL_TRC_NB: return PL_COLOR_TRC_COUNT;
+ }
+
+ return PL_COLOR_TRC_UNKNOWN;
+}
+
+PL_LIBAV_API enum AVColorTransferCharacteristic pl_transfer_to_av(enum pl_color_transfer trc)
+{
+ switch (trc) {
+ case PL_COLOR_TRC_UNKNOWN: return AVCOL_TRC_UNSPECIFIED;
+ case PL_COLOR_TRC_BT_1886: return AVCOL_TRC_BT709; // EOTF != OETF
+ case PL_COLOR_TRC_SRGB: return AVCOL_TRC_IEC61966_2_1;
+ case PL_COLOR_TRC_LINEAR: return AVCOL_TRC_LINEAR;
+ case PL_COLOR_TRC_GAMMA18: return AVCOL_TRC_UNSPECIFIED; // missing
+ case PL_COLOR_TRC_GAMMA20: return AVCOL_TRC_UNSPECIFIED; // missing
+ case PL_COLOR_TRC_GAMMA22: return AVCOL_TRC_GAMMA22;
+ case PL_COLOR_TRC_GAMMA24: return AVCOL_TRC_UNSPECIFIED; // missing
+ case PL_COLOR_TRC_GAMMA26: return AVCOL_TRC_UNSPECIFIED; // missing
+ case PL_COLOR_TRC_GAMMA28: return AVCOL_TRC_GAMMA28;
+ case PL_COLOR_TRC_ST428: return AVCOL_TRC_SMPTE428;
+ case PL_COLOR_TRC_PRO_PHOTO: return AVCOL_TRC_UNSPECIFIED; // missing
+ case PL_COLOR_TRC_PQ: return AVCOL_TRC_SMPTE2084;
+ case PL_COLOR_TRC_HLG: return AVCOL_TRC_ARIB_STD_B67;
+ case PL_COLOR_TRC_V_LOG: return AVCOL_TRC_UNSPECIFIED; // missing
+ case PL_COLOR_TRC_S_LOG1: return AVCOL_TRC_UNSPECIFIED; // missing
+ case PL_COLOR_TRC_S_LOG2: return AVCOL_TRC_UNSPECIFIED; // missing
+ case PL_COLOR_TRC_COUNT: return AVCOL_TRC_NB;
+ }
+
+ return AVCOL_TRC_UNSPECIFIED;
+}
+
+PL_LIBAV_API enum pl_chroma_location pl_chroma_from_av(enum AVChromaLocation loc)
+{
+ switch (loc) {
+ case AVCHROMA_LOC_UNSPECIFIED: return PL_CHROMA_UNKNOWN;
+ case AVCHROMA_LOC_LEFT: return PL_CHROMA_LEFT;
+ case AVCHROMA_LOC_CENTER: return PL_CHROMA_CENTER;
+ case AVCHROMA_LOC_TOPLEFT: return PL_CHROMA_TOP_LEFT;
+ case AVCHROMA_LOC_TOP: return PL_CHROMA_TOP_CENTER;
+ case AVCHROMA_LOC_BOTTOMLEFT: return PL_CHROMA_BOTTOM_LEFT;
+ case AVCHROMA_LOC_BOTTOM: return PL_CHROMA_BOTTOM_CENTER;
+ case AVCHROMA_LOC_NB: return PL_CHROMA_COUNT;
+ }
+
+ return PL_CHROMA_UNKNOWN;
+}
+
+PL_LIBAV_API enum AVChromaLocation pl_chroma_to_av(enum pl_chroma_location loc)
+{
+ switch (loc) {
+ case PL_CHROMA_UNKNOWN: return AVCHROMA_LOC_UNSPECIFIED;
+ case PL_CHROMA_LEFT: return AVCHROMA_LOC_LEFT;
+ case PL_CHROMA_CENTER: return AVCHROMA_LOC_CENTER;
+ case PL_CHROMA_TOP_LEFT: return AVCHROMA_LOC_TOPLEFT;
+ case PL_CHROMA_TOP_CENTER: return AVCHROMA_LOC_TOP;
+ case PL_CHROMA_BOTTOM_LEFT: return AVCHROMA_LOC_BOTTOMLEFT;
+ case PL_CHROMA_BOTTOM_CENTER: return AVCHROMA_LOC_BOTTOM;
+ case PL_CHROMA_COUNT: return AVCHROMA_LOC_NB;
+ }
+
+ return AVCHROMA_LOC_UNSPECIFIED;
+}
+
+#ifdef PL_HAVE_LAV_HDR
+PL_LIBAV_API void pl_map_hdr_metadata(struct pl_hdr_metadata *out,
+ const struct pl_av_hdr_metadata *data)
+{
+ if (data->mdm) {
+ if (data->mdm->has_luminance) {
+ out->max_luma = av_q2d(data->mdm->max_luminance);
+ out->min_luma = av_q2d(data->mdm->min_luminance);
+ if (out->max_luma < 10.0 || out->min_luma >= out->max_luma)
+ out->max_luma = out->min_luma = 0; /* sanity */
+ }
+ if (data->mdm->has_primaries) {
+ out->prim = (struct pl_raw_primaries) {
+ .red.x = av_q2d(data->mdm->display_primaries[0][0]),
+ .red.y = av_q2d(data->mdm->display_primaries[0][1]),
+ .green.x = av_q2d(data->mdm->display_primaries[1][0]),
+ .green.y = av_q2d(data->mdm->display_primaries[1][1]),
+ .blue.x = av_q2d(data->mdm->display_primaries[2][0]),
+ .blue.y = av_q2d(data->mdm->display_primaries[2][1]),
+ .white.x = av_q2d(data->mdm->white_point[0]),
+ .white.y = av_q2d(data->mdm->white_point[1]),
+ };
+ }
+ }
+
+ if (data->clm) {
+ out->max_cll = data->clm->MaxCLL;
+ out->max_fall = data->clm->MaxFALL;
+ }
+
+ if (data->dhp && data->dhp->application_version < 2) {
+ float hist_max = 0;
+ const AVHDRPlusColorTransformParams *pars = &data->dhp->params[0];
+ assert(data->dhp->num_windows > 0);
+ out->scene_max[0] = 10000 * av_q2d(pars->maxscl[0]);
+ out->scene_max[1] = 10000 * av_q2d(pars->maxscl[1]);
+ out->scene_max[2] = 10000 * av_q2d(pars->maxscl[2]);
+ out->scene_avg = 10000 * av_q2d(pars->average_maxrgb);
+
+ // Calculate largest value from histogram to use as fallback for clips
+ // with missing MaxSCL information. Note that this may end up picking
+ // the "reserved" value at the 5% percentile, which in practice appears
+ // to track the brightest pixel in the scene.
+ for (int i = 0; i < pars->num_distribution_maxrgb_percentiles; i++) {
+ float hist_val = av_q2d(pars->distribution_maxrgb[i].percentile);
+ if (hist_val > hist_max)
+ hist_max = hist_val;
+ }
+ hist_max *= 10000;
+ if (!out->scene_max[0])
+ out->scene_max[0] = hist_max;
+ if (!out->scene_max[1])
+ out->scene_max[1] = hist_max;
+ if (!out->scene_max[2])
+ out->scene_max[2] = hist_max;
+
+ if (pars->tone_mapping_flag == 1) {
+ out->ootf.target_luma = av_q2d(data->dhp->targeted_system_display_maximum_luminance);
+ out->ootf.knee_x = av_q2d(pars->knee_point_x);
+ out->ootf.knee_y = av_q2d(pars->knee_point_y);
+ assert(pars->num_bezier_curve_anchors < 16);
+ for (int i = 0; i < pars->num_bezier_curve_anchors; i++)
+ out->ootf.anchors[i] = av_q2d(pars->bezier_curve_anchors[i]);
+ out->ootf.num_anchors = pars->num_bezier_curve_anchors;
+ }
+ }
+}
+#endif // PL_HAVE_LAV_HDR
+
+static inline void *pl_get_side_data_raw(const AVFrame *frame,
+ enum AVFrameSideDataType type)
+{
+ const AVFrameSideData *sd = av_frame_get_side_data(frame, type);
+ return sd ? (void *) sd->data : NULL;
+}
+
+PL_LIBAV_API void pl_color_space_from_avframe(struct pl_color_space *out_csp,
+ const AVFrame *frame)
+{
+ *out_csp = (struct pl_color_space) {
+ .primaries = pl_primaries_from_av(frame->color_primaries),
+ .transfer = pl_transfer_from_av(frame->color_trc),
+ };
+
+#ifdef PL_HAVE_LAV_HDR
+ pl_map_hdr_metadata(&out_csp->hdr, &(struct pl_av_hdr_metadata) {
+ .mdm = pl_get_side_data_raw(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA),
+ .clm = pl_get_side_data_raw(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL),
+ .dhp = pl_get_side_data_raw(frame, AV_FRAME_DATA_DYNAMIC_HDR_PLUS),
+ });
+#endif
+}
+
+PL_LIBAV_API enum pl_field pl_field_from_avframe(const AVFrame *frame)
+{
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 7, 100)
+ if (!frame || !(frame->flags & AV_FRAME_FLAG_INTERLACED))
+ return PL_FIELD_NONE;
+ return (frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST)
+ ? PL_FIELD_TOP : PL_FIELD_BOTTOM;
+#else
+ if (!frame || !frame->interlaced_frame)
+ return PL_FIELD_NONE;
+ return frame->top_field_first ? PL_FIELD_TOP : PL_FIELD_BOTTOM;
+#endif
+}
+
+#ifdef PL_HAVE_LAV_FILM_GRAIN
+PL_LIBAV_API void pl_film_grain_from_av(struct pl_film_grain_data *out_data,
+ const AVFilmGrainParams *fgp)
+{
+ out_data->seed = fgp->seed;
+
+ switch (fgp->type) {
+ case AV_FILM_GRAIN_PARAMS_NONE: break;
+ case AV_FILM_GRAIN_PARAMS_AV1: {
+ const AVFilmGrainAOMParams *src = &fgp->codec.aom;
+ struct pl_av1_grain_data *dst = &out_data->params.av1;
+ out_data->type = PL_FILM_GRAIN_AV1;
+ *dst = (struct pl_av1_grain_data) {
+ .num_points_y = src->num_y_points,
+ .chroma_scaling_from_luma = src->chroma_scaling_from_luma,
+ .num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] },
+ .scaling_shift = src->scaling_shift,
+ .ar_coeff_lag = src->ar_coeff_lag,
+ .ar_coeff_shift = src->ar_coeff_shift,
+ .grain_scale_shift = src->grain_scale_shift,
+ .uv_mult = { src->uv_mult[0], src->uv_mult[1] },
+ .uv_mult_luma = { src->uv_mult_luma[0], src->uv_mult_luma[1] },
+ .uv_offset = { src->uv_offset[0], src->uv_offset[1] },
+ .overlap = src->overlap_flag,
+ };
+
+ assert(sizeof(dst->ar_coeffs_uv) == sizeof(src->ar_coeffs_uv));
+ memcpy(dst->points_y, src->y_points, sizeof(dst->points_y));
+ memcpy(dst->points_uv, src->uv_points, sizeof(dst->points_uv));
+ memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(dst->ar_coeffs_y));
+ memcpy(dst->ar_coeffs_uv, src->ar_coeffs_uv, sizeof(dst->ar_coeffs_uv));
+ break;
+ }
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 2, 100)
+ case AV_FILM_GRAIN_PARAMS_H274: {
+ const AVFilmGrainH274Params *src = &fgp->codec.h274;
+ struct pl_h274_grain_data *dst = &out_data->params.h274;
+ out_data->type = PL_FILM_GRAIN_H274;
+ *dst = (struct pl_h274_grain_data) {
+ .model_id = src->model_id,
+ .blending_mode_id = src->blending_mode_id,
+ .log2_scale_factor = src->log2_scale_factor,
+ .component_model_present = {
+ src->component_model_present[0],
+ src->component_model_present[1],
+ src->component_model_present[2],
+ },
+ .intensity_interval_lower_bound = {
+ src->intensity_interval_lower_bound[0],
+ src->intensity_interval_lower_bound[1],
+ src->intensity_interval_lower_bound[2],
+ },
+ .intensity_interval_upper_bound = {
+ src->intensity_interval_upper_bound[0],
+ src->intensity_interval_upper_bound[1],
+ src->intensity_interval_upper_bound[2],
+ },
+ .comp_model_value = {
+ src->comp_model_value[0],
+ src->comp_model_value[1],
+ src->comp_model_value[2],
+ },
+ };
+ memcpy(dst->num_intensity_intervals, src->num_intensity_intervals,
+ sizeof(dst->num_intensity_intervals));
+ memcpy(dst->num_model_values, src->num_model_values,
+ sizeof(dst->num_model_values));
+ break;
+ }
+#endif
+ }
+}
+#endif // PL_HAVE_LAV_FILM_GRAIN
+
+static inline int pl_plane_data_num_comps(const struct pl_plane_data *data)
+{
+ for (int i = 0; i < 4; i++) {
+ if (data->component_size[i] == 0)
+ return i;
+ }
+
+ return 4;
+}
+
+PL_LIBAV_API int pl_plane_data_from_pixfmt(struct pl_plane_data out_data[4],
+ struct pl_bit_encoding *out_bits,
+ enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ int planes = av_pix_fmt_count_planes(pix_fmt);
+ struct pl_plane_data aligned_data[4];
+ struct pl_bit_encoding bits;
+ bool first;
+ if (!desc || planes < 0) // e.g. AV_PIX_FMT_NONE
+ return 0;
+
+ if (desc->flags & AV_PIX_FMT_FLAG_BITSTREAM) {
+ // Bitstream formats will most likely never be supported
+ return 0;
+ }
+
+ if (desc->flags & AV_PIX_FMT_FLAG_PAL) {
+ // Palette formats are (currently) not supported
+ return 0;
+ }
+
+ if (desc->flags & AV_PIX_FMT_FLAG_BAYER) {
+ // Bayer formats don't have valid `desc->offset` values, so we can't
+ // use `pl_plane_data_from_mask` on them.
+ return 0;
+ }
+
+ if (desc->nb_components == 0 || desc->nb_components > 4) {
+ // Bogus components, possibly fake/virtual/hwaccel format?
+ return 0;
+ }
+
+ if (planes > 4)
+ return 0; // This shouldn't ever happen
+
+ // Fill in the details for each plane
+ for (int p = 0; p < planes; p++) {
+ struct pl_plane_data *data = &out_data[p];
+ int size[4] = {0};
+ int shift[4] = {0};
+ data->swapped = desc->flags & AV_PIX_FMT_FLAG_BE;
+ data->type = (desc->flags & AV_PIX_FMT_FLAG_FLOAT)
+ ? PL_FMT_FLOAT
+ : PL_FMT_UNORM;
+
+ data->pixel_stride = 0;
+
+ for (int c = 0; c < desc->nb_components; c++) {
+ const AVComponentDescriptor *comp = &desc->comp[c];
+ if (comp->plane != p)
+ continue;
+ if (data->swapped && comp->shift) {
+ // We cannot naively handle packed big endian formats because
+ // swapping the words also swaps the component order, so just
+ // exit out as a stupid safety measure
+ return 0;
+ }
+
+ size[c] = comp->depth;
+ shift[c] = comp->shift + comp->offset * 8;
+
+ if (data->pixel_stride && (int) data->pixel_stride != comp->step) {
+ // Pixel format contains components with different pixel strides
+ // (e.g. packed YUYV); this is currently not supported
+ return 0;
+ }
+ data->pixel_stride = comp->step;
+ }
+
+ pl_plane_data_from_comps(data, size, shift);
+ }
+
+ if (!out_bits)
+ return planes;
+
+ // Attempt aligning all of the planes for optimum compatibility
+ first = true;
+ for (int p = 0; p < planes; p++) {
+ aligned_data[p] = out_data[p];
+
+ // Planes with only an alpha component should be ignored
+ if (pl_plane_data_num_comps(&aligned_data[p]) == 1 &&
+ aligned_data[p].component_map[0] == PL_CHANNEL_A)
+ {
+ continue;
+ }
+
+ if (!pl_plane_data_align(&aligned_data[p], &bits))
+ goto misaligned;
+
+ if (first) {
+ *out_bits = bits;
+ first = false;
+ } else {
+ if (!pl_bit_encoding_equal(&bits, out_bits))
+ goto misaligned;
+ }
+ }
+
+ // Overwrite the planes by their aligned versions
+ for (int p = 0; p < planes; p++)
+ out_data[p] = aligned_data[p];
+
+ return planes;
+
+misaligned:
+ *out_bits = (struct pl_bit_encoding) {0};
+ return planes;
+}
+
+PL_LIBAV_API bool pl_test_pixfmt_caps(pl_gpu gpu, enum AVPixelFormat pixfmt,
+ enum pl_fmt_caps caps)
+{
+ struct pl_bit_encoding bits;
+ struct pl_plane_data data[4];
+ pl_fmt fmt;
+ int planes;
+
+ switch (pixfmt) {
+ case AV_PIX_FMT_DRM_PRIME:
+ case AV_PIX_FMT_VAAPI:
+ return gpu->import_caps.tex & PL_HANDLE_DMA_BUF;
+
+#ifdef PL_HAVE_LAV_VULKAN
+ case AV_PIX_FMT_VULKAN:
+ return pl_vulkan_get(gpu);
+#endif
+
+ default: break;
+ }
+
+ planes = pl_plane_data_from_pixfmt(data, &bits, pixfmt);
+ if (!planes)
+ return false;
+
+ for (int i = 0; i < planes; i++) {
+ data[i].row_stride = 0;
+ fmt = pl_plane_find_fmt(gpu, NULL, &data[i]);
+ if (!fmt || (fmt->caps & caps) != caps)
+ return false;
+ }
+
+ return true;
+}
+
+PL_LIBAV_API bool pl_test_pixfmt(pl_gpu gpu, enum AVPixelFormat pixfmt)
+{
+ return pl_test_pixfmt_caps(gpu, pixfmt, 0);
+}
+
+PL_LIBAV_API void pl_avframe_set_color(AVFrame *frame, struct pl_color_space csp)
+{
+ const AVFrameSideData *sd;
+ (void) sd;
+
+ frame->color_primaries = pl_primaries_to_av(csp.primaries);
+ frame->color_trc = pl_transfer_to_av(csp.transfer);
+
+#ifdef PL_HAVE_LAV_HDR
+ if (csp.hdr.max_cll) {
+ sd = av_frame_get_side_data(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
+ if (!sd) {
+ sd = av_frame_new_side_data(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL,
+ sizeof(AVContentLightMetadata));
+ }
+
+ if (sd) {
+ AVContentLightMetadata *clm = (AVContentLightMetadata *) sd->data;
+ *clm = (AVContentLightMetadata) {
+ .MaxCLL = csp.hdr.max_cll,
+ .MaxFALL = csp.hdr.max_fall,
+ };
+ }
+ }
+
+ if (csp.hdr.max_luma || csp.hdr.prim.red.x) {
+ sd = av_frame_get_side_data(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
+ if (!sd) {
+ sd = av_frame_new_side_data(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA,
+ sizeof(AVMasteringDisplayMetadata));
+ }
+
+ if (sd) {
+ AVMasteringDisplayMetadata *mdm = (AVMasteringDisplayMetadata *) sd->data;
+ *mdm = (AVMasteringDisplayMetadata) {
+ .max_luminance = av_d2q(csp.hdr.max_luma, 1000000),
+ .min_luminance = av_d2q(csp.hdr.min_luma, 1000000),
+ .has_luminance = !!csp.hdr.max_luma,
+ .display_primaries = {
+ {
+ av_d2q(csp.hdr.prim.red.x, 1000000),
+ av_d2q(csp.hdr.prim.red.y, 1000000),
+ }, {
+ av_d2q(csp.hdr.prim.green.x, 1000000),
+ av_d2q(csp.hdr.prim.green.y, 1000000),
+ }, {
+ av_d2q(csp.hdr.prim.blue.x, 1000000),
+ av_d2q(csp.hdr.prim.blue.y, 1000000),
+ }
+ },
+ .white_point = {
+ av_d2q(csp.hdr.prim.white.x, 1000000),
+ av_d2q(csp.hdr.prim.white.y, 1000000),
+ },
+ .has_primaries = !!csp.hdr.prim.red.x,
+ };
+ }
+ }
+#endif // PL_HAVE_LAV_HDR
+}
+
+PL_LIBAV_API void pl_avframe_set_repr(AVFrame *frame, struct pl_color_repr repr)
+{
+ frame->colorspace = pl_system_to_av(repr.sys);
+ frame->color_range = pl_levels_to_av(repr.levels);
+
+ // No real way to map repr.bits, the image format already has to match
+}
+
+PL_LIBAV_API void pl_avframe_set_profile(AVFrame *frame, struct pl_icc_profile profile)
+{
+ const AVFrameSideData *sd;
+ av_frame_remove_side_data(frame, AV_FRAME_DATA_ICC_PROFILE);
+
+ if (!profile.len)
+ return;
+
+ sd = av_frame_new_side_data(frame, AV_FRAME_DATA_ICC_PROFILE, profile.len);
+ memcpy(sd->data, profile.data, profile.len);
+}
+
+PL_LIBAV_API void pl_frame_from_avframe(struct pl_frame *out,
+ const AVFrame *frame)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+ int planes = av_pix_fmt_count_planes(frame->format);
+ const AVFrameSideData *sd;
+ assert(desc);
+
+ if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) {
+ const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data;
+ desc = av_pix_fmt_desc_get(hwfc->sw_format);
+ planes = av_pix_fmt_count_planes(hwfc->sw_format);
+ }
+
+ // This should never fail, and there's nothing really useful we can do in
+ // this failure case anyway, since this is a `void` function.
+ assert(planes <= 4);
+
+ *out = (struct pl_frame) {
+ .num_planes = planes,
+ .crop = {
+ .x0 = frame->crop_left,
+ .y0 = frame->crop_top,
+ .x1 = frame->width - frame->crop_right,
+ .y1 = frame->height - frame->crop_bottom,
+ },
+ .repr = {
+ .sys = pl_system_from_av(frame->colorspace),
+ .levels = pl_levels_from_av(frame->color_range),
+ .alpha = (desc->flags & AV_PIX_FMT_FLAG_ALPHA)
+ ? PL_ALPHA_INDEPENDENT
+ : PL_ALPHA_UNKNOWN,
+
+ // For the sake of simplicity, just use the first component's depth as
+ // the authoritative color depth for the whole image. Usually, this
+ // will be overwritten by more specific information when using e.g.
+ // `pl_map_avframe`, but for the sake of e.g. users wishing to map
+ // hwaccel frames manually, this is a good default.
+ .bits.color_depth = desc->comp[0].depth,
+ },
+ };
+
+ pl_color_space_from_avframe(&out->color, frame);
+
+ if (frame->colorspace == AVCOL_SPC_ICTCP &&
+ frame->color_trc == AVCOL_TRC_ARIB_STD_B67)
+ {
+ // libav* makes no distinction between PQ and HLG ICtCp, so we need
+ // to manually fix it in the case that we have HLG ICtCp data.
+ out->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG;
+
+ } else if (strncmp(desc->name, "xyz", 3) == 0) {
+
+ // libav* handles this as a special case, but doesn't provide an
+ // explicit flag for it either, so we have to resort to this ugly
+ // hack...
+ out->repr.sys = PL_COLOR_SYSTEM_XYZ;
+
+ } else if (desc->flags & AV_PIX_FMT_FLAG_RGB) {
+
+ out->repr.sys = PL_COLOR_SYSTEM_RGB;
+ out->repr.levels = PL_COLOR_LEVELS_FULL; // libav* ignores levels for RGB
+
+ } else if (!pl_color_system_is_ycbcr_like(out->repr.sys)) {
+ // libav* likes leaving this as UNKNOWN (or even RGB) for YCbCr frames,
+ // which confuses libplacebo since we infer UNKNOWN as RGB. To get
+ // around this, explicitly infer a suitable colorspace.
+ out->repr.sys = pl_color_system_guess_ycbcr(frame->width, frame->height);
+ }
+
+ if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_ICC_PROFILE))) {
+ out->profile = (struct pl_icc_profile) {
+ .data = sd->data,
+ .len = sd->size,
+ };
+
+ // Needed to ensure profile uniqueness
+ pl_icc_profile_compute_signature(&out->profile);
+ }
+
+ if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX))) {
+ double rot = av_display_rotation_get((const int32_t *) sd->data);
+ out->rotation = pl_rotation_normalize(4.5 - rot / 90.0);
+ }
+
+#ifdef PL_HAVE_LAV_FILM_GRAIN
+ if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_FILM_GRAIN_PARAMS)))
+ pl_film_grain_from_av(&out->film_grain, (AVFilmGrainParams *) sd->data);
+#endif // PL_HAVE_LAV_FILM_GRAIN
+
+ for (int p = 0; p < out->num_planes; p++) {
+ struct pl_plane *plane = &out->planes[p];
+
+ // Fill in the component mapping array
+ for (int c = 0; c < desc->nb_components; c++) {
+ if (desc->comp[c].plane == p)
+ plane->component_mapping[plane->components++] = c;
+ }
+
+ // Clear the superfluous components
+ for (int c = plane->components; c < 4; c++)
+ plane->component_mapping[c] = PL_CHANNEL_NONE;
+ }
+
+ // Only set the chroma location for definitely subsampled images, makes no
+ // sense otherwise
+ if (desc->log2_chroma_w || desc->log2_chroma_h) {
+ enum pl_chroma_location loc = pl_chroma_from_av(frame->chroma_location);
+ pl_frame_set_chroma_location(out, loc);
+ }
+}
+
+#if LIBAVFORMAT_VERSION_INT >= AV_VERSION_INT(60, 15, 100)
+PL_LIBAV_API const uint8_t *pl_av_stream_get_side_data(const AVStream *st,
+ enum AVPacketSideDataType type)
+{
+ const AVPacketSideData *sd;
+ sd = av_packet_side_data_get(st->codecpar->coded_side_data,
+ st->codecpar->nb_coded_side_data,
+ type);
+ return sd ? sd->data : NULL;
+}
+#else
+# define pl_av_stream_get_side_data(st, type) av_stream_get_side_data(st, type, NULL)
+#endif
+
+PL_LIBAV_API void pl_frame_copy_stream_props(struct pl_frame *out,
+ const AVStream *stream)
+{
+ const uint8_t *sd;
+ if ((sd = pl_av_stream_get_side_data(stream, AV_PKT_DATA_DISPLAYMATRIX))) {
+ double rot = av_display_rotation_get((const int32_t *) sd);
+ out->rotation = pl_rotation_normalize(4.5 - rot / 90.0);
+ }
+
+#ifdef PL_HAVE_LAV_HDR
+ pl_map_hdr_metadata(&out->color.hdr, &(struct pl_av_hdr_metadata) {
+ .mdm = (void *) pl_av_stream_get_side_data(stream,
+ AV_PKT_DATA_MASTERING_DISPLAY_METADATA),
+ .clm = (void *) pl_av_stream_get_side_data(stream,
+ AV_PKT_DATA_CONTENT_LIGHT_LEVEL),
+# if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(59, 2, 100)
+ .dhp = (void *) pl_av_stream_get_side_data(stream,
+ AV_PKT_DATA_DYNAMIC_HDR10_PLUS),
+# endif
+ });
+#endif
+}
+
+#undef pl_av_stream_get_side_data
+
+#ifdef PL_HAVE_LAV_DOLBY_VISION
+PL_LIBAV_API void pl_map_dovi_metadata(struct pl_dovi_metadata *out,
+ const AVDOVIMetadata *data)
+{
+ const AVDOVIRpuDataHeader *header;
+ const AVDOVIDataMapping *mapping;
+ const AVDOVIColorMetadata *color;
+ if (!data)
+ return;
+
+ header = av_dovi_get_header(data);
+ mapping = av_dovi_get_mapping(data);
+ color = av_dovi_get_color(data);
+
+ for (int i = 0; i < 3; i++)
+ out->nonlinear_offset[i] = av_q2d(color->ycc_to_rgb_offset[i]);
+ for (int i = 0; i < 9; i++) {
+ float *nonlinear = &out->nonlinear.m[0][0];
+ float *linear = &out->linear.m[0][0];
+ nonlinear[i] = av_q2d(color->ycc_to_rgb_matrix[i]);
+ linear[i] = av_q2d(color->rgb_to_lms_matrix[i]);
+ }
+ for (int c = 0; c < 3; c++) {
+ const AVDOVIReshapingCurve *csrc = &mapping->curves[c];
+ struct pl_reshape_data *cdst = &out->comp[c];
+ cdst->num_pivots = csrc->num_pivots;
+ for (int i = 0; i < csrc->num_pivots; i++) {
+ const float scale = 1.0f / ((1 << header->bl_bit_depth) - 1);
+ cdst->pivots[i] = scale * csrc->pivots[i];
+ }
+ for (int i = 0; i < csrc->num_pivots - 1; i++) {
+ const float scale = 1.0f / (1 << header->coef_log2_denom);
+ cdst->method[i] = csrc->mapping_idc[i];
+ switch (csrc->mapping_idc[i]) {
+ case AV_DOVI_MAPPING_POLYNOMIAL:
+ for (int k = 0; k < 3; k++) {
+ cdst->poly_coeffs[i][k] = (k <= csrc->poly_order[i])
+ ? scale * csrc->poly_coef[i][k]
+ : 0.0f;
+ }
+ break;
+ case AV_DOVI_MAPPING_MMR:
+ cdst->mmr_order[i] = csrc->mmr_order[i];
+ cdst->mmr_constant[i] = scale * csrc->mmr_constant[i];
+ for (int j = 0; j < csrc->mmr_order[i]; j++) {
+ for (int k = 0; k < 7; k++)
+ cdst->mmr_coeffs[i][j][k] = scale * csrc->mmr_coef[i][j][k];
+ }
+ break;
+ }
+ }
+ }
+}
+
+PL_LIBAV_API void pl_frame_map_avdovi_metadata(struct pl_frame *out_frame,
+ struct pl_dovi_metadata *dovi,
+ const AVDOVIMetadata *metadata)
+{
+ const AVDOVIRpuDataHeader *header;
+ const AVDOVIColorMetadata *color;
+ if (!dovi || !metadata)
+ return;
+
+ header = av_dovi_get_header(metadata);
+ color = av_dovi_get_color(metadata);
+ if (header->disable_residual_flag) {
+ pl_map_dovi_metadata(dovi, metadata);
+
+ out_frame->repr.dovi = dovi;
+ out_frame->repr.sys = PL_COLOR_SYSTEM_DOLBYVISION;
+ out_frame->color.primaries = PL_COLOR_PRIM_BT_2020;
+ out_frame->color.transfer = PL_COLOR_TRC_PQ;
+ out_frame->color.hdr.min_luma =
+ pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, color->source_min_pq / 4095.0f);
+ out_frame->color.hdr.max_luma =
+ pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, color->source_max_pq / 4095.0f);
+ }
+}
+#endif // PL_HAVE_LAV_DOLBY_VISION
+
+PL_LIBAV_API bool pl_frame_recreate_from_avframe(pl_gpu gpu,
+ struct pl_frame *out,
+ pl_tex tex[4],
+ const AVFrame *frame)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+ struct pl_plane_data data[4] = {0};
+ int planes;
+
+ pl_frame_from_avframe(out, frame);
+ planes = pl_plane_data_from_pixfmt(data, &out->repr.bits, frame->format);
+ if (!planes)
+ return false;
+
+ for (int p = 0; p < planes; p++) {
+ bool is_chroma = p == 1 || p == 2; // matches lavu logic
+ data[p].width = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0);
+ data[p].height = AV_CEIL_RSHIFT(frame->height, is_chroma ? desc->log2_chroma_h : 0);
+
+ if (!pl_recreate_plane(gpu, &out->planes[p], &tex[p], &data[p]))
+ return false;
+ }
+
+ return true;
+}
+
+static void pl_avframe_free_cb(void *priv)
+{
+ AVFrame *frame = priv;
+ av_frame_free(&frame);
+}
+
+#define PL_MAGIC0 0xfb5b3b8b
+#define PL_MAGIC1 0xee659f6d
+
+struct pl_avalloc {
+ uint32_t magic[2];
+ pl_gpu gpu;
+ pl_buf buf;
+};
+
+// Attached to `pl_frame.user_data` for mapped AVFrames
+struct pl_avframe_priv {
+ AVFrame *avframe;
+ struct pl_dovi_metadata dovi; // backing storage for per-frame dovi metadata
+ pl_tex planar; // for planar vulkan textures
+};
+
+static void pl_fix_hwframe_sample_depth(struct pl_frame *out, const AVFrame *frame)
+{
+ const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data;
+ pl_fmt fmt = out->planes[0].texture->params.format;
+ struct pl_bit_encoding *bits = &out->repr.bits;
+
+ bits->sample_depth = fmt->component_depth[0];
+
+ switch (hwfc->sw_format) {
+ case AV_PIX_FMT_P010: bits->bit_shift = 6; break;
+ default: break;
+ }
+}
+
+static bool pl_map_avframe_drm(pl_gpu gpu, struct pl_frame *out,
+ const AVFrame *frame)
+{
+ const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format);
+ const AVDRMFrameDescriptor *drm = (AVDRMFrameDescriptor *) frame->data[0];
+ assert(frame->format == AV_PIX_FMT_DRM_PRIME);
+ if (!(gpu->import_caps.tex & PL_HANDLE_DMA_BUF))
+ return false;
+
+ assert(drm->nb_layers >= out->num_planes);
+ for (int n = 0; n < out->num_planes; n++) {
+ const AVDRMLayerDescriptor *layer = &drm->layers[n];
+ const AVDRMPlaneDescriptor *plane = &layer->planes[0];
+ const AVDRMObjectDescriptor *object = &drm->objects[plane->object_index];
+ pl_fmt fmt = pl_find_fourcc(gpu, layer->format);
+ bool is_chroma = n == 1 || n == 2;
+ if (!fmt || !pl_fmt_has_modifier(fmt, object->format_modifier))
+ return false;
+
+ assert(layer->nb_planes == 1); // we only support planar formats
+ assert(plane->pitch >= 0); // definitely requires special handling
+ out->planes[n].texture = pl_tex_create(gpu, pl_tex_params(
+ .w = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0),
+ .h = AV_CEIL_RSHIFT(frame->height, is_chroma ? desc->log2_chroma_h : 0),
+ .format = fmt,
+ .sampleable = true,
+ .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE,
+ .import_handle = PL_HANDLE_DMA_BUF,
+ .shared_mem = {
+ .handle.fd = object->fd,
+ .size = object->size,
+ .offset = plane->offset,
+ .drm_format_mod = object->format_modifier,
+ .stride_w = plane->pitch,
+ },
+ ));
+ if (!out->planes[n].texture)
+ return false;
+ }
+
+ pl_fix_hwframe_sample_depth(out, frame);
+ return true;
+}
+
+// Derive a DMABUF from any other hwaccel format, and map that instead
+static bool pl_map_avframe_derived(pl_gpu gpu, struct pl_frame *out,
+ const AVFrame *frame)
+{
+ const int flags = AV_HWFRAME_MAP_READ | AV_HWFRAME_MAP_DIRECT;
+ struct pl_avframe_priv *priv = out->user_data;
+ AVFrame *derived = av_frame_alloc();
+ derived->width = frame->width;
+ derived->height = frame->height;
+ derived->format = AV_PIX_FMT_DRM_PRIME;
+ derived->hw_frames_ctx = av_buffer_ref(frame->hw_frames_ctx);
+ if (av_hwframe_map(derived, frame, flags) < 0)
+ goto error;
+ if (av_frame_copy_props(derived, frame) < 0)
+ goto error;
+ if (!pl_map_avframe_drm(gpu, out, derived))
+ goto error;
+
+ av_frame_free(&priv->avframe);
+ priv->avframe = derived;
+ return true;
+
+error:
+ av_frame_free(&derived);
+ return false;
+}
+
+#ifdef PL_HAVE_LAV_VULKAN
+static bool pl_acquire_avframe(pl_gpu gpu, struct pl_frame *frame)
+{
+ const struct pl_avframe_priv *priv = frame->user_data;
+ AVHWFramesContext *hwfc = (void *) priv->avframe->hw_frames_ctx->data;
+ AVVulkanFramesContext *vkfc = hwfc->hwctx;
+ AVVkFrame *vkf = (AVVkFrame *) priv->avframe->data[0];
+
+#ifdef PL_HAVE_LAV_VULKAN_V2
+ vkfc->lock_frame(hwfc, vkf);
+#else
+ (void) vkfc;
+#endif
+
+ for (int n = 0; n < frame->num_planes; n++) {
+ pl_vulkan_release_ex(gpu, pl_vulkan_release_params(
+ .tex = priv->planar ? priv->planar : frame->planes[n].texture,
+ .layout = vkf->layout[n],
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ .semaphore = {
+ .sem = vkf->sem[n],
+ .value = vkf->sem_value[n],
+ },
+ ));
+ if (priv->planar)
+ break;
+ }
+
+ return true;
+}
+
+static void pl_release_avframe(pl_gpu gpu, struct pl_frame *frame)
+{
+ const struct pl_avframe_priv *priv = frame->user_data;
+ AVHWFramesContext *hwfc = (void *) priv->avframe->hw_frames_ctx->data;
+ AVVulkanFramesContext *vkfc = hwfc->hwctx;
+ AVVkFrame *vkf = (AVVkFrame *) priv->avframe->data[0];
+
+ for (int n = 0; n < frame->num_planes; n++) {
+ int ok = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ .tex = priv->planar ? priv->planar : frame->planes[n].texture,
+ .out_layout = &vkf->layout[n],
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ .semaphore = {
+ .sem = vkf->sem[n],
+ .value = vkf->sem_value[n] + 1,
+ },
+ ));
+
+ vkf->access[n] = 0;
+ vkf->sem_value[n] += !!ok;
+ if (priv->planar)
+ break;
+ }
+
+#ifdef PL_HAVE_LAV_VULKAN_V2
+ vkfc->unlock_frame(hwfc, vkf);
+#else
+ (void) vkfc;
+#endif
+}
+
+static bool pl_map_avframe_vulkan(pl_gpu gpu, struct pl_frame *out,
+ const AVFrame *frame)
+{
+ const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format);
+ const AVVulkanFramesContext *vkfc = hwfc->hwctx;
+ AVVkFrame *vkf = (AVVkFrame *) frame->data[0];
+ struct pl_avframe_priv *priv = out->user_data;
+ pl_vulkan vk = pl_vulkan_get(gpu);
+
+#ifdef PL_HAVE_LAV_VULKAN_V2
+ const VkFormat *vk_fmt = vkfc->format;
+#else
+ const VkFormat *vk_fmt = av_vkfmt_from_pixfmt(hwfc->sw_format);
+#endif
+
+ assert(frame->format == AV_PIX_FMT_VULKAN);
+ priv->planar = NULL;
+ if (!vk)
+ return false;
+
+ for (int n = 0; n < out->num_planes; n++) {
+ struct pl_plane *plane = &out->planes[n];
+ bool chroma = n == 1 || n == 2;
+ int num_subplanes;
+ assert(vk_fmt[n]);
+
+ plane->texture = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+ .image = vkf->img[n],
+ .width = AV_CEIL_RSHIFT(hwfc->width, chroma ? desc->log2_chroma_w : 0),
+ .height = AV_CEIL_RSHIFT(hwfc->height, chroma ? desc->log2_chroma_h : 0),
+ .format = vk_fmt[n],
+ .usage = vkfc->usage,
+ ));
+ if (!plane->texture)
+ return false;
+
+ num_subplanes = plane->texture->params.format->num_planes;
+ if (num_subplanes) {
+ assert(num_subplanes == out->num_planes);
+ priv->planar = plane->texture;
+ for (int i = 0; i < num_subplanes; i++)
+ out->planes[i].texture = priv->planar->planes[i];
+ break;
+ }
+ }
+
+ out->acquire = pl_acquire_avframe;
+ out->release = pl_release_avframe;
+ pl_fix_hwframe_sample_depth(out, frame);
+ return true;
+}
+
+static void pl_unmap_avframe_vulkan(pl_gpu gpu, struct pl_frame *frame)
+{
+ struct pl_avframe_priv *priv = frame->user_data;
+ if (priv->planar) {
+ pl_tex_destroy(gpu, &priv->planar);
+ for (int n = 0; n < frame->num_planes; n++)
+ frame->planes[n].texture = NULL;
+ }
+}
+#endif
+
+PL_LIBAV_API bool pl_map_avframe_ex(pl_gpu gpu, struct pl_frame *out,
+ const struct pl_avframe_params *params)
+{
+ const AVFrame *frame = params->frame;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+ struct pl_plane_data data[4] = {0};
+ pl_tex *tex = params->tex;
+ int planes;
+
+ struct pl_avframe_priv *priv = malloc(sizeof(*priv));
+ if (!priv)
+ goto error;
+
+ pl_frame_from_avframe(out, frame);
+ priv->avframe = av_frame_clone(frame);
+ out->user_data = priv;
+
+#ifdef PL_HAVE_LAV_DOLBY_VISION
+ if (params->map_dovi) {
+ AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DOVI_METADATA);
+ if (sd) {
+ const AVDOVIMetadata *metadata = (AVDOVIMetadata *) sd->data;
+ const AVDOVIRpuDataHeader *header = av_dovi_get_header(metadata);
+ // Only automatically map DoVi RPUs that don't require an EL
+ if (header->disable_residual_flag)
+ pl_frame_map_avdovi_metadata(out, &priv->dovi, metadata);
+ }
+
+#ifdef PL_HAVE_LIBDOVI
+ sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DOVI_RPU_BUFFER);
+ if (sd)
+ pl_hdr_metadata_from_dovi_rpu(&out->color.hdr, sd->buf->data, sd->buf->size);
+#endif // PL_HAVE_LIBDOVI
+ }
+
+#endif // PL_HAVE_LAV_DOLBY_VISION
+
+ switch (frame->format) {
+ case AV_PIX_FMT_DRM_PRIME:
+ if (!pl_map_avframe_drm(gpu, out, frame))
+ goto error;
+ return true;
+
+ case AV_PIX_FMT_VAAPI:
+ if (!pl_map_avframe_derived(gpu, out, frame))
+ goto error;
+ return true;
+
+#ifdef PL_HAVE_LAV_VULKAN
+ case AV_PIX_FMT_VULKAN:
+ if (!pl_map_avframe_vulkan(gpu, out, frame))
+ goto error;
+ return true;
+#endif
+
+ default: break;
+ }
+
+ // Backing textures are required from this point onwards
+ if (!tex)
+ goto error;
+
+ planes = pl_plane_data_from_pixfmt(data, &out->repr.bits, frame->format);
+ if (!planes)
+ goto error;
+
+ for (int p = 0; p < planes; p++) {
+ AVBufferRef *buf = av_frame_get_plane_buffer((AVFrame *) frame, p);
+ struct pl_avalloc *alloc = buf ? av_buffer_get_opaque(buf) : NULL;
+ bool is_chroma = p == 1 || p == 2; // matches lavu logic
+
+ data[p].width = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0);
+ data[p].height = AV_CEIL_RSHIFT(frame->height, is_chroma ? desc->log2_chroma_h : 0);
+ if (frame->linesize[p] < 0) {
+ data[p].pixels = frame->data[p] + frame->linesize[p] * (data[p].height - 1);
+ data[p].row_stride = -frame->linesize[p];
+ out->planes[p].flipped = true;
+ } else {
+ data[p].pixels = frame->data[p];
+ data[p].row_stride = frame->linesize[p];
+ }
+
+ // Probe for frames allocated by pl_get_buffer2
+ if (alloc && alloc->magic[0] == PL_MAGIC0 && alloc->magic[1] == PL_MAGIC1) {
+ data[p].buf = alloc->buf;
+ data[p].buf_offset = (uintptr_t) data[p].pixels - (uintptr_t) alloc->buf->data;
+ data[p].pixels = NULL;
+ } else if (gpu->limits.callbacks) {
+ // Use asynchronous upload if possible
+ data[p].callback = pl_avframe_free_cb;
+ data[p].priv = av_frame_clone(frame);
+ }
+
+ if (!pl_upload_plane(gpu, &out->planes[p], &tex[p], &data[p])) {
+ av_frame_free((AVFrame **) &data[p].priv);
+ goto error;
+ }
+
+ out->planes[p].texture = tex[p];
+ }
+
+ return true;
+
+error:
+ pl_unmap_avframe(gpu, out);
+ return false;
+}
+
+// Backwards compatibility with previous versions of this API.
+PL_LIBAV_API bool pl_map_avframe(pl_gpu gpu, struct pl_frame *out_frame,
+ pl_tex tex[4], const AVFrame *avframe)
+{
+ return pl_map_avframe_ex(gpu, out_frame, &(struct pl_avframe_params) {
+ .frame = avframe,
+ .tex = tex,
+ });
+}
+
+PL_LIBAV_API void pl_unmap_avframe(pl_gpu gpu, struct pl_frame *frame)
+{
+ struct pl_avframe_priv *priv = frame->user_data;
+ const AVPixFmtDescriptor *desc;
+ if (!priv)
+ goto done;
+
+#ifdef PL_HAVE_LAV_VULKAN
+ if (priv->avframe->format == AV_PIX_FMT_VULKAN)
+ pl_unmap_avframe_vulkan(gpu, frame);
+#endif
+
+ desc = av_pix_fmt_desc_get(priv->avframe->format);
+ if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) {
+ for (int i = 0; i < 4; i++)
+ pl_tex_destroy(gpu, &frame->planes[i].texture);
+ }
+
+ av_frame_free(&priv->avframe);
+ free(priv);
+
+done:
+ memset(frame, 0, sizeof(*frame)); // sanity
+}
+
+PL_LIBAV_API AVFrame *pl_get_mapped_avframe(const struct pl_frame *frame)
+{
+ struct pl_avframe_priv *priv = frame->user_data;
+ return priv->avframe;
+}
+
+static void pl_done_cb(void *priv)
+{
+ bool *status = priv;
+ *status = true;
+}
+
+PL_LIBAV_API bool pl_download_avframe(pl_gpu gpu,
+ const struct pl_frame *frame,
+ AVFrame *out_frame)
+{
+ bool done[4] = {0};
+ if (frame->num_planes != av_pix_fmt_count_planes(out_frame->format))
+ return false;
+
+ for (int p = 0; p < frame->num_planes; p++) {
+ bool ok = pl_tex_download(gpu, pl_tex_transfer_params(
+ .tex = frame->planes[p].texture,
+ .row_pitch = out_frame->linesize[p],
+ .ptr = out_frame->data[p],
+ // Use synchronous transfer for the last plane
+ .callback = (p+1) < frame->num_planes ? pl_done_cb : NULL,
+ .priv = &done[p],
+ ));
+
+ if (!ok)
+ return false;
+ }
+
+ for (int p = 0; p < frame->num_planes - 1; p++) {
+ while (!done[p])
+ pl_tex_poll(gpu, frame->planes[p].texture, UINT64_MAX);
+ }
+
+ return true;
+}
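+
+// A rough usage sketch for `pl_download_avframe` (assuming `image` is a
+// `struct pl_frame` whose plane textures were created host-readable, and that
+// its plane layout matches the chosen AVPixelFormat; `width`/`height` are
+// placeholders):
+//
+//     AVFrame *out = av_frame_alloc();
+//     out->format  = AV_PIX_FMT_YUV420P;
+//     out->width   = width;
+//     out->height  = height;
+//     if (av_frame_get_buffer(out, 0) < 0 || !pl_download_avframe(gpu, &image, out))
+//         abort(); // allocation or download failed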
+
+#define PL_DIV_UP(x, y) (((x) + (y) - 1) / (y))
+#define PL_ALIGN(x, align) ((align) ? PL_DIV_UP(x, align) * (align) : (x))
+#define PL_MAX(x, y) ((x) > (y) ? (x) : (y))
+#define PL_LCM(x, y) ((x) * ((y) / av_gcd(x, y)))
+
+static inline void pl_avalloc_free(void *opaque, uint8_t *data)
+{
+ struct pl_avalloc *alloc = opaque;
+ assert(alloc->magic[0] == PL_MAGIC0);
+ assert(alloc->magic[1] == PL_MAGIC1);
+ assert(alloc->buf->data == data);
+ pl_buf_destroy(alloc->gpu, &alloc->buf);
+ free(alloc);
+}
+
+PL_LIBAV_API int pl_get_buffer2(AVCodecContext *avctx, AVFrame *pic, int flags)
+{
+ int alignment[AV_NUM_DATA_POINTERS];
+ int width = pic->width;
+ int height = pic->height;
+ size_t planesize[4];
+ int ret = 0;
+
+ pl_gpu *pgpu = avctx->opaque;
+ pl_gpu gpu = pgpu ? *pgpu : NULL;
+ struct pl_plane_data data[4];
+ struct pl_avalloc *alloc;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pic->format);
+ int planes = pl_plane_data_from_pixfmt(data, NULL, pic->format);
+
+ // Sanitize frame structs
+ memset(pic->data, 0, sizeof(pic->data));
+ memset(pic->linesize, 0, sizeof(pic->linesize));
+ memset(pic->buf, 0, sizeof(pic->buf));
+ pic->extended_data = pic->data;
+ pic->extended_buf = NULL;
+
+ if (!(avctx->codec->capabilities & AV_CODEC_CAP_DR1) || !planes)
+ goto fallback;
+ if (!gpu || !gpu->limits.thread_safe || !gpu->limits.max_mapped_size ||
+ !gpu->limits.host_cached)
+ {
+ goto fallback;
+ }
+
+ avcodec_align_dimensions2(avctx, &width, &height, alignment);
+ if ((ret = av_image_fill_linesizes(pic->linesize, pic->format, width)))
+ return ret;
+
+ for (int p = 0; p < planes; p++) {
+ alignment[p] = PL_LCM(alignment[p], gpu->limits.align_tex_xfer_pitch);
+ alignment[p] = PL_LCM(alignment[p], gpu->limits.align_tex_xfer_offset);
+ alignment[p] = PL_LCM(alignment[p], data[p].pixel_stride);
+ pic->linesize[p] = PL_ALIGN(pic->linesize[p], alignment[p]);
+ }
+
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 56, 100)
+ ret = av_image_fill_plane_sizes(planesize, pic->format, height, (ptrdiff_t[4]) {
+ pic->linesize[0], pic->linesize[1], pic->linesize[2], pic->linesize[3],
+ });
+ if (ret < 0)
+ return ret;
+#else
+ uint8_t *ptrs[4], * const base = (uint8_t *) 0x10000;
+ ret = av_image_fill_pointers(ptrs, pic->format, height, base, pic->linesize);
+ if (ret < 0)
+ return ret;
+ for (int p = 0; p < 4; p++)
+ planesize[p] = (uintptr_t) ptrs[p] - (uintptr_t) base;
+#endif
+
+ for (int p = 0; p < planes; p++) {
+ const size_t buf_size = planesize[p] + alignment[p];
+ if (buf_size > gpu->limits.max_mapped_size) {
+ av_frame_unref(pic);
+ goto fallback;
+ }
+
+ alloc = malloc(sizeof(*alloc));
+ if (!alloc) {
+ av_frame_unref(pic);
+ return AVERROR(ENOMEM);
+ }
+
+ *alloc = (struct pl_avalloc) {
+ .magic = { PL_MAGIC0, PL_MAGIC1 },
+ .gpu = gpu,
+ .buf = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .memory_type = PL_BUF_MEM_HOST,
+ .host_mapped = true,
+ .storable = desc->flags & AV_PIX_FMT_FLAG_BE,
+ )),
+ };
+
+ if (!alloc->buf) {
+ free(alloc);
+ av_frame_unref(pic);
+ return AVERROR(ENOMEM);
+ }
+
+ pic->data[p] = (uint8_t *) PL_ALIGN((uintptr_t) alloc->buf->data, alignment[p]);
+ pic->buf[p] = av_buffer_create(alloc->buf->data, buf_size, pl_avalloc_free, alloc, 0);
+ if (!pic->buf[p]) {
+ pl_buf_destroy(gpu, &alloc->buf);
+ free(alloc);
+ av_frame_unref(pic);
+ return AVERROR(ENOMEM);
+ }
+ }
+
+ return 0;
+
+fallback:
+ return avcodec_default_get_buffer2(avctx, pic, flags);
+}
+
+#undef PL_MAGIC0
+#undef PL_MAGIC1
+#undef PL_ALIGN
+#undef PL_MAX
+
+#endif // LIBPLACEBO_LIBAV_H_
diff --git a/src/include/libplacebo/utils/upload.h b/src/include/libplacebo/utils/upload.h
new file mode 100644
index 0000000..9e8d436
--- /dev/null
+++ b/src/include/libplacebo/utils/upload.h
@@ -0,0 +1,153 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_UPLOAD_H_
+#define LIBPLACEBO_UPLOAD_H_
+
+#include <stdint.h>
+
+#include <libplacebo/gpu.h>
+#include <libplacebo/renderer.h>
+
+PL_API_BEGIN
+
+// This file contains a utility function to assist in uploading data from host
+// memory to a texture. In particular, the texture will be suitable for use as
+// a `pl_plane`.
+
+// Description of the host representation of an image plane
+struct pl_plane_data {
+ enum pl_fmt_type type; // meaning of the data (must not be UINT or SINT)
+ int width, height; // dimensions of the plane
+ int component_size[4]; // size in bits of each coordinate
+ int component_pad[4]; // ignored bits preceding each component
+ int component_map[4]; // semantic meaning of each component (pixel order)
+ size_t pixel_stride; // offset in bytes between pixels (required)
+ size_t row_stride; // offset in bytes between rows (optional)
+ bool swapped; // pixel data is endian-swapped (non-native)
+
+ // Similar to `pl_tex_transfer_params`, you can either upload from a raw
+ // pointer address, or a buffer + offset. Again, the use of these two
+ // mechanisms is mutually exclusive.
+ //
+ // 1. Uploading from host memory
+ const void *pixels; // the actual data underlying this plane
+
+ // 2. Uploading from a buffer (requires `pl_gpu_limits.buf_transfer`)
+ pl_buf buf; // the buffer to use
+ size_t buf_offset; // offset of data within buffer, must be a
+ // multiple of `pixel_stride` as well as of 4
+
+ // Similar to `pl_tex_transfer_params.callback`, this allows turning the
+ // upload of a plane into an asynchronous upload. The same notes apply.
+ void (*callback)(void *priv);
+ void *priv;
+
+ // Note: When using this together with `pl_frame`, there is some amount of
+ // overlap between `component_pad` and `pl_color_repr.bits`. Some key
+ // differences between the two:
+ //
+ // - the bits from `component_pad` are ignored; whereas the superfluous bits
+ // in a `pl_color_repr` must be 0.
+ // - the `component_pad` exists to align the component size and placement
+ // with the capabilities of GPUs; the `pl_color_repr` exists to control
+ // the semantics of the color samples on a finer granularity.
+ // - the `pl_color_repr` applies to the color sample as a whole, and
+ // therefore applies to all planes; the `component_pad` can be different
+ // for each plane.
+ // - `component_pad` interacts with float textures by moving the actual
+ // float in memory. `pl_color_repr` interacts with float data as if
+ // the float was converted from an integer under full range semantics.
+ //
+ // To help establish the motivating difference, a typical example of a use
+ // case would be yuv420p10. Since 10-bit GPU texture support is limited,
+ // and working with non-byte-aligned pixels is awkward in general, the
+ // convention is to represent yuv420p10 as 16-bit samples with either the
+ // high or low bits set to 0. In this scenario, the `component_size` of the
+ // `pl_plane_data` and `pl_bit_encoding.sample_depth` would be 16, while
+ // the `pl_bit_encoding.color_depth` would be 10 (and additionally, the
+ // `pl_bit_encoding.bit_shift` would be either 0 or 6, depending on
+ // whether the low or the high bits are used).
+ //
+ // On the contrary, something like a packed, 8-bit XBGR format (where the
+ // X bits are ignored and may contain garbage) would set `component_pad[0]`
+ // to 8, and the component_size[0:2] (respectively) to 8 as well.
+ //
+ // As a general rule of thumb, for maximum compatibility, you should try
+ // and align component_size/component_pad to multiples of 8 and explicitly
+ // clear any remaining superfluous bits (+ use `pl_color_repr.bits` to
+ // ensure they're decoded correctly). You should also try to align the
+ // `pixel_stride` to a power of two.
+};
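+
+// As a minimal illustrative sketch of the yuv420p10 case discussed above
+// (assuming hypothetical variables `width`, `height`, `stride` and `y_data`,
+// with the 10 significant bits stored in the low bits of each 16-bit sample):
+//
+//     struct pl_plane_data luma = {
+//         .type           = PL_FMT_UNORM,
+//         .width          = width,
+//         .height         = height,
+//         .component_size = {16},     // samples are stored as 16 bits
+//         .component_map  = {0},      // Y
+//         .pixel_stride   = 2,        // bytes per sample
+//         .row_stride     = stride,   // bytes per row
+//         .pixels         = y_data,
+//     };
+//
+//     struct pl_color_repr repr = {
+//         .bits = {
+//             .sample_depth = 16,
+//             .color_depth  = 10,
+//             .bit_shift    = 0,      // low bits used
+//         },
+//     };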
+
+// Fills in the `component_size`, `component_pad` and `component_map` fields
+// based on the supplied mask for each component (in semantic order, i.e.
+// RGBA). Each element of `mask` must have a contiguous range of set bits.
+PL_API void pl_plane_data_from_mask(struct pl_plane_data *data, uint64_t mask[4]);
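+//
+// For example, `pl_plane_data_from_mask` could be used like this for a
+// hypothetical packed 10:10:10:2 format stored in little-endian 32-bit words
+// (i.e. `pixel_stride = 4`); this is only a sketch:
+//
+//     pl_plane_data_from_mask(&data, (uint64_t[4]) {
+//         0x3FF,          // R: bits  0..9
+//         0xFFC00,        // G: bits 10..19
+//         0x3FF00000,     // B: bits 20..29
+//         0xC0000000,     // A: bits 30..31
+//     });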
+
+// Fills in the `component_size`, `component_pad` and `component_map` fields
+// based on the supplied sizes (in bits) and shift of each component (in
+// semantic order).
+//
+// Similar to `pl_plane_data_from_mask` but not limited to 64-bit pixels.
+PL_API void pl_plane_data_from_comps(struct pl_plane_data *data, int size[4],
+ int shift[4]);
+
+// Helper function to take a `pl_plane_data` struct and try and improve its
+// alignment to make it more likely to correspond to a real `pl_fmt`. It does
+// this by attempting to round each component up to the nearest byte boundary.
+// This relies on the assumption (true in practice) that superfluous bits of
+// byte-misaligned formats are explicitly set to 0.
+//
+// The resulting shift must be consistent across all components, in which case
+// it's returned in `out_bits`. If no alignment was possible, `out_bits` is set
+// to {0}, and this function returns false.
+PL_API bool pl_plane_data_align(struct pl_plane_data *data, struct pl_bit_encoding *out_bits);
+
+// Helper function to find a suitable `pl_fmt` based on a pl_plane_data's
+// requirements. This is called internally by `pl_upload_plane`, but it's
+// exposed to users both as a convenience and so they may pre-emptively check
+// if a format would be supported without actually having to attempt the upload.
+PL_API pl_fmt pl_plane_find_fmt(pl_gpu gpu, int out_map[4], const struct pl_plane_data *data);
+
+// Upload an image plane to a texture, and output the resulting `pl_plane`
+// struct to `out_plane` (optional). `tex` must be a valid pointer to a texture
+// (or NULL), which will be destroyed and reinitialized if it does not already
+// exist or is incompatible. Returns whether successful.
+//
+// The resulting texture is guaranteed to be `sampleable`, and it will also try
+// and maximize compatibility with the other `pl_renderer` requirements
+// (blittable, linear filterable, etc.).
+//
+// Note: `out_plane->shift_x/y` and `out_plane->flipped` are left
+// uninitialized, and should be set explicitly by the user.
+PL_API bool pl_upload_plane(pl_gpu gpu, struct pl_plane *out_plane,
+ pl_tex *tex, const struct pl_plane_data *data);
+
+// Like `pl_upload_plane`, but only creates an uninitialized texture object
+// rather than actually performing an upload. This can be useful to, for
+// example, prepare textures to be used as the target of rendering.
+//
+// The resulting texture is guaranteed to be `renderable`, and it will also try
+// to maximize compatibility with the other `pl_renderer` requirements
+// (blittable, storable, etc.).
+PL_API bool pl_recreate_plane(pl_gpu gpu, struct pl_plane *out_plane,
+ pl_tex *tex, const struct pl_plane_data *data);
+
+PL_API_END
+
+#endif // LIBPLACEBO_UPLOAD_H_
diff --git a/src/include/libplacebo/vulkan.h b/src/include/libplacebo/vulkan.h
new file mode 100644
index 0000000..4e5db95
--- /dev/null
+++ b/src/include/libplacebo/vulkan.h
@@ -0,0 +1,638 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBPLACEBO_VULKAN_H_
+#define LIBPLACEBO_VULKAN_H_
+
+#include <vulkan/vulkan.h>
+#include <libplacebo/gpu.h>
+#include <libplacebo/swapchain.h>
+
+PL_API_BEGIN
+
+#define PL_VK_MIN_VERSION VK_API_VERSION_1_2
+
+// Structure representing a VkInstance. Using this is not required.
+typedef const struct pl_vk_inst_t {
+ VkInstance instance;
+
+ // The Vulkan API version supported by this VkInstance.
+ uint32_t api_version;
+
+ // The associated vkGetInstanceProcAddr pointer.
+ PFN_vkGetInstanceProcAddr get_proc_addr;
+
+ // The instance extensions that were successfully enabled, including
+ // extensions enabled by libplacebo internally. May contain duplicates.
+ const char * const *extensions;
+ int num_extensions;
+
+ // The instance layers that were successfully enabled, including
+ // layers enabled by libplacebo internally. May contain duplicates.
+ const char * const *layers;
+ int num_layers;
+} *pl_vk_inst;
+
+struct pl_vk_inst_params {
+ // If set, enable the debugging and validation layers. These should
+ // generally be lightweight and relatively harmless to enable.
+ bool debug;
+
+ // If set, also enable GPU-assisted verification and best practices
+ // layers. (Note: May cause substantial slowdown and/or result in lots of
+ // false positive spam)
+ bool debug_extra;
+
+ // If nonzero, restricts the Vulkan API version to be at most this. This
+ // is only really useful for explicitly testing backwards compatibility.
+ uint32_t max_api_version;
+
+ // Pointer to a user-provided `vkGetInstanceProcAddr`. If this is NULL,
+ // libplacebo will use the directly linked version (if available).
+ PFN_vkGetInstanceProcAddr get_proc_addr;
+
+ // Enables extra instance extensions. Instance creation will fail if these
+ // extensions are not all supported. The user may use this to enable e.g.
+ // windowing system integration.
+ const char * const *extensions;
+ int num_extensions;
+
+ // Enables extra optional instance extensions. These are opportunistically
+ // enabled if supported by the device, but otherwise skipped.
+ const char * const *opt_extensions;
+ int num_opt_extensions;
+
+ // Enables extra layers. Instance creation will fail if these layers are
+ // not all supported.
+ //
+ // NOTE: Layers needed for required/optional extensions are automatically
+ // enabled. The user does not specifically need to enable layers related
+ // to extension support.
+ const char * const *layers;
+ int num_layers;
+
+ // Enables extra optional layers. These are opportunistically enabled if
+ // supported by the platform, but otherwise skipped.
+ const char * const *opt_layers;
+ int num_opt_layers;
+};
+
+#define pl_vk_inst_params(...) (&(struct pl_vk_inst_params) { __VA_ARGS__ })
+PL_API extern const struct pl_vk_inst_params pl_vk_inst_default_params;
+
+// Helper function to simplify instance creation. The user could also bypass
+// these helpers and do it manually, but this function is provided as a
+// convenience. It also sets up a debug callback which forwards all vulkan
+// messages to the `pl_log` callback.
+PL_API pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params);
+PL_API void pl_vk_inst_destroy(pl_vk_inst *inst);
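+
+// A minimal instance creation sketch (assuming a `pl_log` named `log` has
+// already been created; windowing system extensions would be passed via
+// `.extensions` / `.num_extensions`):
+//
+//     pl_vk_inst inst = pl_vk_inst_create(log, pl_vk_inst_params(
+//         .debug = true,
+//     ));
+//     if (!inst)
+//         abort(); // instance creation failed
+//     // ... create a pl_vulkan, surfaces, etc. ...
+//     pl_vk_inst_destroy(&inst);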
+
+struct pl_vulkan_queue {
+ uint32_t index; // Queue family index
+ uint32_t count; // Queue family count
+};
+
+// Structure representing the actual vulkan device and associated GPU instance
+typedef const struct pl_vulkan_t *pl_vulkan;
+struct pl_vulkan_t {
+ pl_gpu gpu;
+
+ // The vulkan objects in use. The user may use this for their own purposes,
+ // but please note that the lifetime is tied to the lifetime of the
+ // pl_vulkan object, and must not be destroyed by the user. Note that the
+ // created vulkan device may have any number of queues and queue family
+ // assignments; so using it for queue submission commands is ill-advised.
+ VkInstance instance;
+ VkPhysicalDevice phys_device;
+ VkDevice device;
+
+ // The associated vkGetInstanceProcAddr pointer.
+ PFN_vkGetInstanceProcAddr get_proc_addr;
+
+ // The Vulkan API version supported by this VkPhysicalDevice.
+ uint32_t api_version;
+
+ // The device extensions that were successfully enabled, including
+ // extensions enabled by libplacebo internally. May contain duplicates.
+ const char * const *extensions;
+ int num_extensions;
+
+ // The device features that were enabled at device creation time.
+ //
+    // Note: Whenever a feature flag is ambiguous between several alternative
+ // locations, for completeness' sake, we include both.
+ const VkPhysicalDeviceFeatures2 *features;
+
+ // The explicit queue families we are using to provide a given capability.
+ struct pl_vulkan_queue queue_graphics; // provides VK_QUEUE_GRAPHICS_BIT
+ struct pl_vulkan_queue queue_compute; // provides VK_QUEUE_COMPUTE_BIT
+ struct pl_vulkan_queue queue_transfer; // provides VK_QUEUE_TRANSFER_BIT
+
+ // Functions for locking a queue. These must be used to lock VkQueues for
+ // submission or other related operations when sharing the VkDevice between
+    // multiple threads. Using this on queue families or indices not contained
+ // in `queues` is undefined behavior.
+ void (*lock_queue)(pl_vulkan vk, uint32_t qf, uint32_t qidx);
+ void (*unlock_queue)(pl_vulkan vk, uint32_t qf, uint32_t qidx);
+
+ // --- Deprecated fields
+
+ // These are the same active queue families and their queue counts in list
+ // form. This list does not contain duplicates, nor any extra queues
+ // enabled at device creation time. Deprecated in favor of querying
+ // `vkGetPhysicalDeviceQueueFamilyProperties` directly.
+ const struct pl_vulkan_queue *queues PL_DEPRECATED;
+ int num_queues PL_DEPRECATED;
+};
+
+struct pl_vulkan_params {
+ // The vulkan instance. Optional, if NULL then libplacebo will internally
+ // create a VkInstance with the settings from `instance_params`.
+ //
+ // Note: The VkInstance provided by the user *MUST* be created with a
+ // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher.
+ VkInstance instance;
+
+ // Pointer to `vkGetInstanceProcAddr`. If this is NULL, libplacebo will
+ // use the directly linked version (if available).
+ //
+ // Note: This overwrites the same value from `instance_params`.
+ PFN_vkGetInstanceProcAddr get_proc_addr;
+
+ // Configures the settings used for creating an internal vulkan instance.
+ // May be NULL. Ignored if `instance` is set.
+ const struct pl_vk_inst_params *instance_params;
+
+ // When choosing the device, rule out all devices that don't support
+ // presenting to this surface. When creating a device, enable all extensions
+ // needed to ensure we can present to this surface. Optional. Only legal
+ // when specifying an existing VkInstance to use.
+ VkSurfaceKHR surface;
+
+ // --- Physical device selection options
+
+ // The vulkan physical device. May be set by the caller to indicate the
+ // physical device to use. Otherwise, libplacebo will pick the "best"
+ // available GPU, based on the advertised device type. (i.e., it will
+ // prefer discrete GPUs over integrated GPUs). Only legal when specifying
+ // an existing VkInstance to use.
+ VkPhysicalDevice device;
+
+ // When choosing the device, only choose a device with this exact name.
+ // This overrides `allow_software`. No effect if `device` is set. Note: A
+ // list of devices and their names are logged at level PL_LOG_INFO.
+ const char *device_name;
+
+ // When choosing the device, only choose a device with this exact UUID.
+ // This overrides `allow_software` and `device_name`. No effect if `device`
+ // is set.
+ uint8_t device_uuid[16];
+
+ // When choosing the device, controls whether or not to also allow software
+ // GPUs. No effect if `device` or `device_name` are set.
+ bool allow_software;
+
+ // --- Logical device creation options
+
+ // Controls whether or not to allow asynchronous transfers, using transfer
+ // queue families, if supported by the device. This can be significantly
+ // faster and more power efficient, and also allows streaming uploads in
+ // parallel with rendering commands. Enabled by default.
+ bool async_transfer;
+
+ // Controls whether or not to allow asynchronous compute, using dedicated
+ // compute queue families, if supported by the device. On some devices,
+ // these can allow the GPU to schedule compute shaders in parallel with
+ // fragment shaders. Enabled by default.
+ bool async_compute;
+
+ // Limits the number of queues to use. If left as 0, libplacebo will use as
+ // many queues as the device supports. Multiple queues can result in
+ // improved efficiency when submitting multiple commands that can entirely
+ // or partially execute in parallel. Defaults to 1, since using more queues
+ // can actually decrease performance.
+ //
+ // Note: libplacebo will always *create* logical devices with all available
+ // queues for a given QF enabled, regardless of this setting.
+ int queue_count;
+
+ // Bitmask of extra queue families to enable. If set, then *all* queue
+ // families matching *any* of these flags will be enabled at device
+ // creation time. Setting this to VK_QUEUE_FLAG_BITS_MAX_ENUM effectively
+ // enables all queue families supported by the device.
+ VkQueueFlags extra_queues;
+
+ // Enables extra device extensions. Device creation will fail if these
+ // extensions are not all supported. The user may use this to enable e.g.
+ // interop extensions.
+ const char * const *extensions;
+ int num_extensions;
+
+ // Enables extra optional device extensions. These are opportunistically
+ // enabled if supported by the device, but otherwise skipped.
+ const char * const *opt_extensions;
+ int num_opt_extensions;
+
+ // Optional extra features to enable at device creation time. These are
+ // opportunistically enabled if supported by the physical device, but
+ // otherwise kept disabled.
+ const VkPhysicalDeviceFeatures2 *features;
+
+ // --- Misc/debugging options
+
+ // Restrict specific features to e.g. work around driver bugs, or simply
+ // for testing purposes
+ int max_glsl_version; // limit the maximum GLSL version
+ uint32_t max_api_version; // limit the maximum vulkan API version
+};
+
+// Default/recommended parameters. Should generally be safe and efficient.
+#define PL_VULKAN_DEFAULTS \
+ .async_transfer = true, \
+ .async_compute = true, \
+ /* enabling multiple queues often decreases perf */ \
+ .queue_count = 1,
+
+#define pl_vulkan_params(...) (&(struct pl_vulkan_params) { PL_VULKAN_DEFAULTS __VA_ARGS__ })
+PL_API extern const struct pl_vulkan_params pl_vulkan_default_params;
+
+// Creates a new vulkan device based on the given parameters and initializes
+// a new GPU. If `params` is left as NULL, it defaults to
+// &pl_vulkan_default_params.
+//
+// Thread-safety: Safe
+PL_API pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params);
+
+// Destroys the vulkan device and all associated objects, except for the
+// VkInstance provided by the user.
+//
+// Note that all resources allocated from this vulkan object (e.g. via the
+// `vk->ra` or using `pl_vulkan_create_swapchain`) *must* be explicitly
+// destroyed by the user before calling this.
+//
+// Also note that this function will block until all in-flight GPU commands are
+// finished processing. You can avoid this by manually calling `pl_gpu_finish`
+// before `pl_vulkan_destroy`.
+PL_API void pl_vulkan_destroy(pl_vulkan *vk);
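+
+// A minimal device creation sketch (assuming `log`, an `inst` created via
+// `pl_vk_inst_create`, and a user-created VkSurfaceKHR `surface`; the surface
+// is optional):
+//
+//     pl_vulkan vk = pl_vulkan_create(log, pl_vulkan_params(
+//         .instance      = inst->instance,
+//         .get_proc_addr = inst->get_proc_addr,
+//         .surface       = surface,
+//     ));
+//     if (!vk)
+//         abort(); // device creation failed
+//     pl_gpu gpu = vk->gpu;
+//     // ... use `gpu`, create a swapchain, etc. ...
+//     pl_vulkan_destroy(&vk);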
+
+// For a `pl_gpu` backed by `pl_vulkan`, this function can be used to retrieve
+// the underlying `pl_vulkan`. Returns NULL for any other type of `gpu`.
+PL_API pl_vulkan pl_vulkan_get(pl_gpu gpu);
+
+struct pl_vulkan_device_params {
+ // The instance to use. Required!
+ //
+ // Note: The VkInstance provided by the user *must* be created with a
+ // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher.
+ VkInstance instance;
+
+ // Mirrored from `pl_vulkan_params`. All of these fields are optional.
+ PFN_vkGetInstanceProcAddr get_proc_addr;
+ VkSurfaceKHR surface;
+ const char *device_name;
+ uint8_t device_uuid[16];
+ bool allow_software;
+};
+
+#define pl_vulkan_device_params(...) (&(struct pl_vulkan_device_params) { __VA_ARGS__ })
+
+// Helper function to choose the best VkPhysicalDevice, given a VkInstance.
+// This uses the same logic as `pl_vulkan_create` uses internally. If no
+// matching device was found, this returns VK_NULL_HANDLE.
+PL_API VkPhysicalDevice pl_vulkan_choose_device(pl_log log,
+ const struct pl_vulkan_device_params *params);
+
+struct pl_vulkan_swapchain_params {
+ // The surface to use for rendering. Required, the user is in charge of
+ // creating this. Must belong to the same VkInstance as `vk->instance`.
+ VkSurfaceKHR surface;
+
+ // The preferred presentation mode. See the vulkan documentation for more
+ // information about these. If the device/surface combination does not
+ // support this mode, libplacebo will fall back to VK_PRESENT_MODE_FIFO_KHR.
+ //
+ // Warning: Leaving this zero-initialized is the same as having specified
+ // VK_PRESENT_MODE_IMMEDIATE_KHR, which is probably not what the user
+ // wants!
+ VkPresentModeKHR present_mode;
+
+ // Allow up to N in-flight frames. This essentially controls how many
+ // rendering commands may be queued up at the same time. See the
+ // documentation for `pl_swapchain_get_latency` for more information. For
+ // vulkan specifically, we are only able to wait until the GPU has finished
+ // rendering a frame - we are unable to wait until the display has actually
+ // finished displaying it. So this only provides a rough guideline.
+ // Optional, defaults to 3.
+ int swapchain_depth;
+
+ // This suppresses automatic recreation of the swapchain when any call
+ // returns VK_SUBOPTIMAL_KHR. Normally, libplacebo will recreate the
+ // swapchain internally on the next `pl_swapchain_start_frame`. If enabled,
+ // clients are assumed to take care of swapchain recreations themselves, by
+ // calling `pl_swapchain_resize` as appropriate. libplacebo will tolerate
+ // the "suboptimal" status indefinitely.
+ bool allow_suboptimal;
+
+ // Disable high-bit (10 or more) SDR formats. May help work around buggy
+ // drivers which don't dither properly when outputting high bit depth
+ // SDR backbuffers to 8-bit screens.
+ bool disable_10bit_sdr;
+};
+
+#define pl_vulkan_swapchain_params(...) (&(struct pl_vulkan_swapchain_params) { __VA_ARGS__ })
+
+// Creates a new vulkan swapchain based on an existing VkSurfaceKHR. Using this
+// function requires that the vulkan device was created with the
+// VK_KHR_swapchain extension. The easiest way of accomplishing this is to set
+// the `pl_vulkan_params.surface` explicitly at creation time.
+PL_API pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk,
+ const struct pl_vulkan_swapchain_params *params);
+
+// This will return true if the vulkan swapchain is internally detected
+// as being suboptimal (VK_SUBOPTIMAL_KHR). This might be of use to clients
+// who have `params->allow_suboptimal` enabled.
+PL_API bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw);
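+
+// A rough sketch of swapchain creation and the suboptimal check (assuming a
+// `pl_vulkan` named `vk` and a user-created VkSurfaceKHR `surface`):
+//
+//     pl_swapchain sw = pl_vulkan_create_swapchain(vk, pl_vulkan_swapchain_params(
+//         .surface      = surface,
+//         .present_mode = VK_PRESENT_MODE_FIFO_KHR, // avoid the IMMEDIATE default
+//     ));
+//
+//     // later, e.g. once per frame (mainly relevant with `allow_suboptimal`):
+//     if (pl_vulkan_swapchain_suboptimal(sw)) {
+//         // recreate/resize the swapchain, e.g. via `pl_swapchain_resize`
+//     }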
+
+// Vulkan interop API, for sharing a single VkDevice (and associated vulkan
+// resources) directly with the API user. The use of this API is a bit sketchy
+// and requires careful communication of Vulkan API state.
+
+struct pl_vulkan_import_params {
+ // The vulkan instance. Required.
+ //
+ // Note: The VkInstance provided by the user *must* be created with a
+ // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher.
+ VkInstance instance;
+
+ // Pointer to `vkGetInstanceProcAddr`. If this is NULL, libplacebo will
+ // use the directly linked version (if available).
+ PFN_vkGetInstanceProcAddr get_proc_addr;
+
+ // The physical device selected by the user. Required.
+ VkPhysicalDevice phys_device;
+
+ // The logical device created by the user. Required.
+ VkDevice device;
+
+ // --- Logical device parameters
+
+ // List of all device-level extensions that were enabled. (Instance-level
+ // extensions need not be re-specified here, since it's guaranteed that any
+ // instance-level extensions that device-level extensions depend on were
+ // enabled at the instance level)
+ const char * const *extensions;
+ int num_extensions;
+
+ // Enabled queue families. At least `queue_graphics` is required.
+ //
+ // It's okay for multiple queue families to be specified with the same
+ // index, e.g. in the event that a dedicated compute queue also happens to
+ // be the dedicated transfer queue.
+ //
+ // It's also okay to leave the queue struct as {0} in the event that no
+ // dedicated queue exists for a given operation type. libplacebo will
+ // automatically fall back to using e.g. the graphics queue instead.
+ struct pl_vulkan_queue queue_graphics; // must support VK_QUEUE_GRAPHICS_BIT
+ struct pl_vulkan_queue queue_compute; // must support VK_QUEUE_COMPUTE_BIT
+ struct pl_vulkan_queue queue_transfer; // must support VK_QUEUE_TRANSFER_BIT
+
+ // Enabled VkPhysicalDeviceFeatures. The device *must* be created with
+ // all of the features in `pl_vulkan_required_features` enabled.
+ const VkPhysicalDeviceFeatures2 *features;
+
+ // Functions for locking a queue. If set, these will be used instead of
+ // libplacebo's internal functions for `pl_vulkan.(un)lock_queue`.
+ void (*lock_queue)(void *ctx, uint32_t qf, uint32_t qidx);
+ void (*unlock_queue)(void *ctx, uint32_t qf, uint32_t qidx);
+ void *queue_ctx;
+
+ // --- Misc/debugging options
+
+ // Restrict specific features to e.g. work around driver bugs, or simply
+ // for testing purposes. See `pl_vulkan_params` for a description of these.
+ int max_glsl_version;
+ uint32_t max_api_version;
+};
+
+#define pl_vulkan_import_params(...) (&(struct pl_vulkan_import_params) { __VA_ARGS__ })
+
+// For purely informative reasons, this contains a list of extensions and
+// device features that libplacebo *can* make use of. These are all strictly
+// optional, but provide a hint to the API user as to what might be worth
+// enabling at device creation time.
+//
+// Note: This also includes physical device features provided by extensions.
+// They are all provided using extension-specific features structs, rather
+// than the more general purpose VkPhysicalDeviceVulkan11Features etc.
+PL_API extern const char * const pl_vulkan_recommended_extensions[];
+PL_API extern const int pl_vulkan_num_recommended_extensions;
+PL_API extern const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features;
+
+// A list of device features that are required by libplacebo. These
+// *must* be provided by imported Vulkan devices.
+//
+// Note: `pl_vulkan_recommended_features` does not include this list.
+PL_API extern const VkPhysicalDeviceFeatures2 pl_vulkan_required_features;
+
+// Import an existing VkDevice instead of creating a new one, and wrap it into
+// a `pl_vulkan` abstraction. It's safe to `pl_vulkan_destroy` this, which will
+// destroy application state related to libplacebo but leave the underlying
+// VkDevice intact.
+PL_API pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params);
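+
+// A heavily abridged import sketch; all of the values shown (`instance`,
+// `phys_dev`, `dev`, `exts`, `num_exts`, `graphics_qf`, `features`) are
+// assumed to come from the user's own Vulkan initialization code, and
+// `features` must include everything in `pl_vulkan_required_features`:
+//
+//     pl_vulkan vk = pl_vulkan_import(log, pl_vulkan_import_params(
+//         .instance       = instance,
+//         .phys_device    = phys_dev,
+//         .device         = dev,
+//         .extensions     = exts,
+//         .num_extensions = num_exts,
+//         .queue_graphics = { .index = graphics_qf, .count = 1 },
+//         .features       = &features,
+//     ));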
+
+struct pl_vulkan_wrap_params {
+ // The image itself. It *must* be usable concurrently by all of the queue
+ // family indices listed in `pl_vulkan->queues`. Note that this requires
+ // the use of VK_SHARING_MODE_CONCURRENT if `pl_vulkan->num_queues` is
+ // greater than 1. If this is difficult to achieve for the user, then
+ // `async_transfer` / `async_compute` should be turned off, which
+ // guarantees the use of only one queue family.
+ VkImage image;
+
+ // Which aspect of `image` to wrap. Only useful for wrapping individual
+ // sub-planes of planar images. If left as 0, it defaults to the entire
+ // image (i.e. the union of VK_IMAGE_ASPECT_PLANE_N_BIT for planar formats,
+ // and VK_IMAGE_ASPECT_COLOR_BIT otherwise).
+ VkImageAspectFlags aspect;
+
+ // The image's dimensions (unused dimensions must be 0)
+ int width;
+ int height;
+ int depth;
+
+ // The image's format. libplacebo will try to map this to an equivalent
+ // pl_fmt. If no compatible pl_fmt is found, wrapping will fail.
+ VkFormat format;
+
+ // The usage flags the image was created with. libplacebo will set the
+ // pl_tex capabilities to include whatever it can, as determined by the set
+ // of enabled usage flags.
+ VkImageUsageFlags usage;
+
+ // See `pl_tex_params`
+ void *user_data;
+ pl_debug_tag debug_tag;
+};
+
+#define pl_vulkan_wrap_params(...) (&(struct pl_vulkan_wrap_params) { \
+ .debug_tag = PL_DEBUG_TAG, \
+ __VA_ARGS__ \
+ })
+
+// Wraps an external VkImage into a pl_tex abstraction. By default, the image
+// is considered "held" by the user and must be released before calling any
+// pl_tex_* API calls on it (see `pl_vulkan_release`).
+//
+// This wrapper can be destroyed by simply calling `pl_tex_destroy` on it,
+// which will not destroy the underlying VkImage. If a pl_tex wrapper is
+// destroyed while an image is not currently being held by the user, that
+// image is left in an undefined state.
+//
+// Wrapping the same VkImage multiple times is undefined behavior, as is trying
+// to wrap an image belonging to a different VkDevice than the one in use by
+// `gpu`.
+//
+// This function may fail, in which case it returns NULL.
+PL_API pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params);
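+
+// A rough wrapping sketch (assuming `image` is an externally created VkImage
+// of the given size, format and usage flags):
+//
+//     pl_tex tex = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+//         .image  = image,
+//         .width  = width,
+//         .height = height,
+//         .format = VK_FORMAT_R8G8B8A8_UNORM,
+//         .usage  = VK_IMAGE_USAGE_SAMPLED_BIT |
+//                   VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+//     ));
+//     if (!tex)
+//         abort(); // no compatible pl_fmt found
+//     // release it to libplacebo (see `pl_vulkan_release_ex`) before using
+//     // any pl_tex_* functions, and `pl_tex_destroy` the wrapper when done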
+
+// Analogous to `pl_vulkan_wrap`, this function takes any `pl_tex` (including
+// ones created by `pl_tex_create`) and unwraps it to expose the underlying
+// VkImage to the user. Unlike `pl_vulkan_wrap`, this `pl_tex` is *not*
+// considered held after calling this function - the user must explicitly
+// `pl_vulkan_hold` before accessing the VkImage.
+//
+// `out_format` and `out_flags` will be updated to hold the VkImage's
+// format and usage flags. (Optional)
+PL_API VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex,
+ VkFormat *out_format, VkImageUsageFlags *out_flags);
+
+// Represents a vulkan semaphore/value pair (for compatibility with timeline
+// semaphores). When using normal, binary semaphores, `value` may be ignored.
+typedef struct pl_vulkan_sem {
+ VkSemaphore sem;
+ uint64_t value;
+} pl_vulkan_sem;
+
+struct pl_vulkan_hold_params {
+ // The Vulkan image to hold. It will be marked as held. Attempting to
+ // perform any pl_tex_* operation (except pl_tex_destroy) on a held image
+ // is undefined behavior.
+ pl_tex tex;
+
+ // The layout to transition the image to when holding. Alternatively, a
+ // pointer to receive the current image layout. If `out_layout` is
+ // provided, `layout` is ignored.
+ VkImageLayout layout;
+ VkImageLayout *out_layout;
+
+ // The queue family index to transition the image to. This can be used with
+ // VK_QUEUE_FAMILY_EXTERNAL to transition the image to an external API. As
+ // a special case, if set to VK_QUEUE_FAMILY_IGNORED, libplacebo will not
+ // transition the image, even if this image was not set up for concurrent
+ // usage. Ignored for concurrent images.
+ uint32_t qf;
+
+ // The semaphore to fire when the image is available for use. (Required)
+ pl_vulkan_sem semaphore;
+};
+
+#define pl_vulkan_hold_params(...) (&(struct pl_vulkan_hold_params) { __VA_ARGS__ })
+
+// "Hold" a shared image, transferring control over the image to the user.
+// Returns whether successful.
+PL_API bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params);
+
+struct pl_vulkan_release_params {
+ // The image to be released. It must be marked as "held". Performing any
+ // operation on the VkImage underlying this `pl_tex` while it is not being
+ // held by the user is undefined behavior.
+ pl_tex tex;
+
+ // The current layout of the image at the point in time when `semaphore`
+ // fires, or if no semaphore is specified, at the time of call.
+ VkImageLayout layout;
+
+ // The queue family index to transition the image to. This can be used with
+    // VK_QUEUE_FAMILY_EXTERNAL to transition the image from an external API. As
+ // a special case, if set to VK_QUEUE_FAMILY_IGNORED, libplacebo will not
+ // transition the image, even if this image was not set up for concurrent
+ // usage. Ignored for concurrent images.
+ uint32_t qf;
+
+ // The semaphore to wait on before libplacebo will actually use or modify
+ // the image. (Optional)
+ //
+ // Note: the lifetime of `semaphore` is indeterminate, and destroying it
+ // while the texture is still depending on that semaphore is undefined
+ // behavior.
+ //
+ // Technically, the only way to be sure that it's safe to free is to use
+ // `pl_gpu_finish()` or similar (e.g. `pl_vulkan_destroy` or
+ // `vkDeviceWaitIdle`) after another operation involving `tex` has been
+ // emitted (or the texture has been destroyed).
+    //
+ // Warning: If `tex` is a planar image (`pl_fmt.num_planes > 0`), and
+ // `semaphore` is specified, it *must* be a timeline semaphore! Failure to
+ // respect this will result in undefined behavior. This warning does not
+ // apply to individual planes (as exposed by `pl_tex.planes`).
+ pl_vulkan_sem semaphore;
+};
+
+#define pl_vulkan_release_params(...) (&(struct pl_vulkan_release_params) { __VA_ARGS__ })
+
+// "Release" a shared image, transferring control to libplacebo.
+PL_API void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params);
+
+struct pl_vulkan_sem_params {
+ // The type of semaphore to create.
+ VkSemaphoreType type;
+
+ // For VK_SEMAPHORE_TYPE_TIMELINE, sets the initial timeline value.
+ uint64_t initial_value;
+
+ // If set, exports this VkSemaphore to the handle given in `out_handle`.
+ // The user takes over ownership, and should manually close it before
+ // destroying this VkSemaphore (via `pl_vulkan_sem_destroy`).
+ enum pl_handle_type export_handle;
+ union pl_handle *out_handle;
+
+ // Optional debug tag to identify this semaphore.
+ pl_debug_tag debug_tag;
+};
+
+#define pl_vulkan_sem_params(...) (&(struct pl_vulkan_sem_params) { \
+ .debug_tag = PL_DEBUG_TAG, \
+ __VA_ARGS__ \
+ })
+
+// Helper functions to create and destroy vulkan semaphores. Returns
+// VK_NULL_HANDLE on failure.
+PL_API VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params);
+PL_API void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore);
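+
+// A rough hold/release round trip using a timeline semaphore; the layouts,
+// queue family and timeline values here are purely illustrative:
+//
+//     VkSemaphore sem = pl_vulkan_sem_create(gpu, pl_vulkan_sem_params(
+//         .type = VK_SEMAPHORE_TYPE_TIMELINE,
+//     ));
+//
+//     // Take control of the image away from libplacebo:
+//     pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+//         .tex       = tex,
+//         .layout    = VK_IMAGE_LAYOUT_GENERAL,
+//         .qf        = VK_QUEUE_FAMILY_IGNORED,
+//         .semaphore = (pl_vulkan_sem) { .sem = sem, .value = 1 },
+//     ));
+//     // ... wait for `sem` to reach 1, use the VkImage externally,
+//     //     then signal `sem` with the value 2 ...
+//
+//     // Hand control back to libplacebo:
+//     pl_vulkan_release_ex(gpu, pl_vulkan_release_params(
+//         .tex       = tex,
+//         .layout    = VK_IMAGE_LAYOUT_GENERAL,
+//         .qf        = VK_QUEUE_FAMILY_IGNORED,
+//         .semaphore = (pl_vulkan_sem) { .sem = sem, .value = 2 },
+//     ));
+//     // only destroy `sem` once it is guaranteed to no longer be in use
+//     // (see the note on `pl_vulkan_release_params.semaphore`)
+//     pl_vulkan_sem_destroy(gpu, &sem);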
+
+// Backwards-compatibility wrappers for older versions of the API.
+PL_DEPRECATED PL_API bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_out);
+PL_DEPRECATED PL_API bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex, VkImageLayout *out_layout,
+ pl_vulkan_sem sem_out);
+PL_DEPRECATED PL_API void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_in);
+
+PL_API_END
+
+#endif // LIBPLACEBO_VULKAN_H_
diff --git a/src/log.c b/src/log.c
new file mode 100644
index 0000000..0829dd3
--- /dev/null
+++ b/src/log.c
@@ -0,0 +1,471 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+
+#include "common.h"
+#include "log.h"
+#include "pl_thread.h"
+
+struct priv {
+ pl_mutex lock;
+ enum pl_log_level log_level_cap;
+ pl_str logbuffer;
+};
+
+pl_log pl_log_create(int api_ver, const struct pl_log_params *params)
+{
+ (void) api_ver;
+ struct pl_log_t *log = pl_zalloc_obj(NULL, log, struct priv);
+ struct priv *p = PL_PRIV(log);
+ log->params = *PL_DEF(params, &pl_log_default_params);
+ pl_mutex_init(&p->lock);
+ pl_info(log, "Initialized libplacebo %s (API v%d)", PL_VERSION, PL_API_VER);
+ return log;
+}
+
+const struct pl_log_params pl_log_default_params = {0};
+
+void pl_log_destroy(pl_log *plog)
+{
+ pl_log log = *plog;
+ if (!log)
+ return;
+
+ struct priv *p = PL_PRIV(log);
+ pl_mutex_destroy(&p->lock);
+ pl_free((void *) log);
+ *plog = NULL;
+}
+
+struct pl_log_params pl_log_update(pl_log ptr, const struct pl_log_params *params)
+{
+ struct pl_log_t *log = (struct pl_log_t *) ptr;
+ if (!log)
+ return pl_log_default_params;
+
+ struct priv *p = PL_PRIV(log);
+ pl_mutex_lock(&p->lock);
+ struct pl_log_params prev_params = log->params;
+ log->params = *PL_DEF(params, &pl_log_default_params);
+ pl_mutex_unlock(&p->lock);
+
+ return prev_params;
+}
+
+enum pl_log_level pl_log_level_update(pl_log ptr, enum pl_log_level level)
+{
+ struct pl_log_t *log = (struct pl_log_t *) ptr;
+ if (!log)
+ return PL_LOG_NONE;
+
+ struct priv *p = PL_PRIV(log);
+ pl_mutex_lock(&p->lock);
+ enum pl_log_level prev_level = log->params.log_level;
+ log->params.log_level = level;
+ pl_mutex_unlock(&p->lock);
+
+ return prev_level;
+}
+
+void pl_log_level_cap(pl_log log, enum pl_log_level cap)
+{
+ if (!log)
+ return;
+
+ struct priv *p = PL_PRIV(log);
+ pl_mutex_lock(&p->lock);
+ p->log_level_cap = cap;
+ pl_mutex_unlock(&p->lock);
+}
+
+static FILE *default_stream(void *stream, enum pl_log_level level)
+{
+ return PL_DEF(stream, level <= PL_LOG_WARN ? stderr : stdout);
+}
+
+void pl_log_simple(void *stream, enum pl_log_level level, const char *msg)
+{
+ static const char *prefix[] = {
+ [PL_LOG_FATAL] = "fatal",
+ [PL_LOG_ERR] = "error",
+ [PL_LOG_WARN] = "warn",
+ [PL_LOG_INFO] = "info",
+ [PL_LOG_DEBUG] = "debug",
+ [PL_LOG_TRACE] = "trace",
+ };
+
+ FILE *h = default_stream(stream, level);
+ fprintf(h, "%5s: %s\n", prefix[level], msg);
+ if (level <= PL_LOG_WARN)
+ fflush(h);
+}
+
+void pl_log_color(void *stream, enum pl_log_level level, const char *msg)
+{
+ static const char *color[] = {
+ [PL_LOG_FATAL] = "31;1", // bright red
+ [PL_LOG_ERR] = "31", // red
+ [PL_LOG_WARN] = "33", // yellow/orange
+ [PL_LOG_INFO] = "32", // green
+ [PL_LOG_DEBUG] = "34", // blue
+ [PL_LOG_TRACE] = "30;1", // bright black
+ };
+
+ FILE *h = default_stream(stream, level);
+ fprintf(h, "\033[%sm%s\033[0m\n", color[level], msg);
+ if (level <= PL_LOG_WARN)
+ fflush(h);
+}
+
+static void pl_msg_va(pl_log log, enum pl_log_level lev,
+ const char *fmt, va_list va)
+{
+ // Test log message without taking the lock, to avoid thrashing the
+ // lock for thousands of trace messages unless those are actually
+ // enabled. This may be a false negative, in which case log messages may
+ // be lost as a result. But this shouldn't be a big deal, since any
+ // situation leading to lost log messages would itself be a race condition.
+ if (!pl_msg_test(log, lev))
+ return;
+
+ // Re-test the log message level with held lock to avoid false positives,
+ // which would be a considerably bigger deal than false negatives
+ struct priv *p = PL_PRIV(log);
+ pl_mutex_lock(&p->lock);
+
+ // Apply this cap before re-testing the log level, to avoid giving users
+ // messages that should have been dropped by the log level.
+ lev = PL_MAX(lev, p->log_level_cap);
+ if (!pl_msg_test(log, lev))
+ goto done;
+
+ p->logbuffer.len = 0;
+ pl_str_append_vasprintf((void *) log, &p->logbuffer, fmt, va);
+ log->params.log_cb(log->params.log_priv, lev, (char *) p->logbuffer.buf);
+
+done:
+ pl_mutex_unlock(&p->lock);
+}
+
+void pl_msg(pl_log log, enum pl_log_level lev, const char *fmt, ...)
+{
+ va_list va;
+ va_start(va, fmt);
+ pl_msg_va(log, lev, fmt, va);
+ va_end(va);
+}
+
+void pl_msg_source(pl_log log, enum pl_log_level lev, const char *src)
+{
+ if (!pl_msg_test(log, lev) || !src)
+ return;
+
+ int line = 1;
+ while (*src) {
+ const char *end = strchr(src, '\n');
+ if (!end) {
+ pl_msg(log, lev, "[%3d] %s", line, src);
+ break;
+ }
+
+ pl_msg(log, lev, "[%3d] %.*s", line, (int)(end - src), src);
+ src = end + 1;
+ line++;
+ }
+}
+
+#ifdef PL_HAVE_DBGHELP
+
+#include <windows.h>
+#include <dbghelp.h>
+#include <shlwapi.h>
+
+// https://github.com/llvm/llvm-project/blob/f03cd763384bbb67ddfa12957859ed58841d4b34/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h#L85-L106
+static inline uintptr_t get_prev_inst_pc(uintptr_t pc) {
+#if defined(__arm__)
+ // T32 (Thumb) branch instructions might be 16 or 32 bit long,
+ // so we return (pc-2) in that case in order to be safe.
+ // For A32 mode we return (pc-4) because all instructions are 32 bit long.
+ return (pc - 3) & (~1);
+#elif defined(__x86_64__) || defined(__i386__)
+ return pc - 1;
+#else
+ return pc - 4;
+#endif
+}
+
+static DWORD64 get_preferred_base(const char *module)
+{
+ DWORD64 image_base = 0;
+ HANDLE file_mapping = NULL;
+ HANDLE file_view = NULL;
+
+ HANDLE file = CreateFile(module, GENERIC_READ, FILE_SHARE_READ, NULL,
+ OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
+ if (file == INVALID_HANDLE_VALUE)
+ goto done;
+
+ file_mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
+ if (file_mapping == NULL)
+ goto done;
+
+ file_view = MapViewOfFile(file_mapping, FILE_MAP_READ, 0, 0, 0);
+ if (file_view == NULL)
+ goto done;
+
+ PIMAGE_DOS_HEADER dos_header = (PIMAGE_DOS_HEADER) file_view;
+ if (dos_header->e_magic != IMAGE_DOS_SIGNATURE)
+ goto done;
+
+ PIMAGE_NT_HEADERS pe_header = (PIMAGE_NT_HEADERS) ((char *) file_view +
+ dos_header->e_lfanew);
+ if (pe_header->Signature != IMAGE_NT_SIGNATURE)
+ goto done;
+
+ if (pe_header->FileHeader.SizeOfOptionalHeader != sizeof(pe_header->OptionalHeader))
+ goto done;
+
+ image_base = pe_header->OptionalHeader.ImageBase;
+
+done:
+ if (file_view)
+ UnmapViewOfFile(file_view);
+ if (file_mapping)
+ CloseHandle(file_mapping);
+ if (file != INVALID_HANDLE_VALUE)
+ CloseHandle(file);
+
+ return image_base;
+}
+
+void pl_log_stack_trace(pl_log log, enum pl_log_level lev)
+{
+ if (!pl_msg_test(log, lev))
+ return;
+
+ void *tmp = pl_tmp(NULL);
+ PL_ARRAY(void *) frames = {0};
+
+ size_t capacity = 16;
+ do {
+ capacity *= 2;
+ PL_ARRAY_RESIZE(tmp, frames, capacity);
+ // Skip first frame, we don't care about this function
+ frames.num = CaptureStackBackTrace(1, capacity, frames.elem, NULL);
+ } while (capacity == frames.num);
+
+ if (!frames.num) {
+ pl_free(tmp);
+ return;
+ }
+
+    // Load dbghelp on demand. While it is available on all Windows versions,
+    // there is no need to keep it loaded all the time, since the stack trace
+    // printing function should, in theory, be used only rarely.
+ HANDLE process = GetCurrentProcess();
+ HMODULE dbghelp = LoadLibrary("dbghelp.dll");
+ DWORD options;
+ SYMBOL_INFO *symbol = NULL;
+ BOOL use_dbghelp = !!dbghelp;
+
+#define DBGHELP_SYM(sym) \
+ __typeof__(&sym) p##sym = (__typeof__(&sym))(void *) GetProcAddress(dbghelp, #sym); \
+ use_dbghelp &= !!p##sym
+
+ DBGHELP_SYM(SymCleanup);
+ DBGHELP_SYM(SymFromAddr);
+ DBGHELP_SYM(SymGetLineFromAddr64);
+ DBGHELP_SYM(SymGetModuleInfo64);
+ DBGHELP_SYM(SymGetOptions);
+ DBGHELP_SYM(SymGetSearchPathW);
+ DBGHELP_SYM(SymInitialize);
+ DBGHELP_SYM(SymSetOptions);
+ DBGHELP_SYM(SymSetSearchPathW);
+
+#undef DBGHELP_SYM
+
+ struct priv *p = PL_PRIV(log);
+ PL_ARRAY(wchar_t) base_search = { .num = 1024 };
+
+ if (use_dbghelp) {
+        // DbgHelp is not thread-safe. Note that on Windows the mutex is
+        // recursive, so there is no need to unlock before calling pl_msg.
+ pl_mutex_lock(&p->lock);
+
+ options = pSymGetOptions();
+ pSymSetOptions(SYMOPT_UNDNAME | SYMOPT_DEFERRED_LOADS |
+ SYMOPT_LOAD_LINES | SYMOPT_FAVOR_COMPRESSED);
+ use_dbghelp &= pSymInitialize(process, NULL, TRUE);
+
+ if (use_dbghelp) {
+ symbol = pl_alloc(tmp, sizeof(SYMBOL_INFO) + 512);
+ symbol->SizeOfStruct = sizeof(SYMBOL_INFO);
+ symbol->MaxNameLen = 512;
+
+ PL_ARRAY_RESIZE(tmp, base_search, base_search.num);
+ BOOL ret = pSymGetSearchPathW(process, base_search.elem,
+ base_search.num);
+ base_search.num = ret ? wcslen(base_search.elem) : 0;
+ PL_ARRAY_APPEND(tmp, base_search, L'\0');
+ } else {
+ pSymSetOptions(options);
+ pl_mutex_unlock(&p->lock);
+ }
+ }
+
+ pl_msg(log, lev, " Backtrace:");
+ for (int n = 0; n < frames.num; n++) {
+ uintptr_t pc = get_prev_inst_pc((uintptr_t) frames.elem[n]);
+ pl_str out = {0};
+ pl_str_append_asprintf(tmp, &out, " #%-2d 0x%"PRIxPTR, n, pc);
+
+ MEMORY_BASIC_INFORMATION meminfo = {0};
+ char module_path[MAX_PATH] = {0};
+ if (VirtualQuery((LPCVOID) pc, &meminfo, sizeof(meminfo))) {
+ DWORD sz = GetModuleFileNameA(meminfo.AllocationBase, module_path,
+ sizeof(module_path));
+ if (sz == sizeof(module_path))
+ pl_msg(log, PL_LOG_ERR, "module path truncated");
+
+ if (use_dbghelp) {
+            // According to the documentation it should search in "The directory
+            // that contains the corresponding module", but that doesn't appear
+            // to work, so manually set the search path to the module's directory.
+ // https://learn.microsoft.com/windows/win32/debug/symbol-paths
+ PL_ARRAY(wchar_t) mod_search = { .num = MAX_PATH };
+ PL_ARRAY_RESIZE(tmp, mod_search, mod_search.num);
+
+ sz = GetModuleFileNameW(meminfo.AllocationBase,
+ mod_search.elem, mod_search.num);
+
+ if (sz > 0 && sz != MAX_PATH &&
+ // TODO: Replace with PathCchRemoveFileSpec once mingw-w64
+ // >= 8.0.1 is commonly available, at the time of writing
+ // there are a few high profile Linux distributions that ship
+ // 8.0.0.
+ PathRemoveFileSpecW(mod_search.elem))
+ {
+ mod_search.num = wcslen(mod_search.elem);
+ PL_ARRAY_APPEND(tmp, mod_search, L';');
+ PL_ARRAY_CONCAT(tmp, mod_search, base_search);
+ pSymSetSearchPathW(process, mod_search.elem);
+ }
+ }
+ }
+
+ DWORD64 sym_displacement;
+ if (use_dbghelp && pSymFromAddr(process, pc, &sym_displacement, symbol))
+ pl_str_append_asprintf(tmp, &out, " in %s+0x%llx",
+ symbol->Name, sym_displacement);
+
+ DWORD line_displacement;
+ IMAGEHLP_LINE64 line = {sizeof(line)};
+ if (use_dbghelp &&
+ pSymGetLineFromAddr64(process, pc, &line_displacement, &line))
+ {
+ pl_str_append_asprintf(tmp, &out, " %s:%lu+0x%lx", line.FileName,
+ line.LineNumber, line_displacement);
+ goto done;
+ }
+
+        // LLVM tools by convention use absolute addresses relative to the
+        // "preferred" image base. We need to read that base from the binary,
+        // because due to ASLR we are not actually loaded at it. Windows tools
+        // like WinDbg, on the other hand, expect an offset relative to the
+        // image base. To be easily usable with both worlds, print both values.
+ DWORD64 module_base = get_preferred_base(module_path);
+ pl_str_append_asprintf(tmp, &out, " (%s+0x%"PRIxPTR") (0x%llx)", module_path,
+ pc - (uintptr_t) meminfo.AllocationBase,
+ module_base + (pc - (uintptr_t) meminfo.AllocationBase));
+
+done:
+ pl_msg(log, lev, "%s", out.buf);
+ }
+
+ if (use_dbghelp) {
+ pSymSetOptions(options);
+ pSymCleanup(process);
+ pl_mutex_unlock(&p->lock);
+ }
+ // Unload dbghelp. Maybe it is better to keep it loaded?
+ if (dbghelp)
+ FreeLibrary(dbghelp);
+ pl_free(tmp);
+}
+
+#elif defined(PL_HAVE_UNWIND)
+#define UNW_LOCAL_ONLY
+#include <libunwind.h>
+#include <dlfcn.h>
+
+void pl_log_stack_trace(pl_log log, enum pl_log_level lev)
+{
+ if (!pl_msg_test(log, lev))
+ return;
+
+ unw_cursor_t cursor;
+ unw_context_t uc;
+ unw_word_t ip, off;
+ unw_getcontext(&uc);
+ unw_init_local(&cursor, &uc);
+
+ int depth = 0;
+ pl_msg(log, lev, " Backtrace:");
+ while (unw_step(&cursor) > 0) {
+ char symbol[256] = "<unknown>";
+ Dl_info info = {
+ .dli_fname = "<unknown>",
+ };
+
+ unw_get_reg(&cursor, UNW_REG_IP, &ip);
+ unw_get_proc_name(&cursor, symbol, sizeof(symbol), &off);
+ dladdr((void *) (uintptr_t) ip, &info);
+ pl_msg(log, lev, " #%-2d 0x%016" PRIxPTR " in %s+0x%" PRIxPTR" at %s+0x%" PRIxPTR,
+ depth++, ip, symbol, off, info.dli_fname, ip - (uintptr_t) info.dli_fbase);
+ }
+}
+
+#elif defined(PL_HAVE_EXECINFO)
+#include <execinfo.h>
+
+void pl_log_stack_trace(pl_log log, enum pl_log_level lev)
+{
+ if (!pl_msg_test(log, lev))
+ return;
+
+ PL_ARRAY(void *) buf = {0};
+ size_t buf_avail = 16;
+ do {
+ buf_avail *= 2;
+ PL_ARRAY_RESIZE(NULL, buf, buf_avail);
+ buf.num = backtrace(buf.elem, buf_avail);
+ } while (buf.num == buf_avail);
+
+ pl_msg(log, lev, " Backtrace:");
+ char **strings = backtrace_symbols(buf.elem, buf.num);
+ for (int i = 1; i < buf.num; i++)
+ pl_msg(log, lev, " #%-2d %s", i - 1, strings[i]);
+
+ free(strings);
+ pl_free(buf.elem);
+}
+
+#else
+void pl_log_stack_trace(pl_log log, enum pl_log_level lev) { }
+#endif
diff --git a/src/log.h b/src/log.h
new file mode 100644
index 0000000..dcf8d28
--- /dev/null
+++ b/src/log.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdarg.h>
+
+#include "common.h"
+
+#include <libplacebo/log.h>
+
+// Internal logging-related functions
+
+// Warning: Not entirely thread-safe. Exercise caution when using. May result
+// in either false positives or false negatives. Make sure to re-run this
+// function while `lock` is held, to ensure no race conditions on the check.
+static inline bool pl_msg_test(pl_log log, enum pl_log_level lev)
+{
+ return log && log->params.log_cb && log->params.log_level >= lev;
+}
+
+void pl_msg(pl_log log, enum pl_log_level lev, const char *fmt, ...)
+ PL_PRINTF(3, 4);
+
+// Convenience macros
+#define pl_fatal(log, ...) pl_msg(log, PL_LOG_FATAL, __VA_ARGS__)
+#define pl_err(log, ...) pl_msg(log, PL_LOG_ERR, __VA_ARGS__)
+#define pl_warn(log, ...) pl_msg(log, PL_LOG_WARN, __VA_ARGS__)
+#define pl_info(log, ...) pl_msg(log, PL_LOG_INFO, __VA_ARGS__)
+#define pl_debug(log, ...) pl_msg(log, PL_LOG_DEBUG, __VA_ARGS__)
+#define pl_trace(log, ...) pl_msg(log, PL_LOG_TRACE, __VA_ARGS__)
+
+#define PL_MSG(obj, lev, ...) pl_msg((obj)->log, lev, __VA_ARGS__)
+
+#define PL_FATAL(obj, ...) PL_MSG(obj, PL_LOG_FATAL, __VA_ARGS__)
+#define PL_ERR(obj, ...) PL_MSG(obj, PL_LOG_ERR, __VA_ARGS__)
+#define PL_WARN(obj, ...) PL_MSG(obj, PL_LOG_WARN, __VA_ARGS__)
+#define PL_INFO(obj, ...) PL_MSG(obj, PL_LOG_INFO, __VA_ARGS__)
+#define PL_DEBUG(obj, ...) PL_MSG(obj, PL_LOG_DEBUG, __VA_ARGS__)
+#define PL_TRACE(obj, ...) PL_MSG(obj, PL_LOG_TRACE, __VA_ARGS__)
+
+// Log something with line numbers included
+void pl_msg_source(pl_log log, enum pl_log_level lev, const char *src);
+
+// Temporarily cap the log level to a certain verbosity. This is intended for
+// things like probing formats, attempting to create buffers that may fail, and
+// other types of operations in which we want to suppress errors. Call with
+// PL_LOG_NONE to disable this cap.
+//
+// Warning: This is generally not thread-safe, and only provided as a temporary
+// hack until a better solution can be thought of.
+void pl_log_level_cap(pl_log log, enum pl_log_level cap);
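+//
+// A short sketch of the intended `pl_log_level_cap` pattern (`probe_something`
+// is a hypothetical operation whose failures we want demoted to debug
+// messages):
+//
+//     pl_log_level_cap(log, PL_LOG_DEBUG);
+//     bool ok = probe_something(gpu);
+//     pl_log_level_cap(log, PL_LOG_NONE);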
+
+// CPU execution time reporting helper
+static inline void pl_log_cpu_time(pl_log log, pl_clock_t start, pl_clock_t stop,
+ const char *operation)
+{
+ double ms = pl_clock_diff(stop, start) * 1e3;
+ enum pl_log_level lev = PL_LOG_DEBUG;
+ if (ms > 10)
+ lev = PL_LOG_INFO;
+ if (ms > 1000)
+ lev = PL_LOG_WARN;
+
+ pl_msg(log, lev, "Spent %.3f ms %s%s", ms, operation,
+ ms > 100 ? " (slow!)" : "");
+}
+
+// Log stack trace
+PL_NOINLINE void pl_log_stack_trace(pl_log log, enum pl_log_level lev);
diff --git a/src/meson.build b/src/meson.build
new file mode 100644
index 0000000..63f9d53
--- /dev/null
+++ b/src/meson.build
@@ -0,0 +1,347 @@
+### Common dependencies
+unwind = dependency('libunwind', required: get_option('unwind'))
+libexecinfo = cc.find_library('execinfo', required: false)
+has_execinfo = cc.has_function('backtrace_symbols', dependencies: libexecinfo, prefix: '#include <execinfo.h>')
+dbghelp = cc.check_header('dbghelp.h', prefix: '#include <windows.h>')
+conf_internal.set('PL_HAVE_DBGHELP', dbghelp)
+conf_internal.set('PL_HAVE_UNWIND', unwind.found())
+conf_internal.set('PL_HAVE_EXECINFO', has_execinfo)
+if dbghelp
+ build_deps += cc.find_library('shlwapi', required: true)
+elif unwind.found()
+ build_deps += [unwind, cc.find_library('dl', required : false)]
+elif has_execinfo
+ build_deps += libexecinfo
+endif
+
+link_args = []
+link_depends = []
+
+# Meson in certain configurations appears to return ' ' instead of an empty string
+mingw32 = cc.get_define('__MINGW32__').strip()
+if host_machine.system() == 'windows' and mingw32 != '' and host_machine.cpu() in ['aarch64', 'arm', 'x86_64']
+ # MinGW-w64 math functions are significantly slower than the UCRT ones.
+ # In particular, powf is over 7 times slower than its UCRT counterpart.
+ # MinGW-w64 explicitly excludes some math functions from its ucrtbase def
+ # file and replaces them with its own versions. To work around this, generate
+ # an import library and link against the UCRT versions of the math functions.
+ dlltool = find_program('llvm-dlltool', 'dlltool')
+ ucrt_math = custom_target('ucrt_math.lib',
+ output : ['ucrt_math.lib'],
+ input : 'ucrt_math.def',
+ command : [dlltool, '-d', '@INPUT@', '-l', '@OUTPUT@'])
+ link_args += ucrt_math.full_path()
+ link_depends += ucrt_math
+ # MinGW-w64 inlines functions like powf, rewriting them to pow. We want to use
+ # the powf specialization from UCRT, so disable inlining.
+ add_project_arguments(['-D__CRT__NO_INLINE'], language: ['c', 'cpp'])
+endif
+
+# Work around missing atomics on some (obscure) platforms
+atomic_test = '''
+#include <stdatomic.h>
+#include <stdint.h>
+int main(void) {
+ _Atomic uint32_t x32;
+ atomic_init(&x32, 0);
+}'''
+
+if not cc.links(atomic_test)
+ build_deps += cc.find_library('atomic')
+endif
+
+
+### Common source files
+headers = [
+ 'cache.h',
+ 'colorspace.h',
+ 'common.h',
+ 'd3d11.h',
+ 'dispatch.h',
+ 'dither.h',
+ 'dummy.h',
+ 'filters.h',
+ 'gamut_mapping.h',
+ 'gpu.h',
+ 'log.h',
+ 'opengl.h',
+ 'options.h',
+ 'renderer.h',
+ 'shaders/colorspace.h',
+ 'shaders/custom.h',
+ 'shaders/deinterlacing.h',
+ 'shaders/dithering.h',
+ 'shaders/film_grain.h',
+ 'shaders/icc.h',
+ 'shaders/lut.h',
+ 'shaders/sampling.h',
+ 'shaders.h',
+ 'swapchain.h',
+ 'tone_mapping.h',
+ 'utils/dav1d.h',
+ 'utils/dav1d_internal.h',
+ 'utils/dolbyvision.h',
+ 'utils/frame_queue.h',
+ 'utils/libav.h',
+ 'utils/libav_internal.h',
+ 'utils/upload.h',
+ 'vulkan.h',
+]
+
+sources = [
+ 'cache.c',
+ 'colorspace.c',
+ 'common.c',
+ 'convert.cc',
+ 'dither.c',
+ 'dispatch.c',
+ 'dummy.c',
+ 'filters.c',
+ 'format.c',
+ 'gamut_mapping.c',
+ 'glsl/spirv.c',
+ 'gpu.c',
+ 'gpu/utils.c',
+ 'log.c',
+ 'options.c',
+ 'pl_alloc.c',
+ 'pl_string.c',
+ 'swapchain.c',
+ 'tone_mapping.c',
+ 'utils/dolbyvision.c',
+ 'utils/frame_queue.c',
+ 'utils/upload.c',
+]
+
+# Source files that may use GLSL pragmas; we need to use custom_target to
+# provide the proper environment and dependency information for these
+foreach f : ['renderer.c', 'shaders.c']
+ sources += custom_target(f,
+ command: glsl_preproc,
+ depend_files: glsl_deps,
+ env: python_env,
+ input: f,
+ output: f,
+ )
+endforeach
+
+# More .c files are defined here; we can't put them in this file because meson
+# prevents the use of / in custom_target output filenames
+subdir('shaders')
+
+tests = [
+ 'cache.c',
+ 'colorspace.c',
+ 'common.c',
+ 'dither.c',
+ 'dummy.c',
+ 'lut.c',
+ 'filters.c',
+ 'options.c',
+ 'string.c',
+ 'tone_mapping.c',
+ 'utils.c',
+]
+
+fuzzers = [
+ 'lut.c',
+ 'options.c',
+ 'shaders.c',
+ 'user_shaders.c',
+]
+
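+# Tracks which optional components were detected; used below to generate the
+# PL_HAVE_* defines and pkg-config variables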
+components = configuration_data()
+
+
+### Optional dependencies / components
+subdir('glsl')
+subdir('d3d11')
+subdir('opengl')
+subdir('vulkan')
+
+lcms = dependency('lcms2', version: '>=2.9', required: get_option('lcms'))
+components.set('lcms', lcms.found())
+if lcms.found()
+ build_deps += lcms
+ tests += 'icc.c'
+endif
+
+# Check to see if libplacebo built this way is sane
+if not (components.get('vulkan') or components.get('opengl') or components.get('d3d11'))
+ warning('Building without any graphics API. libplacebo built this way still ' +
+ 'has some limited use (e.g. generating GLSL shaders), but most of ' +
+ 'its functionality will be missing or impaired!')
+endif
+
+has_spirv = components.get('shaderc') or components.get('glslang')
+needs_spirv = components.get('vulkan') or components.get('d3d11')
+if needs_spirv and not has_spirv
+ warning('Building without any GLSL compiler (shaderc, glslang), but with ' +
+ 'APIs enabled that require one (vulkan, d3d11). This build is very ' +
+ 'likely to be very limited in functionality!')
+endif
+
+dovi = get_option('dovi')
+components.set('dovi', dovi.allowed())
+
+libdovi = dependency('dovi', version: '>=1.6.7', required: get_option('libdovi').require(dovi.allowed()))
+components.set('libdovi', libdovi.found())
+if libdovi.found()
+ build_deps += libdovi
+endif
+
+xxhash_inc = include_directories()
+xxhash = dependency('libxxhash', required: get_option('xxhash'))
+components.set('xxhash', xxhash.found())
+if xxhash.found()
+ xxhash_inc = xxhash.get_variable('includedir')
+endif
+
+# Generate configuration files
+defs = ''
+pc_vars = []
+
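+# For example, a detected 'lcms' component becomes '#define PL_HAVE_LCMS 1' in
+# config.h and 'pl_has_lcms=1' in the pkg-config file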
+foreach comp : components.keys()
+ found = components.get(comp)
+ varname = comp.underscorify().to_upper()
+ summary(comp, found, section: 'Optional features', bool_yn: true)
+ defs += (found ? '#define PL_HAVE_@0@ 1\n' : '#undef PL_HAVE_@0@\n').format(varname)
+ pc_vars += 'pl_has_@0@=@1@'.format(varname.to_lower(), found ? 1 : 0)
+endforeach
+
+conf_public.set('extra_defs', defs)
+subdir('./include/libplacebo') # generate config.h in the right location
+sources += configure_file(
+ output: 'config_internal.h',
+ configuration: conf_internal
+)
+
+version_h = vcs_tag(
+ command: ['git', 'describe', '--dirty'],
+ fallback: version_pretty,
+ replace_string: '@buildver@',
+ input: 'version.h.in',
+ output: 'version.h',
+)
+
+sources += version_h
+
+if host_machine.system() == 'windows'
+ windows = import('windows')
+ sources += windows.compile_resources(libplacebo_rc, depends: version_h,
+ include_directories: meson.project_source_root()/'win32')
+endif
+
+fast_float_inc = include_directories()
+if fs.is_dir('../3rdparty/fast_float/include')
+ fast_float_inc = include_directories('../3rdparty/fast_float/include')
+endif
+
+### Main library build process
+inc = include_directories('./include')
+lib = library('placebo', sources,
+ c_args: ['-DPL_EXPORT'],
+ install: true,
+ dependencies: build_deps + glad_dep,
+ soversion: apiver,
+ include_directories: [ inc, vulkan_headers_inc, fast_float_inc, xxhash_inc ],
+ link_args: link_args,
+ link_depends: link_depends,
+ gnu_symbol_visibility: 'hidden',
+ name_prefix: 'lib'
+)
+
+libplacebo = declare_dependency(
+ include_directories: inc,
+ compile_args: get_option('default_library') == 'static' ? ['-DPL_STATIC'] : [],
+ link_with: lib,
+ variables: pc_vars,
+)
+
+
+### Install process
+proj_name = meson.project_name()
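+# Recreate each header's subdirectory layout under the install prefix by
+# dropping the trailing filename component of its relative path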
+foreach h : headers
+ parts = h.split('/')
+ path = proj_name
+ foreach p : parts
+ if p != parts[-1]
+ path = path / p
+ endif
+ endforeach
+
+ install_headers('include' / proj_name / h, subdir: path)
+endforeach
+
+extra_cflags = []
+if get_option('default_library') == 'static'
+ extra_cflags = ['-DPL_STATIC']
+elif get_option('default_library') == 'both'
+ # meson doesn't support Cflags.private, insert it forcefully...
+ extra_cflags = ['\nCflags.private:', '-DPL_STATIC']
+endif
+
+pkg = import('pkgconfig')
+pkg.generate(
+ name: proj_name,
+ description: 'Reusable library for GPU-accelerated video/image rendering',
+ libraries: lib,
+ version: version,
+ variables: pc_vars,
+ extra_cflags: extra_cflags,
+)
+
+
+### Testing
+tdep_static = declare_dependency(
+ dependencies: build_deps,
+ include_directories: [ inc, include_directories('.') ],
+ compile_args: '-DPL_STATIC'
+ # TODO: Define objects here once Meson 1.1.0 is ok to use
+ # objects: lib.extract_all_objects(recursive: false)
+ )
+
+tdep_shared = declare_dependency(
+ include_directories: [ inc, include_directories('.') ],
+ compile_args: get_option('default_library') == 'static' ? ['-DPL_STATIC'] : [],
+ link_with: lib,
+ )
+
+if get_option('tests')
+ subdir('tests')
+endif
+
+if get_option('bench')
+ if not components.get('vk-proc-addr')
+ error('Compiling the benchmark suite requires vulkan support!')
+ endif
+
+ bench = executable('bench',
+ 'tests/bench.c',
+ dependencies: [tdep_shared, vulkan_headers],
+ link_args: link_args,
+ link_depends: link_depends,
+ include_directories: vulkan_headers_inc,
+ )
+ test('benchmark', bench, is_parallel: false, timeout: 600)
+endif
+
+if get_option('fuzz')
+ foreach f : fuzzers
+ executable('fuzz.' + f, 'tests/fuzz/' + f,
+ objects: lib.extract_all_objects(recursive: false),
+ dependencies: tdep_static,
+ link_args: link_args,
+ link_depends: link_depends,
+ )
+ endforeach
+endif
+
+pl_thread = declare_dependency(
+ include_directories: include_directories('.'),
+ dependencies: threads,
+)
+
+pl_clock = declare_dependency(
+ include_directories: include_directories('.'),
+)
diff --git a/src/opengl/common.h b/src/opengl/common.h
new file mode 100644
index 0000000..c84c69f
--- /dev/null
+++ b/src/opengl/common.h
@@ -0,0 +1,66 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../common.h"
+#include "../log.h"
+#include "../gpu.h"
+#include "pl_thread.h"
+
+#include <libplacebo/opengl.h>
+
+// Collision with llvm-mingw <winnt.h>
+#undef MemoryBarrier
+
+#define GLAD_GL
+#define GLAD_GLES2
+#include <glad/gl.h>
+#include <glad/egl.h>
+
+typedef GladGLContext gl_funcs;
+
+// PL_PRIV(pl_opengl)
+struct gl_ctx {
+ pl_log log;
+ struct pl_opengl_params params;
+ bool is_debug;
+ bool is_debug_egl;
+ bool is_gles;
+
+ // For context locking
+ pl_mutex lock;
+ int count;
+
+ // Dispatch table
+ gl_funcs func;
+};
+
+struct gl_cb {
+ void (*callback)(void *priv);
+ void *priv;
+ GLsync sync;
+};
+
+struct fbo_format {
+ pl_fmt fmt;
+ const struct gl_format *glfmt;
+};
+
+// For locking/unlocking
+bool gl_make_current(pl_opengl gl);
+void gl_release_current(pl_opengl gl);
diff --git a/src/opengl/context.c b/src/opengl/context.c
new file mode 100644
index 0000000..6ca14b8
--- /dev/null
+++ b/src/opengl/context.c
@@ -0,0 +1,332 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <ctype.h>
+
+#include "common.h"
+#include "utils.h"
+#include "gpu.h"
+
+const struct pl_opengl_params pl_opengl_default_params = {0};
+
+static void GLAPIENTRY debug_cb(GLenum source, GLenum type, GLuint id,
+ GLenum severity, GLsizei length,
+ const GLchar *message, const void *userParam)
+{
+ pl_log log = (void *) userParam;
+ enum pl_log_level level = PL_LOG_ERR;
+
+ switch (severity) {
+ case GL_DEBUG_SEVERITY_NOTIFICATION: level = PL_LOG_DEBUG; break;
+ case GL_DEBUG_SEVERITY_LOW: level = PL_LOG_INFO; break;
+ case GL_DEBUG_SEVERITY_MEDIUM: level = PL_LOG_WARN; break;
+ case GL_DEBUG_SEVERITY_HIGH: level = PL_LOG_ERR; break;
+ }
+
+ pl_msg(log, level, "GL: %s", message);
+
+ if (level <= PL_LOG_ERR)
+ pl_log_stack_trace(log, level);
+}
+
+static void GLAPIENTRY debug_cb_egl(EGLenum error, const char *command,
+ EGLint messageType, EGLLabelKHR threadLabel,
+ EGLLabelKHR objectLabel, const char *message)
+{
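+ // The pl_log was attached as the EGL thread label via eglLabelObjectKHR()
+ // during pl_opengl_create(), so recover it from `threadLabel` here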
+ pl_log log = threadLabel;
+ enum pl_log_level level = PL_LOG_ERR;
+
+ switch (messageType) {
+ case EGL_DEBUG_MSG_CRITICAL_KHR: level = PL_LOG_FATAL; break;
+ case EGL_DEBUG_MSG_ERROR_KHR: level = PL_LOG_ERR; break;
+ case EGL_DEBUG_MSG_WARN_KHR: level = PL_LOG_WARN; break;
+ case EGL_DEBUG_MSG_INFO_KHR: level = PL_LOG_DEBUG; break;
+ }
+
+ pl_msg(log, level, "EGL: %s: %s %s", command, egl_err_str(error),
+ message);
+
+ if (level <= PL_LOG_ERR)
+ pl_log_stack_trace(log, level);
+}
+
+// Guards access to the (thread-unsafe) glad global EGL state
+static pl_static_mutex glad_egl_mutex = PL_STATIC_MUTEX_INITIALIZER;
+
+void pl_opengl_destroy(pl_opengl *ptr)
+{
+ pl_opengl pl_gl = *ptr;
+ if (!pl_gl)
+ return;
+
+ struct gl_ctx *p = PL_PRIV(pl_gl);
+ gl_funcs *gl = &p->func;
+ if (!gl_make_current(pl_gl)) {
+ PL_WARN(p, "Failed uninitializing OpenGL context, leaking resources!");
+ return;
+ }
+
+ if (p->is_debug)
+ gl->DebugMessageCallback(NULL, NULL);
+
+ if (p->is_debug_egl)
+ eglDebugMessageControlKHR(NULL, NULL);
+
+ pl_gpu_destroy(pl_gl->gpu);
+
+#ifdef PL_HAVE_GL_PROC_ADDR
+ if (p->is_gles) {
+ gladLoaderUnloadGLES2Context(gl);
+ } else {
+ gladLoaderUnloadGLContext(gl);
+ }
+
+ bool used_loader = !p->params.get_proc_addr && !p->params.get_proc_addr_ex;
+ if (p->params.egl_display && used_loader) {
+ pl_static_mutex_lock(&glad_egl_mutex);
+ gladLoaderUnloadEGL();
+ pl_static_mutex_unlock(&glad_egl_mutex);
+ }
+#endif
+
+ gl_release_current(pl_gl);
+ pl_mutex_destroy(&p->lock);
+ pl_free_ptr((void **) ptr);
+}
+
+typedef PL_ARRAY(const char *) ext_arr_t;
+static void add_exts_str(void *alloc, ext_arr_t *arr, const char *extstr)
+{
+ pl_str rest = pl_str_strip(pl_str0(pl_strdup0(alloc, pl_str0(extstr))));
+ while (rest.len) {
+ pl_str ext = pl_str_split_char(rest, ' ', &rest);
+ ext.buf[ext.len] = '\0'; // re-use separator for terminator
+ PL_ARRAY_APPEND(alloc, *arr, (char *) ext.buf);
+ }
+}
+
+pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params)
+{
+ params = PL_DEF(params, &pl_opengl_default_params);
+ struct pl_opengl_t *pl_gl = pl_zalloc_obj(NULL, pl_gl, struct gl_ctx);
+ struct gl_ctx *p = PL_PRIV(pl_gl);
+ gl_funcs *gl = &p->func;
+ p->params = *params;
+ p->log = log;
+
+ pl_mutex_init_type(&p->lock, PL_MUTEX_RECURSIVE);
+ if (!gl_make_current(pl_gl)) {
+ pl_free(pl_gl);
+ return NULL;
+ }
+
+ bool ok;
+ if (params->get_proc_addr_ex) {
+ ok = gladLoadGLContextUserPtr(gl, params->get_proc_addr_ex, params->proc_ctx);
+ } else if (params->get_proc_addr) {
+ ok = gladLoadGLContext(gl, params->get_proc_addr);
+ } else {
+#ifdef PL_HAVE_GL_PROC_ADDR
+ ok = gladLoaderLoadGLContext(gl);
+#else
+ PL_FATAL(p, "No `glGetProcAddress` function provided, and libplacebo "
+ "built without its built-in OpenGL loader!");
+ goto error;
+#endif
+ }
+
+ if (!ok) {
+ PL_INFO(p, "Failed loading core GL, retrying as GLES...");
+ } else if (gl_is_gles(pl_gl)) {
+ PL_INFO(p, "GL context seems to be OpenGL ES, reloading as GLES...");
+ ok = false;
+ }
+
+ if (!ok) {
+ memset(gl, 0, sizeof(*gl));
+ if (params->get_proc_addr_ex) {
+ ok = gladLoadGLES2ContextUserPtr(gl, params->get_proc_addr_ex, params->proc_ctx);
+ } else if (params->get_proc_addr) {
+ ok = gladLoadGLES2Context(gl, params->get_proc_addr);
+ } else {
+#ifdef PL_HAVE_GL_PROC_ADDR
+ ok = gladLoaderLoadGLES2Context(gl);
+#else
+ pl_unreachable();
+#endif
+ }
+ p->is_gles = ok;
+ }
+
+ if (!ok) {
+ PL_FATAL(p, "Failed to initialize OpenGL context - make sure a valid "
+ "OpenGL context is bound to the current thread!");
+ goto error;
+ }
+
+ const char *version = (const char *) gl->GetString(GL_VERSION);
+ if (version) {
+ const char *ver = version;
+ while (!isdigit(*ver) && *ver != '\0')
+ ver++;
+ if (sscanf(ver, "%d.%d", &pl_gl->major, &pl_gl->minor) != 2) {
+ PL_FATAL(p, "Invalid GL_VERSION string: %s\n", version);
+ goto error;
+ }
+ }
+
+ if (!pl_gl->major) {
+ PL_FATAL(p, "No OpenGL version detected - make sure an OpenGL context "
+ "is bound to the current thread!");
+ goto error;
+ }
+
+ static const int gl_ver_req = 3;
+ if (pl_gl->major < gl_ver_req) {
+ PL_FATAL(p, "OpenGL version too old (%d < %d), please use a newer "
+ "OpenGL implementation or downgrade libplacebo!",
+ pl_gl->major, gl_ver_req);
+ goto error;
+ }
+
+ PL_INFO(p, "Detected OpenGL version strings:");
+ PL_INFO(p, " GL_VERSION: %s", version);
+ PL_INFO(p, " GL_VENDOR: %s", (char *) gl->GetString(GL_VENDOR));
+ PL_INFO(p, " GL_RENDERER: %s", (char *) gl->GetString(GL_RENDERER));
+
+ ext_arr_t exts = {0};
+ if (pl_gl->major >= 3) {
+ gl->GetIntegerv(GL_NUM_EXTENSIONS, &exts.num);
+ PL_ARRAY_RESIZE(pl_gl, exts, exts.num);
+ for (int i = 0; i < exts.num; i++)
+ exts.elem[i] = (const char *) gl->GetStringi(GL_EXTENSIONS, i);
+ } else {
+ add_exts_str(pl_gl, &exts, (const char *) gl->GetString(GL_EXTENSIONS));
+ }
+
+ if (pl_msg_test(log, PL_LOG_DEBUG)) {
+ PL_DEBUG(p, " GL_EXTENSIONS:");
+ for (int i = 0; i < exts.num; i++)
+ PL_DEBUG(p, " %s", exts.elem[i]);
+ }
+
+ if (params->egl_display) {
+ pl_static_mutex_lock(&glad_egl_mutex);
+ if (params->get_proc_addr_ex) {
+ ok = gladLoadEGLUserPtr(params->egl_display, params->get_proc_addr_ex,
+ params->proc_ctx);
+ } else if (params->get_proc_addr) {
+ ok = gladLoadEGL(params->egl_display, params->get_proc_addr);
+ } else {
+#ifdef PL_HAVE_GL_PROC_ADDR
+ ok = gladLoaderLoadEGL(params->egl_display);
+#else
+ pl_unreachable();
+#endif
+ }
+ pl_static_mutex_unlock(&glad_egl_mutex);
+
+ if (!ok) {
+ PL_FATAL(p, "Failed loading EGL functions - double check EGLDisplay?");
+ goto error;
+ }
+
+ int start = exts.num;
+ add_exts_str(pl_gl, &exts, eglQueryString(params->egl_display,
+ EGL_EXTENSIONS));
+ if (exts.num > start) {
+ PL_DEBUG(p, " EGL_EXTENSIONS:");
+ for (int i = start; i < exts.num; i++)
+ PL_DEBUG(p, " %s", exts.elem[i]);
+ }
+ }
+
+ pl_gl->extensions = exts.elem;
+ pl_gl->num_extensions = exts.num;
+
+ if (!params->allow_software && gl_is_software(pl_gl)) {
+ PL_FATAL(p, "OpenGL context is suspected to be a software rasterizer, "
+ "but `allow_software` is false.");
+ goto error;
+ }
+
+ if (params->debug) {
+ if (pl_opengl_has_ext(pl_gl, "GL_KHR_debug")) {
+ gl->DebugMessageCallback(debug_cb, log);
+ gl->Enable(GL_DEBUG_OUTPUT);
+ p->is_debug = true;
+ } else {
+ PL_WARN(p, "OpenGL debugging requested, but GL_KHR_debug is not "
+ "available... ignoring!");
+ }
+
+ if (params->egl_display && pl_opengl_has_ext(pl_gl, "EGL_KHR_debug")) {
+ static const EGLAttrib attribs[] = {
+ // Enable everything under the sun, because the `pl_ctx` log
+ // level may change at runtime.
+ EGL_DEBUG_MSG_CRITICAL_KHR, EGL_TRUE,
+ EGL_DEBUG_MSG_ERROR_KHR, EGL_TRUE,
+ EGL_DEBUG_MSG_WARN_KHR, EGL_TRUE,
+ EGL_DEBUG_MSG_INFO_KHR, EGL_TRUE,
+ EGL_NONE,
+ };
+
+ eglDebugMessageControlKHR(debug_cb_egl, attribs);
+ eglLabelObjectKHR(NULL, EGL_OBJECT_THREAD_KHR, NULL, (void *) log);
+ p->is_debug_egl = true;
+ }
+ }
+
+ pl_gl->gpu = pl_gpu_create_gl(log, pl_gl, params);
+ if (!pl_gl->gpu)
+ goto error;
+
+ gl_release_current(pl_gl);
+ return pl_gl;
+
+error:
+ PL_FATAL(p, "Failed initializing opengl context!");
+ gl_release_current(pl_gl);
+ pl_opengl_destroy((pl_opengl *) &pl_gl);
+ return NULL;
+}
+
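+// Reference-counted: only the outermost call on a thread invokes the user's
+// make_current callback, and the recursive mutex keeps the context locked
+// until the matching gl_release_current()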
+bool gl_make_current(pl_opengl pl_gl)
+{
+ struct gl_ctx *p = PL_PRIV(pl_gl);
+ pl_mutex_lock(&p->lock);
+ if (!p->count && p->params.make_current) {
+ if (!p->params.make_current(p->params.priv)) {
+ PL_ERR(p, "Failed making OpenGL context current on calling thread!");
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+ }
+
+ p->count++;
+ return true;
+}
+
+void gl_release_current(pl_opengl pl_gl)
+{
+ struct gl_ctx *p = PL_PRIV(pl_gl);
+ p->count--;
+ if (!p->count && p->params.release_current)
+ p->params.release_current(p->params.priv);
+ pl_mutex_unlock(&p->lock);
+}
diff --git a/src/opengl/formats.c b/src/opengl/formats.c
new file mode 100644
index 0000000..6604835
--- /dev/null
+++ b/src/opengl/formats.c
@@ -0,0 +1,485 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "common.h"
+#include "formats.h"
+#include "utils.h"
+
+#ifdef PL_HAVE_UNIX
+static bool supported_fourcc(struct pl_gl *p, EGLint fourcc)
+{
+ for (int i = 0; i < p->egl_formats.num; ++i)
+ if (fourcc == p->egl_formats.elem[i])
+ return true;
+ return false;
+}
+#endif
+
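+// Helper to fill out a pl_fmt_t template with a uniform per-component depth
+// and an identity sample order; irregular formats (e.g. bgra8) are spelled
+// out explicitly below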
+#define FMT(_name, bits, ftype, _caps) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_##ftype, \
+ .caps = (enum pl_fmt_caps) (_caps), \
+ .sample_order = {0, 1, 2, 3}, \
+ .component_depth = {bits, bits, bits, bits}, \
+ }
+
+// Convenience to make the names simpler
+enum {
+ // Type aliases
+ U8 = GL_UNSIGNED_BYTE,
+ U16 = GL_UNSIGNED_SHORT,
+ U32 = GL_UNSIGNED_INT,
+ I8 = GL_BYTE,
+ I16 = GL_SHORT,
+ I32 = GL_INT,
+ FLT = GL_FLOAT,
+
+ // Component aliases
+ R = GL_RED,
+ RG = GL_RG,
+ RGB = GL_RGB,
+ RGBA = GL_RGBA,
+ BGRA = GL_BGRA,
+ RI = GL_RED_INTEGER,
+ RGI = GL_RG_INTEGER,
+ RGBI = GL_RGB_INTEGER,
+ RGBAI = GL_RGBA_INTEGER,
+
+ // Capability aliases
+ S = PL_FMT_CAP_SAMPLEABLE,
+ L = PL_FMT_CAP_LINEAR,
+ F = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE, // FBO support
+ V = PL_FMT_CAP_VERTEX,
+};
+
+// Basic 8-bit formats
+const struct gl_format formats_norm8[] = {
+ {GL_R8, R, U8, FMT("r8", 8, UNORM, S|L|F|V)},
+ {GL_RG8, RG, U8, FMT("rg8", 8, UNORM, S|L|F|V)},
+ {GL_RGB8, RGB, U8, FMT("rgb8", 8, UNORM, S|L|F|V)},
+ {GL_RGBA8, RGBA, U8, FMT("rgba8", 8, UNORM, S|L|F|V)},
+};
+
+// Signed variants
+/* TODO: these are broken in mesa
+const struct gl_format formats_snorm8[] = {
+ {GL_R8_SNORM, R, I8, FMT("r8s", 8, SNORM, S|L|F|V)},
+ {GL_RG8_SNORM, RG, I8, FMT("rg8s", 8, SNORM, S|L|F|V)},
+ {GL_RGB8_SNORM, RGB, I8, FMT("rgb8s", 8, SNORM, S|L|F|V)},
+ {GL_RGBA8_SNORM, RGBA, I8, FMT("rgba8s", 8, SNORM, S|L|F|V)},
+};
+*/
+
+// BGRA 8-bit
+const struct gl_format formats_bgra8[] = {
+ {GL_RGBA8, BGRA, U8, {
+ .name = "bgra8",
+ .type = PL_FMT_UNORM,
+ .caps = S|L|F|V,
+ .sample_order = {2, 1, 0, 3},
+ .component_depth = {8, 8, 8, 8},
+ }},
+};
+
+// Basic 16-bit formats, excluding rgb16 (special cased below)
+const struct gl_format formats_norm16[] = {
+ {GL_R16, R, U16, FMT("r16", 16, UNORM, S|L|F|V)},
+ {GL_RG16, RG, U16, FMT("rg16", 16, UNORM, S|L|F|V)},
+ {GL_RGBA16, RGBA, U16, FMT("rgba16", 16, UNORM, S|L|F|V)},
+};
+
+// Renderable version of rgb16
+const struct gl_format formats_rgb16_fbo[] = {
+ {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|F|V)},
+};
+
+// Non-renderable version of rgb16
+const struct gl_format formats_rgb16_fallback[] = {
+ {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|V)},
+};
+
+// Signed 16-bit variants
+/* TODO: these are broken in mesa and nvidia
+const struct gl_format formats_snorm16[] = {
+ {GL_R16_SNORM, R, I16, FMT("r16s", 16, SNORM, S|L|F|V)},
+ {GL_RG16_SNORM, RG, I16, FMT("rg16s", 16, SNORM, S|L|F|V)},
+ {GL_RGB16_SNORM, RGB, I16, FMT("rgb16s", 16, SNORM, S|L|F|V)},
+ {GL_RGBA16_SNORM, RGBA, I16, FMT("rgba16s", 16, SNORM, S|L|F|V)},
+};
+*/
+
+// Floating point texture formats
+const struct gl_format formats_float[] = {
+ {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L|F)},
+ {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L|F)},
+ {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L|F)},
+ {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L|F)},
+ {GL_R32F, R, FLT, FMT("r32f", 32, FLOAT, S|L|F|V)},
+ {GL_RG32F, RG, FLT, FMT("rg32f", 32, FLOAT, S|L|F|V)},
+ {GL_RGB32F, RGB, FLT, FMT("rgb32f", 32, FLOAT, S|L|F|V)},
+ {GL_RGBA32F, RGBA, FLT, FMT("rgba32f", 32, FLOAT, S|L|F|V)},
+};
+
+// Renderable 16-bit float formats (excluding rgb16f)
+const struct gl_format formats_float16_fbo[] = {
+ {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L|F)},
+ {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L|F)},
+ {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L)},
+ {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L|F)},
+};
+
+// Non-renderable 16-bit float formats
+const struct gl_format formats_float16_fallback[] = {
+ {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L)},
+ {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L)},
+ {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L)},
+ {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L)},
+};
+
+// (Unsigned) integer formats
+const struct gl_format formats_uint[] = {
+ {GL_R8UI, RI, U8, FMT("r8u", 8, UINT, S|F|V)},
+ {GL_RG8UI, RGI, U8, FMT("rg8u", 8, UINT, S|F|V)},
+ {GL_RGB8UI, RGBI, U8, FMT("rgb8u", 8, UINT, S|V)},
+ {GL_RGBA8UI, RGBAI, U8, FMT("rgba8u", 8, UINT, S|F|V)},
+ {GL_R16UI, RI, U16, FMT("r16u", 16, UINT, S|F|V)},
+ {GL_RG16UI, RGI, U16, FMT("rg16u", 16, UINT, S|F|V)},
+ {GL_RGB16UI, RGBI, U16, FMT("rgb16u", 16, UINT, S|V)},
+ {GL_RGBA16UI, RGBAI, U16, FMT("rgba16u", 16, UINT, S|F|V)},
+};
+
+/* TODO
+ {GL_R32UI, RI, U32, FMT("r32u", 32, UINT)},
+ {GL_RG32UI, RGI, U32, FMT("rg32u", 32, UINT)},
+ {GL_RGB32UI, RGBI, U32, FMT("rgb32u", 32, UINT)},
+ {GL_RGBA32UI, RGBAI, U32, FMT("rgba32u", 32, UINT)},
+
+ {GL_R8I, RI, I8, FMT("r8i", 8, SINT)},
+ {GL_RG8I, RGI, I8, FMT("rg8i", 8, SINT)},
+ {GL_RGB8I, RGBI, I8, FMT("rgb8i", 8, SINT)},
+ {GL_RGBA8I, RGBAI, I8, FMT("rgba8i", 8, SINT)},
+ {GL_R16I, RI, I16, FMT("r16i", 16, SINT)},
+ {GL_RG16I, RGI, I16, FMT("rg16i", 16, SINT)},
+ {GL_RGB16I, RGBI, I16, FMT("rgb16i", 16, SINT)},
+ {GL_RGBA16I, RGBAI, I16, FMT("rgba16i", 16, SINT)},
+ {GL_R32I, RI, I32, FMT("r32i", 32, SINT)},
+ {GL_RG32I, RGI, I32, FMT("rg32i", 32, SINT)},
+ {GL_RGB32I, RGBI, I32, FMT("rgb32i", 32, SINT)},
+ {GL_RGBA32I, RGBAI, I32, FMT("rgba32i", 32, SINT)},
+*/
+
+// GL2 legacy formats
+const struct gl_format formats_legacy_gl2[] = {
+ {GL_RGB8, RGB, U8, FMT("rgb8", 8, UNORM, S|L|V)},
+ {GL_RGBA8, RGBA, U8, FMT("rgba8", 8, UNORM, S|L|V)},
+ {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|V)},
+ {GL_RGBA16, RGBA, U16, FMT("rgba16", 16, UNORM, S|L|V)},
+};
+
+// GLES2 legacy formats
+const struct gl_format formats_legacy_gles2[] = {
+ {GL_RGB, RGB, U8, FMT("rgb", 8, UNORM, S|L)},
+ {GL_RGBA, RGBA, U8, FMT("rgba", 8, UNORM, S|L)},
+};
+
+// GLES BGRA
+const struct gl_format formats_bgra_gles[] = {
+ {GL_BGRA, BGRA, U8, {
+ .name = "bgra8",
+ .type = PL_FMT_UNORM,
+ .caps = S|L|F|V,
+ .sample_order = {2, 1, 0, 3},
+ .component_depth = {8, 8, 8, 8},
+ }},
+};
+
+// Fallback for vertex-only formats, as a last resort
+const struct gl_format formats_basic_vertex[] = {
+ {GL_R32F, R, FLT, FMT("r32f", 32, FLOAT, V)},
+ {GL_RG32F, RG, FLT, FMT("rg32f", 32, FLOAT, V)},
+ {GL_RGB32F, RGB, FLT, FMT("rgb32f", 32, FLOAT, V)},
+ {GL_RGBA32F, RGBA, FLT, FMT("rgba32f", 32, FLOAT, V)},
+};
+
+static void add_format(pl_gpu pgpu, const struct gl_format *gl_fmt)
+{
+ struct pl_gpu_t *gpu = (struct pl_gpu_t *) pgpu;
+ struct pl_gl *p = PL_PRIV(gpu);
+
+ struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, gl_fmt);
+ const struct gl_format **fmtp = PL_PRIV(fmt);
+ *fmt = gl_fmt->tmpl;
+ *fmtp = gl_fmt;
+
+ // Calculate the host size and number of components
+ switch (gl_fmt->fmt) {
+ case GL_RED:
+ case GL_RED_INTEGER:
+ fmt->num_components = 1;
+ break;
+ case GL_RG:
+ case GL_RG_INTEGER:
+ fmt->num_components = 2;
+ break;
+ case GL_RGB:
+ case GL_RGB_INTEGER:
+ fmt->num_components = 3;
+ break;
+ case GL_RGBA:
+ case GL_RGBA_INTEGER:
+ case GL_BGRA:
+ fmt->num_components = 4;
+ break;
+ default:
+ pl_unreachable();
+ }
+
+ int size;
+ switch (gl_fmt->type) {
+ case GL_BYTE:
+ case GL_UNSIGNED_BYTE:
+ size = 1;
+ break;
+ case GL_SHORT:
+ case GL_UNSIGNED_SHORT:
+ size = 2;
+ break;
+ case GL_INT:
+ case GL_UNSIGNED_INT:
+ case GL_FLOAT:
+ size = 4;
+ break;
+ default:
+ pl_unreachable();
+ }
+
+ // Host visible representation
+ fmt->texel_size = fmt->num_components * size;
+ fmt->texel_align = 1;
+ for (int i = 0; i < fmt->num_components; i++)
+ fmt->host_bits[i] = size * 8;
+
+ // Compute internal size by summing up the depth
+ int ibits = 0;
+ for (int i = 0; i < fmt->num_components; i++)
+ ibits += fmt->component_depth[i];
+ fmt->internal_size = (ibits + 7) / 8;
+
+ // We're not the ones actually emulating these texture formats - the
+ // driver is - but we might as well set the hint.
+ fmt->emulated = fmt->texel_size != fmt->internal_size;
+
+ // 3-component formats are almost surely also emulated
+ if (fmt->num_components == 3)
+ fmt->emulated = true;
+
+ // Older OpenGL most likely emulates 32-bit float formats as well
+ if (p->gl_ver < 30 && fmt->component_depth[0] >= 32)
+ fmt->emulated = true;
+
+ // For sanity, clear the superfluous fields
+ for (int i = fmt->num_components; i < 4; i++) {
+ fmt->component_depth[i] = 0;
+ fmt->sample_order[i] = 0;
+ fmt->host_bits[i] = 0;
+ }
+
+ fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, ""));
+ fmt->glsl_format = pl_fmt_glsl_format(fmt, fmt->num_components);
+ fmt->fourcc = pl_fmt_fourcc(fmt);
+ pl_assert(fmt->glsl_type);
+
+#ifdef PL_HAVE_UNIX
+ if (p->has_modifiers && fmt->fourcc && supported_fourcc(p, fmt->fourcc)) {
+ int num_mods = 0;
+ bool ok = eglQueryDmaBufModifiersEXT(p->egl_dpy, fmt->fourcc,
+ 0, NULL, NULL, &num_mods);
+ if (ok && num_mods) {
+ // On my system eglQueryDmaBufModifiersEXT seems to never return
+ // MOD_INVALID even though eglExportDMABUFImageQueryMESA happily
+ // returns such modifiers. Since we handle INVALID by not
+ // requiring modifiers at all, always add this value to the
+ // list of supported modifiers. May result in duplicates, but
+ // whatever.
+ uint64_t *mods = pl_calloc(fmt, num_mods + 1, sizeof(uint64_t));
+ mods[0] = DRM_FORMAT_MOD_INVALID;
+ ok = eglQueryDmaBufModifiersEXT(p->egl_dpy, fmt->fourcc, num_mods,
+ &mods[1], NULL, &num_mods);
+
+ if (ok) {
+ fmt->modifiers = mods;
+ fmt->num_modifiers = num_mods + 1;
+ } else {
+ pl_free(mods);
+ }
+ }
+
+ eglGetError(); // ignore probing errors
+ }
+
+ if (!fmt->num_modifiers) {
+ // Hacky fallback for older drivers that don't properly support
+ // querying modifiers
+ static const uint64_t static_mods[] = {
+ DRM_FORMAT_MOD_INVALID,
+ DRM_FORMAT_MOD_LINEAR,
+ };
+
+ fmt->num_modifiers = PL_ARRAY_SIZE(static_mods);
+ fmt->modifiers = static_mods;
+ }
+#endif
+
+ // Gathering requires checking the format type (and extension presence)
+ if (fmt->caps & PL_FMT_CAP_SAMPLEABLE)
+ fmt->gatherable = p->gather_comps >= fmt->num_components;
+
+ // Reading from textures on GLES requires FBO support for this fmt
+ if (p->has_readback && (p->gl_ver || (fmt->caps & PL_FMT_CAP_RENDERABLE)))
+ fmt->caps |= PL_FMT_CAP_HOST_READABLE;
+
+ if (gpu->glsl.compute && fmt->glsl_format && p->has_storage)
+ fmt->caps |= PL_FMT_CAP_STORABLE | PL_FMT_CAP_READWRITE;
+
+ // Only float-type formats are considered blendable in OpenGL
+ switch (fmt->type) {
+ case PL_FMT_UNKNOWN:
+ case PL_FMT_UINT:
+ case PL_FMT_SINT:
+ break;
+ case PL_FMT_FLOAT:
+ case PL_FMT_UNORM:
+ case PL_FMT_SNORM:
+ if (fmt->caps & PL_FMT_CAP_RENDERABLE)
+ fmt->caps |= PL_FMT_CAP_BLENDABLE;
+ break;
+ case PL_FMT_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ // TODO: Texel buffers
+
+ PL_ARRAY_APPEND_RAW(gpu, gpu->formats, gpu->num_formats, fmt);
+}
+
+#define DO_FORMATS(formats) \
+ do { \
+ for (int i = 0; i < PL_ARRAY_SIZE(formats); i++) \
+ add_format(gpu, &formats[i]); \
+ } while (0)
+
+bool gl_setup_formats(struct pl_gpu_t *gpu)
+{
+ struct pl_gl *p = PL_PRIV(gpu);
+
+#ifdef PL_HAVE_UNIX
+ if (p->has_modifiers) {
+ EGLint num_formats = 0;
+ bool ok = eglQueryDmaBufFormatsEXT(p->egl_dpy, 0, NULL,
+ &num_formats);
+ if (ok && num_formats) {
+ p->egl_formats.elem = pl_calloc(gpu, num_formats, sizeof(EGLint));
+ p->egl_formats.num = num_formats;
+ ok = eglQueryDmaBufFormatsEXT(p->egl_dpy, num_formats,
+ p->egl_formats.elem, &num_formats);
+ pl_assert(ok);
+
+ PL_DEBUG(gpu, "EGL formats supported:");
+ for (int i = 0; i < num_formats; ++i) {
+ PL_DEBUG(gpu, " 0x%08x(%.4s)", p->egl_formats.elem[i],
+ PRINT_FOURCC(p->egl_formats.elem[i]));
+ }
+ }
+ }
+#endif
+
+ if (p->gl_ver >= 30) {
+ // Desktop GL3+ has everything
+ DO_FORMATS(formats_norm8);
+ DO_FORMATS(formats_bgra8);
+ DO_FORMATS(formats_norm16);
+ DO_FORMATS(formats_rgb16_fbo);
+ DO_FORMATS(formats_float);
+ DO_FORMATS(formats_uint);
+ goto done;
+ }
+
+ if (p->gl_ver >= 21) {
+ // If we have a reasonable set of extensions, we can enable most
+ // things. Otherwise, pick simple fallback formats
+ if (pl_opengl_has_ext(p->gl, "GL_ARB_texture_float") &&
+ pl_opengl_has_ext(p->gl, "GL_ARB_texture_rg") &&
+ pl_opengl_has_ext(p->gl, "GL_ARB_framebuffer_object"))
+ {
+ DO_FORMATS(formats_norm8);
+ DO_FORMATS(formats_bgra8);
+ DO_FORMATS(formats_norm16);
+ DO_FORMATS(formats_rgb16_fbo);
+ DO_FORMATS(formats_float);
+ } else {
+ // Fallback for GL2
+ DO_FORMATS(formats_legacy_gl2);
+ DO_FORMATS(formats_basic_vertex);
+ }
+ goto done;
+ }
+
+ if (p->gles_ver >= 30) {
+ // GLES 3.0 has some basic formats, with framebuffers for float16
+ // depending on GL_EXT_color_buffer_(half_)float support
+ DO_FORMATS(formats_norm8);
+ if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_norm16")) {
+ DO_FORMATS(formats_norm16);
+ DO_FORMATS(formats_rgb16_fallback);
+ }
+ if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_format_BGRA8888"))
+ DO_FORMATS(formats_bgra_gles);
+ if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_integer"))
+ DO_FORMATS(formats_uint);
+ DO_FORMATS(formats_basic_vertex);
+ if (p->gles_ver >= 32 || pl_opengl_has_ext(p->gl, "GL_EXT_color_buffer_half_float")
+ || pl_opengl_has_ext(p->gl, "GL_EXT_color_buffer_float")) {
+ DO_FORMATS(formats_float16_fbo);
+ } else {
+ DO_FORMATS(formats_float16_fallback);
+ }
+ goto done;
+ }
+
+ if (p->gles_ver >= 20) {
+ // GLES 2.0 only has some legacy fallback formats, with the regular norm8
+ // formats depending on GL_EXT_texture_rg being present
+ DO_FORMATS(formats_legacy_gles2);
+ DO_FORMATS(formats_basic_vertex);
+ if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_rg")) {
+ DO_FORMATS(formats_norm8);
+ }
+ if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_format_BGRA8888")) {
+ DO_FORMATS(formats_bgra_gles);
+ }
+ goto done;
+ }
+
+ // Last resort fallback. Probably not very useful
+ DO_FORMATS(formats_basic_vertex);
+ goto done;
+
+done:
+ return gl_check_err(gpu, "gl_setup_formats");
+}
diff --git a/src/opengl/formats.h b/src/opengl/formats.h
new file mode 100644
index 0000000..b98c872
--- /dev/null
+++ b/src/opengl/formats.h
@@ -0,0 +1,32 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+struct gl_format {
+ GLint ifmt; // sized internal format (e.g. GL_RGBA16F)
+ GLenum fmt; // base internal format (e.g. GL_RGBA)
+ GLenum type; // host-visible type (e.g. GL_FLOAT)
+ struct pl_fmt_t tmpl; // pl_fmt template
+};
+
+typedef void (gl_format_cb)(pl_gpu gpu, const struct gl_format *glfmt);
+
+// Add all supported formats to the `pl_gpu` format list.
+bool gl_setup_formats(struct pl_gpu_t *gpu);
diff --git a/src/opengl/gpu.c b/src/opengl/gpu.c
new file mode 100644
index 0000000..b711ac5
--- /dev/null
+++ b/src/opengl/gpu.c
@@ -0,0 +1,645 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "common.h"
+#include "formats.h"
+#include "utils.h"
+
+#ifdef PL_HAVE_UNIX
+#include <unistd.h>
+#endif
+
+#ifdef PL_HAVE_WIN32
+#include <windows.h>
+#include <sysinfoapi.h>
+#endif
+
+static const struct pl_gpu_fns pl_fns_gl;
+
+static void gl_gpu_destroy(pl_gpu gpu)
+{
+ struct pl_gl *p = PL_PRIV(gpu);
+
+ pl_gpu_finish(gpu);
+ while (p->callbacks.num > 0)
+ gl_poll_callbacks(gpu);
+
+ pl_free((void *) gpu);
+}
+
+pl_opengl pl_opengl_get(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
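+ // `impl` is the first member of `struct pl_gl`, so if the destroy callback
+ // identifies this as a GL-backed GPU, the cast below is safe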
+ if (impl->destroy == gl_gpu_destroy) {
+ struct pl_gl *p = (struct pl_gl *) impl;
+ return p->gl;
+ }
+
+ return NULL;
+}
+
+static pl_handle_caps tex_handle_caps(pl_gpu gpu, bool import)
+{
+ pl_handle_caps caps = 0;
+ struct pl_gl *p = PL_PRIV(gpu);
+
+ if (!p->egl_dpy || (!p->has_egl_storage && !p->has_egl_import))
+ return 0;
+
+ if (import) {
+ if (pl_opengl_has_ext(p->gl, "EGL_EXT_image_dma_buf_import"))
+ caps |= PL_HANDLE_DMA_BUF;
+ } else if (!import && p->egl_ctx) {
+ if (pl_opengl_has_ext(p->gl, "EGL_MESA_image_dma_buf_export"))
+ caps |= PL_HANDLE_DMA_BUF;
+ }
+
+ return caps;
+}
+
+static inline size_t get_page_size(void)
+{
+
+#ifdef PL_HAVE_UNIX
+ return sysconf(_SC_PAGESIZE);
+#endif
+
+#ifdef PL_HAVE_WIN32
+ SYSTEM_INFO sysInfo;
+ GetSystemInfo(&sysInfo);
+ return sysInfo.dwAllocationGranularity;
+#endif
+
+ pl_assert(!"Unsupported platform!");
+}
+
+#define get(pname, field) \
+ do { \
+ GLint tmp = 0; \
+ gl->GetIntegerv((pname), &tmp); \
+ *(field) = tmp; \
+ } while (0)
+
+#define geti(pname, i, field) \
+ do { \
+ GLint tmp = 0; \
+ gl->GetIntegeri_v((pname), i, &tmp);\
+ *(field) = tmp; \
+ } while (0)
+
+pl_gpu pl_gpu_create_gl(pl_log log, pl_opengl pl_gl, const struct pl_opengl_params *params)
+{
+ struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gl);
+ gpu->log = log;
+
+ struct pl_gl *p = PL_PRIV(gpu);
+ p->impl = pl_fns_gl;
+ p->gl = pl_gl;
+
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_glsl_version *glsl = &gpu->glsl;
+ glsl->gles = gl_is_gles(pl_gl);
+ int ver = pl_gl->major * 10 + pl_gl->minor;
+ p->gl_ver = glsl->gles ? 0 : ver;
+ p->gles_ver = glsl->gles ? ver : 0;
+
+ // If possible, query the GLSL version from the implementation
+ const char *glslver = (char *) gl->GetString(GL_SHADING_LANGUAGE_VERSION);
+ if (glslver) {
+ PL_INFO(gpu, " GL_SHADING_LANGUAGE_VERSION: %s", glslver);
+ int major = 0, minor = 0;
+ if (sscanf(glslver, "%d.%d", &major, &minor) == 2)
+ glsl->version = major * 100 + minor;
+ }
+
+ if (!glsl->version) {
+ // Otherwise, use the fixed magic versions 100 and 300 for GLES.
+ if (p->gles_ver >= 30) {
+ glsl->version = 300;
+ } else if (p->gles_ver >= 20) {
+ glsl->version = 100;
+ } else {
+ goto error;
+ }
+ }
+
+ static const int glsl_ver_req = 130;
+ if (glsl->version < glsl_ver_req) {
+ PL_FATAL(gpu, "GLSL version too old (%d < %d), please use a newer "
+ "OpenGL implementation or downgrade libplacebo!",
+ glsl->version, glsl_ver_req);
+ goto error;
+ }
+
+ if (params->max_glsl_version && params->max_glsl_version >= glsl_ver_req) {
+ glsl->version = PL_MIN(glsl->version, params->max_glsl_version);
+ PL_INFO(gpu, "Restricting GLSL version to %d... new version is %d",
+ params->max_glsl_version, glsl->version);
+ }
+
+ if (gl_test_ext(gpu, "GL_ARB_compute_shader", 43, 0) && glsl->version >= 420) {
+ glsl->compute = true;
+ get(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &glsl->max_shmem_size);
+ get(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &glsl->max_group_threads);
+ for (int i = 0; i < 3; i++)
+ geti(GL_MAX_COMPUTE_WORK_GROUP_SIZE, i, &glsl->max_group_size[i]);
+ }
+
+ if (gl_test_ext(gpu, "GL_ARB_texture_gather", 40, 0)) {
+ get(GL_MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB, &p->gather_comps);
+ get(GL_MIN_PROGRAM_TEXTURE_GATHER_OFFSET_ARB, &glsl->min_gather_offset);
+ get(GL_MAX_PROGRAM_TEXTURE_GATHER_OFFSET_ARB, &glsl->max_gather_offset);
+ }
+
+ // Query all device limits
+ struct pl_gpu_limits *limits = &gpu->limits;
+ limits->thread_safe = params->make_current;
+ limits->callbacks = gl_test_ext(gpu, "GL_ARB_sync", 32, 30);
+ limits->align_vertex_stride = 1;
+ if (gl_test_ext(gpu, "GL_ARB_pixel_buffer_object", 31, 0)) {
+ limits->max_buf_size = SIZE_MAX; // no restriction imposed by GL
+ if (gl_test_ext(gpu, "GL_ARB_uniform_buffer_object", 31, 0))
+ get(GL_MAX_UNIFORM_BLOCK_SIZE, &limits->max_ubo_size);
+ if (gl_test_ext(gpu, "GL_ARB_shader_storage_buffer_object", 43, 0) &&
+ gpu->glsl.version >= 140)
+ {
+ get(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &limits->max_ssbo_size);
+ }
+ limits->max_vbo_size = limits->max_buf_size; // No additional restrictions
+ if (gl_test_ext(gpu, "GL_ARB_buffer_storage", 44, 0)) {
+ const char *vendor = (char *) gl->GetString(GL_VENDOR);
+ limits->max_mapped_size = limits->max_buf_size;
+ limits->host_cached = strcmp(vendor, "AMD") == 0 ||
+ strcmp(vendor, "NVIDIA Corporation") == 0;
+ }
+ }
+
+ get(GL_MAX_TEXTURE_SIZE, &limits->max_tex_2d_dim);
+ if (gl_test_ext(gpu, "GL_EXT_texture3D", 21, 30))
+ get(GL_MAX_3D_TEXTURE_SIZE, &limits->max_tex_3d_dim);
+ // There's no equivalent limit for 1D textures for whatever reason, so
+ // just set it to the same as the 2D limit
+ if (p->gl_ver >= 21)
+ limits->max_tex_1d_dim = limits->max_tex_2d_dim;
+ limits->buf_transfer = true;
+
+ if (p->gl_ver || p->gles_ver >= 30) {
+ get(GL_MAX_FRAGMENT_UNIFORM_COMPONENTS, &limits->max_variable_comps);
+ } else {
+ // fallback for GLES 2.0, which doesn't have max_comps
+ get(GL_MAX_FRAGMENT_UNIFORM_VECTORS, &limits->max_variable_comps);
+ limits->max_variable_comps *= 4;
+ }
+
+ if (glsl->compute) {
+ for (int i = 0; i < 3; i++)
+ geti(GL_MAX_COMPUTE_WORK_GROUP_COUNT, i, &limits->max_dispatch[i]);
+ }
+
+ // Query import/export support
+ p->egl_dpy = params->egl_display;
+ p->egl_ctx = params->egl_context;
+ p->has_egl_storage = pl_opengl_has_ext(p->gl, "GL_EXT_EGL_image_storage");
+ p->has_egl_import = pl_opengl_has_ext(p->gl, "GL_OES_EGL_image_external");
+ gpu->export_caps.tex = tex_handle_caps(gpu, false);
+ gpu->import_caps.tex = tex_handle_caps(gpu, true);
+
+ if (p->egl_dpy) {
+ p->has_modifiers = pl_opengl_has_ext(p->gl,
+ "EGL_EXT_image_dma_buf_import_modifiers");
+ }
+
+ if (pl_opengl_has_ext(pl_gl, "GL_AMD_pinned_memory")) {
+ gpu->import_caps.buf |= PL_HANDLE_HOST_PTR;
+ gpu->limits.align_host_ptr = get_page_size();
+ }
+
+ // Cache some internal capability checks
+ p->has_vao = gl_test_ext(gpu, "GL_ARB_vertex_array_object", 30, 0);
+ p->has_invalidate_fb = gl_test_ext(gpu, "GL_ARB_invalidate_subdata", 43, 30);
+ p->has_invalidate_tex = gl_test_ext(gpu, "GL_ARB_invalidate_subdata", 43, 0);
+ p->has_queries = gl_test_ext(gpu, "GL_ARB_timer_query", 33, 0);
+ p->has_storage = gl_test_ext(gpu, "GL_ARB_shader_image_load_store", 42, 0);
+ p->has_readback = true;
+
+ if (p->has_readback && p->gles_ver) {
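+ // Probe lossless readback by attaching a dummy R8 texture to an FBO and
+ // querying the implementation's preferred readback format/type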
+ GLuint fbo = 0, tex = 0;
+ GLint read_type = 0, read_fmt = 0;
+ gl->GenTextures(1, &tex);
+ gl->BindTexture(GL_TEXTURE_2D, tex);
+ gl->GenFramebuffers(1, &fbo);
+ gl->TexImage2D(GL_TEXTURE_2D, 0, GL_R8, 64, 64, 0, GL_RED,
+ GL_UNSIGNED_BYTE, NULL);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo);
+ gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+ GL_TEXTURE_2D, tex, 0);
+ gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type);
+ gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt);
+ if (read_type != GL_UNSIGNED_BYTE || read_fmt != GL_RED) {
+ PL_INFO(gpu, "GPU does not seem to support lossless texture "
+ "readback, restricting readback capabilities! This is a "
+ "GLES/driver limitation, there is little we can do to "
+ "work around it.");
+ p->has_readback = false;
+ }
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ gl->BindTexture(GL_TEXTURE_2D, 0);
+ gl->DeleteFramebuffers(1, &fbo);
+ gl->DeleteTextures(1, &tex);
+ }
+
+ // We simply don't know, so make up some values
+ limits->align_tex_xfer_offset = 32;
+ limits->align_tex_xfer_pitch = 4;
+ limits->fragment_queues = 1;
+ limits->compute_queues = glsl->compute ? 1 : 0;
+
+ if (!gl_check_err(gpu, "pl_gpu_create_gl")) {
+ PL_WARN(gpu, "Encountered errors while detecting GPU capabilities... "
+ "ignoring, but expect limitations/issues");
+ p->failed = false;
+ }
+
+ // Filter out error messages during format probing
+ pl_log_level_cap(gpu->log, PL_LOG_INFO);
+ bool formats_ok = gl_setup_formats(gpu);
+ pl_log_level_cap(gpu->log, PL_LOG_NONE);
+ if (!formats_ok)
+ goto error;
+
+ return pl_gpu_finalize(gpu);
+
+error:
+ gl_gpu_destroy(gpu);
+ return NULL;
+}
+
+void gl_buf_destroy(pl_gpu gpu, pl_buf buf)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT()) {
+ PL_ERR(gpu, "Failed uninitializing buffer, leaking resources!");
+ return;
+ }
+
+ struct pl_buf_gl *buf_gl = PL_PRIV(buf);
+ if (buf_gl->fence)
+ gl->DeleteSync(buf_gl->fence);
+
+ if (buf_gl->mapped) {
+ gl->BindBuffer(GL_COPY_WRITE_BUFFER, buf_gl->buffer);
+ gl->UnmapBuffer(GL_COPY_WRITE_BUFFER);
+ gl->BindBuffer(GL_COPY_WRITE_BUFFER, 0);
+ }
+
+ gl->DeleteBuffers(1, &buf_gl->buffer);
+ gl_check_err(gpu, "gl_buf_destroy");
+ RELEASE_CURRENT();
+ pl_free((void *) buf);
+}
+
+pl_buf gl_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return NULL;
+
+ struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_gl);
+ buf->params = *params;
+ buf->params.initial_data = NULL;
+
+ struct pl_gl *p = PL_PRIV(gpu);
+ struct pl_buf_gl *buf_gl = PL_PRIV(buf);
+ buf_gl->id = ++p->buf_id;
+
+ // Just use this since the generic GL_BUFFER doesn't work
+ GLenum target = GL_ARRAY_BUFFER;
+ const void *data = params->initial_data;
+ size_t total_size = params->size;
+ bool import = false;
+
+ if (params->import_handle == PL_HANDLE_HOST_PTR) {
+ const struct pl_shared_mem *shmem = &params->shared_mem;
+ target = GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD;
+
+ data = shmem->handle.ptr;
+ buf_gl->offset = shmem->offset;
+ total_size = shmem->size;
+ import = true;
+
+ if (params->host_mapped)
+ buf->data = (uint8_t *) data + buf_gl->offset;
+
+ if (buf_gl->offset > 0 && params->drawable) {
+ PL_ERR(gpu, "Cannot combine non-aligned host pointer imports with "
+ "drawable (vertex) buffers! This is a design limitation, "
+ "open an issue if you absolutely need this.");
+ goto error;
+ }
+ }
+
+ gl->GenBuffers(1, &buf_gl->buffer);
+ gl->BindBuffer(target, buf_gl->buffer);
+
+ if (gl_test_ext(gpu, "GL_ARB_buffer_storage", 44, 0) && !import) {
+
+ GLbitfield mapflags = 0, storflags = 0;
+ if (params->host_writable)
+ storflags |= GL_DYNAMIC_STORAGE_BIT;
+ if (params->host_mapped) {
+ mapflags |= GL_MAP_READ_BIT | GL_MAP_WRITE_BIT |
+ GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;
+ }
+ if (params->memory_type == PL_BUF_MEM_HOST)
+ storflags |= GL_CLIENT_STORAGE_BIT; // hopefully this works
+
+ gl->BufferStorage(target, total_size, data, storflags | mapflags);
+
+ if (params->host_mapped) {
+ buf_gl->mapped = true;
+ buf->data = gl->MapBufferRange(target, buf_gl->offset, params->size,
+ mapflags);
+ if (!buf->data) {
+ gl->BindBuffer(target, 0);
+ if (!gl_check_err(gpu, "gl_buf_create: map"))
+ PL_ERR(gpu, "Failed mapping buffer: unknown reason");
+ goto error;
+ }
+ }
+
+ } else {
+
+ // Make a random guess based on arbitrary criteria we can't know
+ GLenum hint = GL_STREAM_DRAW;
+ if (params->initial_data && !params->host_writable && !params->host_mapped)
+ hint = GL_STATIC_DRAW;
+ if (params->host_readable && !params->host_writable && !params->host_mapped)
+ hint = GL_STREAM_READ;
+ if (params->storable)
+ hint = GL_DYNAMIC_COPY;
+
+ gl->BufferData(target, total_size, data, hint);
+
+ if (import && gl->GetError() == GL_INVALID_OPERATION) {
+ PL_ERR(gpu, "Failed importing host pointer!");
+ goto error;
+ }
+
+ }
+
+ gl->BindBuffer(target, 0);
+ if (!gl_check_err(gpu, "gl_buf_create"))
+ goto error;
+
+ if (params->storable) {
+ buf_gl->barrier = GL_BUFFER_UPDATE_BARRIER_BIT | // for buf_copy etc.
+ GL_PIXEL_BUFFER_BARRIER_BIT | // for tex_upload
+ GL_SHADER_STORAGE_BARRIER_BIT;
+
+ if (params->host_mapped)
+ buf_gl->barrier |= GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT;
+ if (params->uniform)
+ buf_gl->barrier |= GL_UNIFORM_BARRIER_BIT;
+ if (params->drawable)
+ buf_gl->barrier |= GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT;
+ }
+
+ RELEASE_CURRENT();
+ return buf;
+
+error:
+ gl_buf_destroy(gpu, buf);
+ RELEASE_CURRENT();
+ return NULL;
+}
+
+bool gl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+
+ // Non-persistently mapped buffers are always implicitly reusable in OpenGL;
+ // the implementation will create more buffers under the hood if needed.
+ if (!buf->data)
+ return false;
+
+ if (!MAKE_CURRENT())
+ return true; // conservative guess
+
+ struct pl_buf_gl *buf_gl = PL_PRIV(buf);
+ if (buf_gl->fence) {
+ GLenum res = gl->ClientWaitSync(buf_gl->fence,
+ timeout ? GL_SYNC_FLUSH_COMMANDS_BIT : 0,
+ timeout);
+ if (res == GL_ALREADY_SIGNALED || res == GL_CONDITION_SATISFIED) {
+ gl->DeleteSync(buf_gl->fence);
+ buf_gl->fence = NULL;
+ }
+ }
+
+ gl_poll_callbacks(gpu);
+ RELEASE_CURRENT();
+ return !!buf_gl->fence;
+}
+
+void gl_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
+ const void *data, size_t size)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return;
+
+ struct pl_buf_gl *buf_gl = PL_PRIV(buf);
+ gl->BindBuffer(GL_ARRAY_BUFFER, buf_gl->buffer);
+ gl->BufferSubData(GL_ARRAY_BUFFER, buf_gl->offset + offset, size, data);
+ gl->BindBuffer(GL_ARRAY_BUFFER, 0);
+ gl_check_err(gpu, "gl_buf_write");
+ RELEASE_CURRENT();
+}
+
+bool gl_buf_read(pl_gpu gpu, pl_buf buf, size_t offset,
+ void *dest, size_t size)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return false;
+
+ struct pl_buf_gl *buf_gl = PL_PRIV(buf);
+ gl->BindBuffer(GL_ARRAY_BUFFER, buf_gl->buffer);
+ gl->GetBufferSubData(GL_ARRAY_BUFFER, buf_gl->offset + offset, size, dest);
+ gl->BindBuffer(GL_ARRAY_BUFFER, 0);
+ bool ok = gl_check_err(gpu, "gl_buf_read");
+ RELEASE_CURRENT();
+ return ok;
+}
+
+void gl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return;
+
+ struct pl_buf_gl *src_gl = PL_PRIV(src);
+ struct pl_buf_gl *dst_gl = PL_PRIV(dst);
+ gl->BindBuffer(GL_COPY_READ_BUFFER, src_gl->buffer);
+ gl->BindBuffer(GL_COPY_WRITE_BUFFER, dst_gl->buffer);
+ gl->CopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER,
+ src_gl->offset + src_offset,
+ dst_gl->offset + dst_offset, size);
+ gl_check_err(gpu, "gl_buf_copy");
+ RELEASE_CURRENT();
+}
+
+#define QUERY_OBJECT_NUM 8
+
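+// Ring buffer of GL timer query objects, so results can be collected
+// asynchronously without stalling on the most recent query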
+struct pl_timer_t {
+ GLuint query[QUERY_OBJECT_NUM];
+ int index_write; // next index to write to
+ int index_read; // next index to read from
+};
+
+static pl_timer gl_timer_create(pl_gpu gpu)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_gl *p = PL_PRIV(gpu);
+ if (!p->has_queries || !MAKE_CURRENT())
+ return NULL;
+
+ pl_timer timer = pl_zalloc_ptr(NULL, timer);
+ gl->GenQueries(QUERY_OBJECT_NUM, timer->query);
+ RELEASE_CURRENT();
+ return timer;
+}
+
+static void gl_timer_destroy(pl_gpu gpu, pl_timer timer)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT()) {
+ PL_ERR(gpu, "Failed uninitializing timer, leaking resources!");
+ return;
+ }
+
+ gl->DeleteQueries(QUERY_OBJECT_NUM, timer->query);
+ gl_check_err(gpu, "gl_timer_destroy");
+ RELEASE_CURRENT();
+ pl_free(timer);
+}
+
+static uint64_t gl_timer_query(pl_gpu gpu, pl_timer timer)
+{
+ if (timer->index_read == timer->index_write)
+ return 0; // no more unprocessed results
+
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return 0;
+
+ uint64_t res = 0;
+ GLuint query = timer->query[timer->index_read];
+ int avail = 0;
+ gl->GetQueryObjectiv(query, GL_QUERY_RESULT_AVAILABLE, &avail);
+ if (!avail)
+ goto done;
+ gl->GetQueryObjectui64v(query, GL_QUERY_RESULT, &res);
+
+ timer->index_read = (timer->index_read + 1) % QUERY_OBJECT_NUM;
+ // fall through
+
+done:
+ RELEASE_CURRENT();
+ return res;
+}
+
+void gl_timer_begin(pl_gpu gpu, pl_timer timer)
+{
+ if (!timer)
+ return;
+
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ gl->BeginQuery(GL_TIME_ELAPSED, timer->query[timer->index_write]);
+}
+
+void gl_timer_end(pl_gpu gpu, pl_timer timer)
+{
+ if (!timer)
+ return;
+
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ gl->EndQuery(GL_TIME_ELAPSED);
+
+ timer->index_write = (timer->index_write + 1) % QUERY_OBJECT_NUM;
+ if (timer->index_write == timer->index_read) {
+ // forcibly drop the least recent result to make space
+ timer->index_read = (timer->index_read + 1) % QUERY_OBJECT_NUM;
+ }
+}
+
+static void gl_gpu_flush(pl_gpu gpu)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return;
+
+ gl->Flush();
+ gl_check_err(gpu, "gl_gpu_flush");
+ RELEASE_CURRENT();
+}
+
+static void gl_gpu_finish(pl_gpu gpu)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return;
+
+ gl->Finish();
+ gl_check_err(gpu, "gl_gpu_finish");
+ RELEASE_CURRENT();
+}
+
+static bool gl_gpu_is_failed(pl_gpu gpu)
+{
+ struct pl_gl *gl = PL_PRIV(gpu);
+ return gl->failed;
+}
+
+static const struct pl_gpu_fns pl_fns_gl = {
+ .destroy = gl_gpu_destroy,
+ .tex_create = gl_tex_create,
+ .tex_destroy = gl_tex_destroy,
+ .tex_invalidate = gl_tex_invalidate,
+ .tex_clear_ex = gl_tex_clear_ex,
+ .tex_blit = gl_tex_blit,
+ .tex_upload = gl_tex_upload,
+ .tex_download = gl_tex_download,
+ .buf_create = gl_buf_create,
+ .buf_destroy = gl_buf_destroy,
+ .buf_write = gl_buf_write,
+ .buf_read = gl_buf_read,
+ .buf_copy = gl_buf_copy,
+ .buf_poll = gl_buf_poll,
+ .desc_namespace = gl_desc_namespace,
+ .pass_create = gl_pass_create,
+ .pass_destroy = gl_pass_destroy,
+ .pass_run = gl_pass_run,
+ .timer_create = gl_timer_create,
+ .timer_destroy = gl_timer_destroy,
+ .timer_query = gl_timer_query,
+ .gpu_flush = gl_gpu_flush,
+ .gpu_finish = gl_gpu_finish,
+ .gpu_is_failed = gl_gpu_is_failed,
+};
diff --git a/src/opengl/gpu.h b/src/opengl/gpu.h
new file mode 100644
index 0000000..50741d0
--- /dev/null
+++ b/src/opengl/gpu.h
@@ -0,0 +1,141 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../gpu.h"
+#include "common.h"
+
+// Thread safety: Unsafe, same as pl_gpu_destroy
+pl_gpu pl_gpu_create_gl(pl_log log, pl_opengl gl, const struct pl_opengl_params *params);
+
+// --- pl_gpu internal structs and functions
+
+struct pl_gl {
+ struct pl_gpu_fns impl;
+ pl_opengl gl;
+ bool failed;
+
+ // For import/export
+ EGLDisplay egl_dpy;
+ EGLContext egl_ctx;
+ bool egl_storage;
+#ifdef PL_HAVE_UNIX
+ // List of formats supported by EGL_EXT_image_dma_buf_import
+ PL_ARRAY(EGLint) egl_formats;
+#endif
+
+ // Sync objects and associated callbacks
+ PL_ARRAY(struct gl_cb) callbacks;
+
+ // Incrementing counters to keep track of object uniqueness
+ int buf_id;
+
+ // Cached capabilities
+ int gl_ver;
+ int gles_ver;
+ bool has_storage;
+ bool has_invalidate_fb;
+ bool has_invalidate_tex;
+ bool has_vao;
+ bool has_queries;
+ bool has_modifiers;
+ bool has_readback;
+ bool has_egl_storage;
+ bool has_egl_import;
+ int gather_comps;
+};
+
+static inline const gl_funcs *gl_funcs_get(pl_gpu gpu)
+{
+ struct pl_gl *p = PL_PRIV(gpu);
+ struct gl_ctx *glctx = PL_PRIV(p->gl);
+ return &glctx->func;
+}
+
+void gl_timer_begin(pl_gpu gpu, pl_timer timer);
+void gl_timer_end(pl_gpu gpu, pl_timer timer);
+
+static inline bool _make_current(pl_gpu gpu)
+{
+ struct pl_gl *p = PL_PRIV(gpu);
+ if (!gl_make_current(p->gl)) {
+ p->failed = true;
+ return false;
+ }
+
+ return true;
+}
+
+static inline void _release_current(pl_gpu gpu)
+{
+ struct pl_gl *p = PL_PRIV(gpu);
+ gl_release_current(p->gl);
+}
+
+#define MAKE_CURRENT() _make_current(gpu)
+#define RELEASE_CURRENT() _release_current(gpu)
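+
+// Every backend entry point follows the same pattern (sketch for
+// illustration; gl_foo is a hypothetical example):
+//
+//     static void gl_foo(pl_gpu gpu)
+//     {
+//         const gl_funcs *gl = gl_funcs_get(gpu);
+//         if (!MAKE_CURRENT())
+//             return;
+//         // ... GL calls ...
+//         gl_check_err(gpu, "gl_foo");
+//         RELEASE_CURRENT();
+//     }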
+
+struct pl_tex_gl {
+ GLenum target;
+ GLuint texture;
+ bool wrapped_tex;
+ GLuint fbo; // or 0
+ bool wrapped_fb;
+ GLbitfield barrier;
+
+ // GL format fields
+ GLenum format;
+ GLint iformat;
+ GLenum type;
+
+ // For imported/exported textures
+ EGLImageKHR image;
+ int fd;
+};
+
+pl_tex gl_tex_create(pl_gpu, const struct pl_tex_params *);
+void gl_tex_destroy(pl_gpu, pl_tex);
+void gl_tex_invalidate(pl_gpu, pl_tex);
+void gl_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color);
+void gl_tex_blit(pl_gpu, const struct pl_tex_blit_params *);
+bool gl_tex_upload(pl_gpu, const struct pl_tex_transfer_params *);
+bool gl_tex_download(pl_gpu, const struct pl_tex_transfer_params *);
+
+struct pl_buf_gl {
+ uint64_t id; // unique per buffer
+ GLuint buffer;
+ size_t offset;
+ GLsync fence;
+ GLbitfield barrier;
+ bool mapped;
+};
+
+pl_buf gl_buf_create(pl_gpu, const struct pl_buf_params *);
+void gl_buf_destroy(pl_gpu, pl_buf);
+void gl_buf_write(pl_gpu, pl_buf, size_t offset, const void *src, size_t size);
+bool gl_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size);
+void gl_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size);
+bool gl_buf_poll(pl_gpu, pl_buf, uint64_t timeout);
+
+struct pl_pass_gl;
+int gl_desc_namespace(pl_gpu, enum pl_desc_type type);
+pl_pass gl_pass_create(pl_gpu, const struct pl_pass_params *);
+void gl_pass_destroy(pl_gpu, pl_pass);
+void gl_pass_run(pl_gpu, const struct pl_pass_run_params *);
diff --git a/src/opengl/gpu_pass.c b/src/opengl/gpu_pass.c
new file mode 100644
index 0000000..58e69a5
--- /dev/null
+++ b/src/opengl/gpu_pass.c
@@ -0,0 +1,707 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "cache.h"
+#include "formats.h"
+#include "utils.h"
+
+int gl_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+ return (int) type;
+}
+
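+// The cached program blob is a gl_cache_header immediately followed by the
+// raw binary returned by glGetProgramBinary():
+//
+//     [ GLenum format | program binary ... ]
+//
+// It is written by gl_pass_create() and consumed by load_cached_program().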
+struct gl_cache_header {
+ GLenum format;
+};
+
+static GLuint load_cached_program(pl_gpu gpu, pl_cache cache, pl_cache_obj *obj)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!gl_test_ext(gpu, "GL_ARB_get_program_binary", 41, 30))
+ return 0;
+
+ if (!pl_cache_get(cache, obj))
+ return 0;
+
+ if (obj->size < sizeof(struct gl_cache_header))
+ return 0;
+
+ GLuint prog = gl->CreateProgram();
+ if (!gl_check_err(gpu, "load_cached_program: glCreateProgram"))
+ return 0;
+
+ struct gl_cache_header *header = (struct gl_cache_header *) obj->data;
+ pl_str rest = (pl_str) { obj->data, obj->size };
+ rest = pl_str_drop(rest, sizeof(*header));
+ gl->ProgramBinary(prog, header->format, rest.buf, rest.len);
+    gl->GetError(); // discard any harmless error generated by ProgramBinary
+
+ GLint status = 0;
+ gl->GetProgramiv(prog, GL_LINK_STATUS, &status);
+ if (status)
+ return prog;
+
+ gl->DeleteProgram(prog);
+ gl_check_err(gpu, "load_cached_program: glProgramBinary");
+ return 0;
+}
+
+static enum pl_log_level gl_log_level(GLint status, GLint log_length)
+{
+ if (!status) {
+ return PL_LOG_ERR;
+ } else if (log_length > 0) {
+ return PL_LOG_INFO;
+ } else {
+ return PL_LOG_DEBUG;
+ }
+}
+
+static bool gl_attach_shader(pl_gpu gpu, GLuint program, GLenum type, const char *src)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ GLuint shader = gl->CreateShader(type);
+ gl->ShaderSource(shader, 1, &src, NULL);
+ gl->CompileShader(shader);
+
+ GLint status = 0;
+ gl->GetShaderiv(shader, GL_COMPILE_STATUS, &status);
+ GLint log_length = 0;
+ gl->GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length);
+
+ enum pl_log_level level = gl_log_level(status, log_length);
+ if (pl_msg_test(gpu->log, level)) {
+ GLchar *logstr = pl_zalloc(NULL, log_length + 1);
+ gl->GetShaderInfoLog(shader, log_length, NULL, logstr);
+ PL_MSG(gpu, level, "shader compile log (status=%d): %s", status, logstr);
+ pl_free(logstr);
+ }
+
+ if (!status || !gl_check_err(gpu, "gl_attach_shader"))
+ goto error;
+
+ gl->AttachShader(program, shader);
+ gl->DeleteShader(shader);
+ return true;
+
+error:
+ gl->DeleteShader(shader);
+ return false;
+}
+
+static GLuint gl_compile_program(pl_gpu gpu, const struct pl_pass_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ GLuint prog = gl->CreateProgram();
+ bool ok = true;
+
+ switch (params->type) {
+ case PL_PASS_COMPUTE:
+ ok &= gl_attach_shader(gpu, prog, GL_COMPUTE_SHADER, params->glsl_shader);
+ break;
+ case PL_PASS_RASTER:
+ ok &= gl_attach_shader(gpu, prog, GL_VERTEX_SHADER, params->vertex_shader);
+ ok &= gl_attach_shader(gpu, prog, GL_FRAGMENT_SHADER, params->glsl_shader);
+ for (int i = 0; i < params->num_vertex_attribs; i++)
+ gl->BindAttribLocation(prog, i, params->vertex_attribs[i].name);
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ if (!ok || !gl_check_err(gpu, "gl_compile_program: attach shader"))
+ goto error;
+
+ gl->LinkProgram(prog);
+ GLint status = 0;
+ gl->GetProgramiv(prog, GL_LINK_STATUS, &status);
+ GLint log_length = 0;
+ gl->GetProgramiv(prog, GL_INFO_LOG_LENGTH, &log_length);
+
+ enum pl_log_level level = gl_log_level(status, log_length);
+ if (pl_msg_test(gpu->log, level)) {
+ GLchar *logstr = pl_zalloc(NULL, log_length + 1);
+ gl->GetProgramInfoLog(prog, log_length, NULL, logstr);
+ PL_MSG(gpu, level, "shader link log (status=%d): %s", status, logstr);
+ pl_free(logstr);
+ }
+
+ if (!gl_check_err(gpu, "gl_compile_program: link program"))
+ goto error;
+
+ return prog;
+
+error:
+ gl->DeleteProgram(prog);
+ PL_ERR(gpu, "Failed compiling/linking GLSL program");
+ return 0;
+}
+
+// For pl_pass.priv
+struct pl_pass_gl {
+ GLuint program;
+ GLuint vao; // the VAO object
+ uint64_t vao_id; // buf_gl.id of VAO
+ size_t vao_offset; // VBO offset of VAO
+ GLuint buffer; // VBO for raw vertex pointers
+ GLuint index_buffer;
+ GLint *var_locs;
+};
+
+void gl_pass_destroy(pl_gpu gpu, pl_pass pass)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT()) {
+ PL_ERR(gpu, "Failed uninitializing pass, leaking resources!");
+ return;
+ }
+
+ struct pl_pass_gl *pass_gl = PL_PRIV(pass);
+ if (pass_gl->vao)
+ gl->DeleteVertexArrays(1, &pass_gl->vao);
+ gl->DeleteBuffers(1, &pass_gl->index_buffer);
+ gl->DeleteBuffers(1, &pass_gl->buffer);
+ gl->DeleteProgram(pass_gl->program);
+
+ gl_check_err(gpu, "gl_pass_destroy");
+ RELEASE_CURRENT();
+ pl_free((void *) pass);
+}
+
+static void gl_update_va(pl_gpu gpu, pl_pass pass, size_t vbo_offset)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ for (int i = 0; i < pass->params.num_vertex_attribs; i++) {
+ const struct pl_vertex_attrib *va = &pass->params.vertex_attribs[i];
+ const struct gl_format **glfmtp = PL_PRIV(va->fmt);
+ const struct gl_format *glfmt = *glfmtp;
+
+ bool norm = false;
+ switch (va->fmt->type) {
+ case PL_FMT_UNORM:
+ case PL_FMT_SNORM:
+ norm = true;
+ break;
+
+ case PL_FMT_UNKNOWN:
+ case PL_FMT_FLOAT:
+ case PL_FMT_UINT:
+ case PL_FMT_SINT:
+ break;
+ case PL_FMT_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ gl->EnableVertexAttribArray(i);
+ gl->VertexAttribPointer(i, va->fmt->num_components, glfmt->type, norm,
+ pass->params.vertex_stride,
+ (void *) (va->offset + vbo_offset));
+ }
+}
+
+pl_pass gl_pass_create(pl_gpu gpu, const struct pl_pass_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return NULL;
+
+ struct pl_gl *p = PL_PRIV(gpu);
+ struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_gl);
+ struct pl_pass_gl *pass_gl = PL_PRIV(pass);
+ pl_cache cache = pl_gpu_cache(gpu);
+ pass->params = pl_pass_params_copy(pass, params);
+
+ pl_cache_obj obj = { .key = CACHE_KEY_GL_PROG };
+ if (cache) {
+ pl_hash_merge(&obj.key, pl_str0_hash(params->glsl_shader));
+ if (params->type == PL_PASS_RASTER)
+ pl_hash_merge(&obj.key, pl_str0_hash(params->vertex_shader));
+ }
+
+ // Load/Compile program
+ if ((pass_gl->program = load_cached_program(gpu, cache, &obj))) {
+ PL_DEBUG(gpu, "Using cached GL program");
+ } else {
+ pl_clock_t start = pl_clock_now();
+ pass_gl->program = gl_compile_program(gpu, params);
+ pl_log_cpu_time(gpu->log, start, pl_clock_now(), "compiling shader");
+ }
+
+ if (!pass_gl->program)
+ goto error;
+
+ // Update program cache if possible
+ if (cache && gl_test_ext(gpu, "GL_ARB_get_program_binary", 41, 30)) {
+ GLint buf_size = 0;
+ gl->GetProgramiv(pass_gl->program, GL_PROGRAM_BINARY_LENGTH, &buf_size);
+ if (buf_size > 0) {
+ buf_size += sizeof(struct gl_cache_header);
+ pl_cache_obj_resize(NULL, &obj, buf_size);
+ struct gl_cache_header *header = obj.data;
+ void *buffer = &header[1];
+ GLsizei binary_size = 0;
+ gl->GetProgramBinary(pass_gl->program, buf_size, &binary_size,
+ &header->format, buffer);
+ bool ok = gl_check_err(gpu, "gl_pass_create: get program binary");
+ if (ok) {
+ obj.size = sizeof(*header) + binary_size;
+ pl_assert(obj.size <= buf_size);
+ pl_cache_set(cache, &obj);
+ }
+ }
+ }
+
+ gl->UseProgram(pass_gl->program);
+ pass_gl->var_locs = pl_calloc(pass, params->num_variables, sizeof(GLint));
+
+ for (int i = 0; i < params->num_variables; i++) {
+ pass_gl->var_locs[i] = gl->GetUniformLocation(pass_gl->program,
+ params->variables[i].name);
+
+ // Due to OpenGL API restrictions, we need to ensure that this is a
+ // variable type we can actually *update*. Fortunately, this is easily
+ // checked by virtue of the fact that all legal combinations of
+ // parameters will have a valid GLSL type name
+ if (!pl_var_glsl_type_name(params->variables[i])) {
+ gl->UseProgram(0);
+ PL_ERR(gpu, "Input variable '%s' does not match any known type!",
+ params->variables[i].name);
+ goto error;
+ }
+ }
+
+ for (int i = 0; i < params->num_descriptors; i++) {
+ const struct pl_desc *desc = &params->descriptors[i];
+ switch (desc->type) {
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG: {
+ // For compatibility with older OpenGL, we need to explicitly
+ // update the texture/image unit bindings after creating the shader
+ // program, since specifying it directly requires GLSL 4.20+
+ GLint loc = gl->GetUniformLocation(pass_gl->program, desc->name);
+ gl->Uniform1i(loc, desc->binding);
+ break;
+ }
+ case PL_DESC_BUF_UNIFORM: {
+ GLuint idx = gl->GetUniformBlockIndex(pass_gl->program, desc->name);
+ gl->UniformBlockBinding(pass_gl->program, idx, desc->binding);
+ break;
+ }
+ case PL_DESC_BUF_STORAGE: {
+ GLuint idx = gl->GetProgramResourceIndex(pass_gl->program,
+ GL_SHADER_STORAGE_BLOCK,
+ desc->name);
+ gl->ShaderStorageBlockBinding(pass_gl->program, idx, desc->binding);
+ break;
+ }
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ assert(!"unimplemented"); // TODO
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+ }
+
+ gl->UseProgram(0);
+
+ // Initialize the VAO and single vertex buffer
+ gl->GenBuffers(1, &pass_gl->buffer);
+ if (p->has_vao) {
+ gl->GenVertexArrays(1, &pass_gl->vao);
+ gl->BindBuffer(GL_ARRAY_BUFFER, pass_gl->buffer);
+ gl->BindVertexArray(pass_gl->vao);
+ gl_update_va(gpu, pass, 0);
+ gl->BindVertexArray(0);
+ gl->BindBuffer(GL_ARRAY_BUFFER, 0);
+ }
+
+ if (!gl_check_err(gpu, "gl_pass_create"))
+ goto error;
+
+ pl_cache_obj_free(&obj);
+ RELEASE_CURRENT();
+ return pass;
+
+error:
+ PL_ERR(gpu, "Failed creating pass");
+ pl_cache_obj_free(&obj);
+ gl_pass_destroy(gpu, pass);
+ RELEASE_CURRENT();
+ return NULL;
+}
+
+static void update_var(pl_gpu gpu, pl_pass pass,
+ const struct pl_var_update *vu)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_pass_gl *pass_gl = PL_PRIV(pass);
+ const struct pl_var *var = &pass->params.variables[vu->index];
+ GLint loc = pass_gl->var_locs[vu->index];
+
+ switch (var->type) {
+ case PL_VAR_SINT: {
+ const int *i = vu->data;
+ pl_assert(var->dim_m == 1);
+ switch (var->dim_v) {
+ case 1: gl->Uniform1iv(loc, var->dim_a, i); break;
+ case 2: gl->Uniform2iv(loc, var->dim_a, i); break;
+ case 3: gl->Uniform3iv(loc, var->dim_a, i); break;
+ case 4: gl->Uniform4iv(loc, var->dim_a, i); break;
+ default: pl_unreachable();
+ }
+ return;
+ }
+ case PL_VAR_UINT: {
+ const unsigned int *u = vu->data;
+ pl_assert(var->dim_m == 1);
+ switch (var->dim_v) {
+ case 1: gl->Uniform1uiv(loc, var->dim_a, u); break;
+ case 2: gl->Uniform2uiv(loc, var->dim_a, u); break;
+ case 3: gl->Uniform3uiv(loc, var->dim_a, u); break;
+ case 4: gl->Uniform4uiv(loc, var->dim_a, u); break;
+ default: pl_unreachable();
+ }
+ return;
+ }
+ case PL_VAR_FLOAT: {
+ const float *f = vu->data;
+ if (var->dim_m == 1) {
+ switch (var->dim_v) {
+ case 1: gl->Uniform1fv(loc, var->dim_a, f); break;
+ case 2: gl->Uniform2fv(loc, var->dim_a, f); break;
+ case 3: gl->Uniform3fv(loc, var->dim_a, f); break;
+ case 4: gl->Uniform4fv(loc, var->dim_a, f); break;
+ default: pl_unreachable();
+ }
+ } else if (var->dim_m == 2 && var->dim_v == 2) {
+ gl->UniformMatrix2fv(loc, var->dim_a, GL_FALSE, f);
+ } else if (var->dim_m == 3 && var->dim_v == 3) {
+ gl->UniformMatrix3fv(loc, var->dim_a, GL_FALSE, f);
+ } else if (var->dim_m == 4 && var->dim_v == 4) {
+ gl->UniformMatrix4fv(loc, var->dim_a, GL_FALSE, f);
+ } else if (var->dim_m == 2 && var->dim_v == 3) {
+ gl->UniformMatrix2x3fv(loc, var->dim_a, GL_FALSE, f);
+ } else if (var->dim_m == 3 && var->dim_v == 2) {
+ gl->UniformMatrix3x2fv(loc, var->dim_a, GL_FALSE, f);
+ } else if (var->dim_m == 2 && var->dim_v == 4) {
+ gl->UniformMatrix2x4fv(loc, var->dim_a, GL_FALSE, f);
+ } else if (var->dim_m == 4 && var->dim_v == 2) {
+ gl->UniformMatrix4x2fv(loc, var->dim_a, GL_FALSE, f);
+ } else if (var->dim_m == 3 && var->dim_v == 4) {
+ gl->UniformMatrix3x4fv(loc, var->dim_a, GL_FALSE, f);
+ } else if (var->dim_m == 4 && var->dim_v == 3) {
+ gl->UniformMatrix4x3fv(loc, var->dim_a, GL_FALSE, f);
+ } else {
+ pl_unreachable();
+ }
+ return;
+ }
+
+ case PL_VAR_INVALID:
+ case PL_VAR_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static void update_desc(pl_gpu gpu, pl_pass pass, int index,
+ const struct pl_desc_binding *db)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ const struct pl_desc *desc = &pass->params.descriptors[index];
+
+ static const GLenum access[] = {
+ [PL_DESC_ACCESS_READWRITE] = GL_READ_WRITE,
+ [PL_DESC_ACCESS_READONLY] = GL_READ_ONLY,
+ [PL_DESC_ACCESS_WRITEONLY] = GL_WRITE_ONLY,
+ };
+
+ static const GLint wraps[PL_TEX_ADDRESS_MODE_COUNT] = {
+ [PL_TEX_ADDRESS_CLAMP] = GL_CLAMP_TO_EDGE,
+ [PL_TEX_ADDRESS_REPEAT] = GL_REPEAT,
+ [PL_TEX_ADDRESS_MIRROR] = GL_MIRRORED_REPEAT,
+ };
+
+ static const GLint filters[PL_TEX_SAMPLE_MODE_COUNT] = {
+ [PL_TEX_SAMPLE_NEAREST] = GL_NEAREST,
+ [PL_TEX_SAMPLE_LINEAR] = GL_LINEAR,
+ };
+
+ switch (desc->type) {
+ case PL_DESC_SAMPLED_TEX: {
+ pl_tex tex = db->object;
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ gl->ActiveTexture(GL_TEXTURE0 + desc->binding);
+ gl->BindTexture(tex_gl->target, tex_gl->texture);
+
+ GLint filter = filters[db->sample_mode];
+ GLint wrap = wraps[db->address_mode];
+ gl->TexParameteri(tex_gl->target, GL_TEXTURE_MIN_FILTER, filter);
+ gl->TexParameteri(tex_gl->target, GL_TEXTURE_MAG_FILTER, filter);
+ switch (pl_tex_params_dimension(tex->params)) {
+ case 3: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_R, wrap); // fall through
+ case 2: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_T, wrap); // fall through
+ case 1: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_S, wrap); break;
+ }
+ return;
+ }
+ case PL_DESC_STORAGE_IMG: {
+ pl_tex tex = db->object;
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ gl->BindImageTexture(desc->binding, tex_gl->texture, 0, GL_FALSE, 0,
+ access[desc->access], tex_gl->iformat);
+ return;
+ }
+ case PL_DESC_BUF_UNIFORM: {
+ pl_buf buf = db->object;
+ struct pl_buf_gl *buf_gl = PL_PRIV(buf);
+ gl->BindBufferRange(GL_UNIFORM_BUFFER, desc->binding, buf_gl->buffer,
+ buf_gl->offset, buf->params.size);
+ return;
+ }
+ case PL_DESC_BUF_STORAGE: {
+ pl_buf buf = db->object;
+ struct pl_buf_gl *buf_gl = PL_PRIV(buf);
+ gl->BindBufferRange(GL_SHADER_STORAGE_BUFFER, desc->binding, buf_gl->buffer,
+ buf_gl->offset, buf->params.size);
+ return;
+ }
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ assert(!"unimplemented"); // TODO
+
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static void unbind_desc(pl_gpu gpu, pl_pass pass, int index,
+ const struct pl_desc_binding *db)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ const struct pl_desc *desc = &pass->params.descriptors[index];
+
+ switch (desc->type) {
+ case PL_DESC_SAMPLED_TEX: {
+ pl_tex tex = db->object;
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ gl->ActiveTexture(GL_TEXTURE0 + desc->binding);
+ gl->BindTexture(tex_gl->target, 0);
+ return;
+ }
+ case PL_DESC_STORAGE_IMG: {
+ pl_tex tex = db->object;
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ gl->BindImageTexture(desc->binding, 0, 0, GL_FALSE, 0,
+ GL_WRITE_ONLY, GL_R32F);
+ if (desc->access != PL_DESC_ACCESS_READONLY)
+ gl->MemoryBarrier(tex_gl->barrier);
+ return;
+ }
+ case PL_DESC_BUF_UNIFORM:
+ gl->BindBufferBase(GL_UNIFORM_BUFFER, desc->binding, 0);
+ return;
+ case PL_DESC_BUF_STORAGE: {
+ pl_buf buf = db->object;
+ struct pl_buf_gl *buf_gl = PL_PRIV(buf);
+ gl->BindBufferBase(GL_SHADER_STORAGE_BUFFER, desc->binding, 0);
+ if (desc->access != PL_DESC_ACCESS_READONLY)
+ gl->MemoryBarrier(buf_gl->barrier);
+ return;
+ }
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ assert(!"unimplemented"); // TODO
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+void gl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return;
+
+ pl_pass pass = params->pass;
+ struct pl_pass_gl *pass_gl = PL_PRIV(pass);
+ struct pl_gl *p = PL_PRIV(gpu);
+
+ gl->UseProgram(pass_gl->program);
+
+ for (int i = 0; i < params->num_var_updates; i++)
+ update_var(gpu, pass, &params->var_updates[i]);
+ for (int i = 0; i < pass->params.num_descriptors; i++)
+ update_desc(gpu, pass, i, &params->desc_bindings[i]);
+ gl->ActiveTexture(GL_TEXTURE0);
+
+ if (!gl_check_err(gpu, "gl_pass_run: updating uniforms")) {
+ RELEASE_CURRENT();
+ return;
+ }
+
+ switch (pass->params.type) {
+ case PL_PASS_RASTER: {
+ struct pl_tex_gl *target_gl = PL_PRIV(params->target);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, target_gl->fbo);
+ if (!pass->params.load_target && p->has_invalidate_fb) {
+ GLenum fb = target_gl->fbo ? GL_COLOR_ATTACHMENT0 : GL_COLOR;
+ gl->InvalidateFramebuffer(GL_DRAW_FRAMEBUFFER, 1, &fb);
+ }
+
+ gl->Viewport(params->viewport.x0, params->viewport.y0,
+ pl_rect_w(params->viewport), pl_rect_h(params->viewport));
+ gl->Scissor(params->scissors.x0, params->scissors.y0,
+ pl_rect_w(params->scissors), pl_rect_h(params->scissors));
+ gl->Enable(GL_SCISSOR_TEST);
+ gl->Disable(GL_DEPTH_TEST);
+ gl->Disable(GL_CULL_FACE);
+ gl_check_err(gpu, "gl_pass_run: enabling viewport/scissor");
+
+ const struct pl_blend_params *blend = pass->params.blend_params;
+ if (blend) {
+ static const GLenum map_blend[] = {
+ [PL_BLEND_ZERO] = GL_ZERO,
+ [PL_BLEND_ONE] = GL_ONE,
+ [PL_BLEND_SRC_ALPHA] = GL_SRC_ALPHA,
+ [PL_BLEND_ONE_MINUS_SRC_ALPHA] = GL_ONE_MINUS_SRC_ALPHA,
+ };
+
+ gl->BlendFuncSeparate(map_blend[blend->src_rgb],
+ map_blend[blend->dst_rgb],
+ map_blend[blend->src_alpha],
+ map_blend[blend->dst_alpha]);
+ gl->Enable(GL_BLEND);
+ gl_check_err(gpu, "gl_pass_run: enabling blend");
+ }
+
+ // Update VBO and VAO
+ pl_buf vert = params->vertex_buf;
+ struct pl_buf_gl *vert_gl = vert ? PL_PRIV(vert) : NULL;
+ gl->BindBuffer(GL_ARRAY_BUFFER, vert ? vert_gl->buffer : pass_gl->buffer);
+
+ if (!vert) {
+ // Update the buffer directly. In theory we could also do a memcmp
+ // cache here to avoid unnecessary updates.
+ gl->BufferData(GL_ARRAY_BUFFER, pl_vertex_buf_size(params),
+ params->vertex_data, GL_STREAM_DRAW);
+ }
+
+ if (pass_gl->vao)
+ gl->BindVertexArray(pass_gl->vao);
+
+ uint64_t vert_id = vert ? vert_gl->id : 0;
+ size_t vert_offset = vert ? params->buf_offset : 0;
+ if (!pass_gl->vao || pass_gl->vao_id != vert_id ||
+ pass_gl->vao_offset != vert_offset)
+ {
+ // We need to update the VAO when the buffer ID or offset changes
+ gl_update_va(gpu, pass, vert_offset);
+ pass_gl->vao_id = vert_id;
+ pass_gl->vao_offset = vert_offset;
+ }
+
+ gl_check_err(gpu, "gl_pass_run: update/bind vertex buffer");
+
+ static const GLenum map_prim[PL_PRIM_TYPE_COUNT] = {
+ [PL_PRIM_TRIANGLE_LIST] = GL_TRIANGLES,
+ [PL_PRIM_TRIANGLE_STRIP] = GL_TRIANGLE_STRIP,
+ };
+ GLenum mode = map_prim[pass->params.vertex_type];
+
+ gl_timer_begin(gpu, params->timer);
+
+ if (params->index_data) {
+
+ static const GLenum index_fmts[PL_INDEX_FORMAT_COUNT] = {
+ [PL_INDEX_UINT16] = GL_UNSIGNED_SHORT,
+ [PL_INDEX_UINT32] = GL_UNSIGNED_INT,
+ };
+
+ // Upload indices to temporary buffer object
+ if (!pass_gl->index_buffer)
+ gl->GenBuffers(1, &pass_gl->index_buffer); // lazily allocated
+ gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, pass_gl->index_buffer);
+ gl->BufferData(GL_ELEMENT_ARRAY_BUFFER, pl_index_buf_size(params),
+ params->index_data, GL_STREAM_DRAW);
+ gl->DrawElements(mode, params->vertex_count,
+ index_fmts[params->index_fmt], 0);
+ gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+
+ } else if (params->index_buf) {
+
+ // The pointer argument becomes the index buffer offset
+ struct pl_buf_gl *index_gl = PL_PRIV(params->index_buf);
+ gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_gl->buffer);
+ gl->DrawElements(mode, params->vertex_count, GL_UNSIGNED_SHORT,
+ (void *) params->index_offset);
+ gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+
+ } else {
+
+ // Note: the VBO offset is handled in the VAO
+ gl->DrawArrays(mode, 0, params->vertex_count);
+ }
+
+ gl_timer_end(gpu, params->timer);
+ gl_check_err(gpu, "gl_pass_run: drawing");
+
+ if (pass_gl->vao) {
+ gl->BindVertexArray(0);
+ } else {
+ for (int i = 0; i < pass->params.num_vertex_attribs; i++)
+ gl->DisableVertexAttribArray(i);
+ }
+
+ gl->BindBuffer(GL_ARRAY_BUFFER, 0);
+ gl->Disable(GL_SCISSOR_TEST);
+ gl->Disable(GL_BLEND);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ break;
+ }
+
+ case PL_PASS_COMPUTE:
+ gl_timer_begin(gpu, params->timer);
+ gl->DispatchCompute(params->compute_groups[0],
+ params->compute_groups[1],
+ params->compute_groups[2]);
+ gl_timer_end(gpu, params->timer);
+ break;
+
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ for (int i = 0; i < pass->params.num_descriptors; i++)
+ unbind_desc(gpu, pass, i, &params->desc_bindings[i]);
+ gl->ActiveTexture(GL_TEXTURE0);
+
+ gl->UseProgram(0);
+ gl_check_err(gpu, "gl_pass_run");
+ RELEASE_CURRENT();
+}
diff --git a/src/opengl/gpu_tex.c b/src/opengl/gpu_tex.c
new file mode 100644
index 0000000..02eda77
--- /dev/null
+++ b/src/opengl/gpu_tex.c
@@ -0,0 +1,1078 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "formats.h"
+#include "utils.h"
+
+#ifdef PL_HAVE_UNIX
+#include <unistd.h>
+#include <errno.h>
+#endif
+
+void gl_tex_destroy(pl_gpu gpu, pl_tex tex)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT()) {
+ PL_ERR(gpu, "Failed uninitializing texture, leaking resources!");
+ return;
+ }
+
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ if (tex_gl->fbo && !tex_gl->wrapped_fb)
+ gl->DeleteFramebuffers(1, &tex_gl->fbo);
+ if (tex_gl->image) {
+ struct pl_gl *p = PL_PRIV(gpu);
+ eglDestroyImageKHR(p->egl_dpy, tex_gl->image);
+ }
+ if (!tex_gl->wrapped_tex)
+ gl->DeleteTextures(1, &tex_gl->texture);
+
+#ifdef PL_HAVE_UNIX
+ if (tex_gl->fd != -1)
+ close(tex_gl->fd);
+#endif
+
+ gl_check_err(gpu, "gl_tex_destroy");
+ RELEASE_CURRENT();
+ pl_free((void *) tex);
+}
+
+static GLbitfield tex_barrier(pl_tex tex)
+{
+ GLbitfield barrier = 0;
+ const struct pl_tex_params *params = &tex->params;
+
+ if (params->sampleable)
+ barrier |= GL_TEXTURE_FETCH_BARRIER_BIT;
+ if (params->renderable || params->blit_src || params->blit_dst)
+ barrier |= GL_FRAMEBUFFER_BARRIER_BIT;
+ if (params->storable)
+ barrier |= GL_SHADER_IMAGE_ACCESS_BARRIER_BIT;
+ if (params->host_writable || params->host_readable)
+ barrier |= GL_TEXTURE_UPDATE_BARRIER_BIT;
+
+ return barrier;
+}
+
+#define ADD_ATTRIB(name, value) \
+ do { \
+ assert(num_attribs + 3 < PL_ARRAY_SIZE(attribs)); \
+ attribs[num_attribs++] = (name); \
+ attribs[num_attribs++] = (value); \
+ } while (0)
+
+#define ADD_DMABUF_PLANE_ATTRIBS(plane, fd, offset, stride) \
+ do { \
+ ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _FD_EXT, \
+ fd); \
+ ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _OFFSET_EXT, \
+ offset); \
+ ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _PITCH_EXT, \
+ stride); \
+ } while (0)
+
+#define ADD_DMABUF_PLANE_MODIFIERS(plane, mod) \
+ do { \
+ ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _MODIFIER_LO_EXT, \
+ (uint32_t) ((mod) & 0xFFFFFFFFlu)); \
+ ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _MODIFIER_HI_EXT, \
+ (uint32_t) (((mod) >> 32u) & 0xFFFFFFFFlu)); \
+ } while (0)
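+
+// e.g. a 64-bit modifier of 0x0100000000000001 is split into
+// MODIFIER_LO_EXT = 0x00000001 and MODIFIER_HI_EXT = 0x01000000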
+
+static bool gl_tex_import(pl_gpu gpu,
+ enum pl_handle_type handle_type,
+ const struct pl_shared_mem *shared_mem,
+ struct pl_tex_t *tex)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_gl *p = PL_PRIV(gpu);
+ if (!MAKE_CURRENT())
+ return false;
+
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ const struct pl_tex_params *params = &tex->params;
+
+ int attribs[20] = {};
+ int num_attribs = 0;
+ ADD_ATTRIB(EGL_WIDTH, params->w);
+ ADD_ATTRIB(EGL_HEIGHT, params->h);
+
+ switch (handle_type) {
+
+#ifdef PL_HAVE_UNIX
+ case PL_HANDLE_DMA_BUF:
+ if (shared_mem->handle.fd == -1) {
+ PL_ERR(gpu, "%s: invalid fd", __func__);
+ goto error;
+ }
+
+ tex_gl->fd = dup(shared_mem->handle.fd);
+ if (tex_gl->fd == -1) {
+ PL_ERR(gpu, "%s: cannot duplicate fd %d for importing: %s",
+ __func__, shared_mem->handle.fd, strerror(errno));
+ goto error;
+ }
+
+ ADD_ATTRIB(EGL_LINUX_DRM_FOURCC_EXT, params->format->fourcc);
+ ADD_DMABUF_PLANE_ATTRIBS(0, tex_gl->fd, shared_mem->offset,
+ PL_DEF(shared_mem->stride_w, params->w));
+ if (p->has_modifiers)
+ ADD_DMABUF_PLANE_MODIFIERS(0, shared_mem->drm_format_mod);
+
+ attribs[num_attribs] = EGL_NONE;
+
+ // EGL_LINUX_DMA_BUF_EXT requires EGL_NO_CONTEXT
+ tex_gl->image = eglCreateImageKHR(p->egl_dpy,
+ EGL_NO_CONTEXT,
+ EGL_LINUX_DMA_BUF_EXT,
+ (EGLClientBuffer) NULL,
+ attribs);
+
+ break;
+#else // !PL_HAVE_UNIX
+ case PL_HANDLE_DMA_BUF:
+ pl_unreachable();
+#endif
+
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_HOST_PTR:
+ case PL_HANDLE_FD:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ pl_unreachable();
+
+ }
+
+ if (!egl_check_err(gpu, "eglCreateImageKHR") || !tex_gl->image)
+ goto error;
+
+ // tex_gl->image should be already bound
+ if (p->has_egl_storage) {
+ gl->EGLImageTargetTexStorageEXT(GL_TEXTURE_2D, tex_gl->image, NULL);
+ } else {
+ gl->EGLImageTargetTexture2DOES(GL_TEXTURE_2D, tex_gl->image);
+ }
+ if (!egl_check_err(gpu, "EGLImageTargetTexture2DOES"))
+ goto error;
+
+ RELEASE_CURRENT();
+ return true;
+
+error:
+ PL_ERR(gpu, "Failed importing GL texture!");
+ RELEASE_CURRENT();
+ return false;
+}
+
+static EGLenum egl_from_gl_target(pl_gpu gpu, int target)
+{
+ switch(target) {
+ case GL_TEXTURE_2D: return EGL_GL_TEXTURE_2D;
+ case GL_TEXTURE_3D: return EGL_GL_TEXTURE_3D;
+ default:
+ PL_ERR(gpu, "%s: unsupported texture target 0x%x", __func__, target);
+ return 0;
+ }
+}
+
+static bool gl_tex_export(pl_gpu gpu, enum pl_handle_type handle_type,
+ bool preserved, struct pl_tex_t *tex)
+{
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ struct pl_gl *p = PL_PRIV(gpu);
+
+ EGLenum egltarget = egl_from_gl_target(gpu, tex_gl->target);
+ if (!egltarget)
+ goto error;
+
+ int attribs[] = {
+ EGL_IMAGE_PRESERVED, preserved,
+ EGL_NONE,
+ };
+
+ // We assume that tex_gl->texture is already bound
+ tex_gl->image = eglCreateImageKHR(p->egl_dpy,
+ p->egl_ctx,
+ egltarget,
+ (EGLClientBuffer) (uintptr_t) tex_gl->texture,
+ attribs);
+ if (!egl_check_err(gpu, "eglCreateImageKHR") || !tex_gl->image)
+ goto error;
+
+ switch (handle_type) {
+
+#ifdef PL_HAVE_UNIX
+ case PL_HANDLE_DMA_BUF: {
+ int fourcc = 0;
+ int num_planes = 0;
+ EGLuint64KHR modifier = 0;
+ bool ok;
+ ok = eglExportDMABUFImageQueryMESA(p->egl_dpy,
+ tex_gl->image,
+ &fourcc,
+ &num_planes,
+ &modifier);
+ if (!egl_check_err(gpu, "eglExportDMABUFImageQueryMESA") || !ok)
+ goto error;
+
+ if (fourcc != tex->params.format->fourcc) {
+ PL_ERR(gpu, "Exported DRM format %s does not match fourcc of "
+ "specified pl_fmt %s? Please open a bug.",
+ PRINT_FOURCC(fourcc), PRINT_FOURCC(tex->params.format->fourcc));
+ goto error;
+ }
+
+ if (num_planes != 1) {
+ PL_ERR(gpu, "Unsupported number of planes: %d", num_planes);
+ goto error;
+ }
+
+ int offset = 0, stride = 0;
+ ok = eglExportDMABUFImageMESA(p->egl_dpy,
+ tex_gl->image,
+ &tex_gl->fd,
+ &stride,
+ &offset);
+ if (!egl_check_err(gpu, "eglExportDMABUFImageMesa") || !ok)
+ goto error;
+
+ off_t fdsize = lseek(tex_gl->fd, 0, SEEK_END);
+        off_t err = fdsize > 0 ? lseek(tex_gl->fd, 0, SEEK_SET) : -1;
+ if (fdsize <= 0 || err < 0) {
+ PL_ERR(gpu, "Failed querying FD size: %s", strerror(errno));
+ goto error;
+ }
+
+ tex->shared_mem = (struct pl_shared_mem) {
+ .handle.fd = tex_gl->fd,
+ .size = fdsize,
+ .offset = offset,
+ .drm_format_mod = modifier,
+ .stride_w = stride,
+ };
+ break;
+ }
+#else // !PL_HAVE_UNIX
+ case PL_HANDLE_DMA_BUF:
+ pl_unreachable();
+#endif
+
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_HOST_PTR:
+ case PL_HANDLE_FD:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ pl_unreachable();
+
+ }
+
+ return true;
+
+error:
+ PL_ERR(gpu, "Failed exporting GL texture!");
+ return false;
+}
+
+static const char *fb_err_str(GLenum err)
+{
+ switch (err) {
+#define CASE(name) case name: return #name
+ CASE(GL_FRAMEBUFFER_COMPLETE);
+ CASE(GL_FRAMEBUFFER_UNDEFINED);
+ CASE(GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT);
+ CASE(GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT);
+ CASE(GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS);
+ CASE(GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER);
+ CASE(GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER);
+ CASE(GL_FRAMEBUFFER_UNSUPPORTED);
+ CASE(GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE);
+ CASE(GL_FRAMEBUFFER_INCOMPLETE_LAYER_TARGETS);
+#undef CASE
+
+ default: return "unknown error";
+ }
+}
+
+pl_tex gl_tex_create(pl_gpu gpu, const struct pl_tex_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return NULL;
+
+ struct pl_gl *p = PL_PRIV(gpu);
+ struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_gl);
+ tex->params = *params;
+ tex->params.initial_data = NULL;
+ tex->sampler_type = PL_SAMPLER_NORMAL;
+
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+
+ const struct gl_format **fmtp = PL_PRIV(params->format);
+ const struct gl_format *fmt = *fmtp;
+ *tex_gl = (struct pl_tex_gl) {
+ .format = fmt->fmt,
+ .iformat = fmt->ifmt,
+ .type = fmt->type,
+ .barrier = tex_barrier(tex),
+ .fd = -1,
+ };
+
+ static const GLint targets[] = {
+ [1] = GL_TEXTURE_1D,
+ [2] = GL_TEXTURE_2D,
+ [3] = GL_TEXTURE_3D,
+ };
+
+ int dims = pl_tex_params_dimension(*params);
+ pl_assert(dims >= 1 && dims <= 3);
+ tex_gl->target = targets[dims];
+
+ gl->GenTextures(1, &tex_gl->texture);
+ gl->BindTexture(tex_gl->target, tex_gl->texture);
+
+ if (params->import_handle) {
+ if (!gl_tex_import(gpu, params->import_handle, &params->shared_mem, tex))
+ goto error;
+ } else {
+ gl->PixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+ switch (dims) {
+ case 1:
+ gl->TexImage1D(tex_gl->target, 0, tex_gl->iformat, params->w, 0,
+ tex_gl->format, tex_gl->type, params->initial_data);
+ break;
+ case 2:
+ gl->TexImage2D(tex_gl->target, 0, tex_gl->iformat, params->w, params->h,
+ 0, tex_gl->format, tex_gl->type, params->initial_data);
+ break;
+ case 3:
+ gl->TexImage3D(tex_gl->target, 0, tex_gl->iformat, params->w, params->h,
+ params->d, 0, tex_gl->format, tex_gl->type,
+ params->initial_data);
+ break;
+ }
+
+ gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4);
+ }
+
+ if (params->export_handle) {
+ if (!gl_tex_export(gpu, params->export_handle, params->initial_data, tex))
+ goto error;
+ }
+
+ gl->BindTexture(tex_gl->target, 0);
+
+ if (!gl_check_err(gpu, "gl_tex_create: texture"))
+ goto error;
+
+ bool need_fbo = tex->params.renderable;
+ if (tex->params.blit_src || tex->params.blit_dst) {
+ if (dims != 2) {
+ PL_ERR(gpu, "Blittable textures may only be 2D!");
+ goto error;
+ }
+
+ need_fbo = true;
+ }
+
+ bool can_fbo = tex->params.format->caps & PL_FMT_CAP_RENDERABLE &&
+ tex->params.d == 0;
+
+ // Try creating an FBO for host-readable textures, since this allows
+ // reading back with glReadPixels instead of glGetTexImage. (Additionally,
+ // GLES does not support glGetTexImage)
+ if (tex->params.host_readable && (can_fbo || p->gles_ver))
+ need_fbo = true;
+
+ if (need_fbo) {
+ if (!can_fbo) {
+ PL_ERR(gpu, "Trying to create a renderable/blittable/readable "
+ "texture with an incompatible (non-renderable) format!");
+ goto error;
+ }
+
+ gl->GenFramebuffers(1, &tex_gl->fbo);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo);
+ switch (dims) {
+ case 1:
+ gl->FramebufferTexture1D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+ GL_TEXTURE_1D, tex_gl->texture, 0);
+ break;
+ case 2:
+ gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+ GL_TEXTURE_2D, tex_gl->texture, 0);
+ break;
+ case 3: pl_unreachable();
+ }
+
+ GLenum err = gl->CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER);
+ if (err != GL_FRAMEBUFFER_COMPLETE) {
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ PL_ERR(gpu, "Failed creating framebuffer: %s", fb_err_str(err));
+ goto error;
+ }
+
+ if (params->host_readable && p->gles_ver) {
+ GLint read_type = 0, read_fmt = 0;
+ gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type);
+ gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt);
+ if (read_type != tex_gl->type || read_fmt != tex_gl->format) {
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ PL_ERR(gpu, "Trying to create host_readable texture whose "
+ "implementation-defined pixel read format "
+ "(type=0x%X, fmt=0x%X) does not match the texture's "
+ "internal format (type=0x%X, fmt=0x%X)! This is a "
+ "GLES/driver limitation, there's little we can do "
+ "about it.",
+ read_type, read_fmt, tex_gl->type, tex_gl->format);
+ goto error;
+ }
+ }
+
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ if (!gl_check_err(gpu, "gl_tex_create: fbo"))
+ goto error;
+ }
+
+ RELEASE_CURRENT();
+ return tex;
+
+error:
+ gl_tex_destroy(gpu, tex);
+ RELEASE_CURRENT();
+ return NULL;
+}
+
+static bool gl_fb_query(pl_gpu gpu, int fbo, struct pl_fmt_t *fmt,
+ struct gl_format *glfmt)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_gl *p = PL_PRIV(gpu);
+ *fmt = (struct pl_fmt_t) {
+ .name = "fbo",
+ .type = PL_FMT_UNKNOWN,
+ .caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE | PL_FMT_CAP_BLENDABLE,
+ .num_components = 4,
+ .component_depth = {8, 8, 8, 8}, // default to rgba8
+ .sample_order = {0, 1, 2, 3},
+ };
+
+ *glfmt = (struct gl_format) {
+ .fmt = GL_RGBA,
+ };
+
+ bool can_query = gl_test_ext(gpu, "GL_ARB_framebuffer_object", 30, 20);
+ if (!fbo && p->gles_ver && p->gles_ver < 30)
+ can_query = false; // can't query default framebuffer on GLES 2.0
+
+ if (can_query) {
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo);
+
+ GLenum obj = p->gles_ver ? GL_BACK : GL_BACK_LEFT;
+ if (fbo != 0)
+ obj = GL_COLOR_ATTACHMENT0;
+
+ GLint type = 0;
+ gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj,
+ GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE, &type);
+ switch (type) {
+ case GL_FLOAT: fmt->type = PL_FMT_FLOAT; break;
+ case GL_INT: fmt->type = PL_FMT_SINT; break;
+ case GL_UNSIGNED_INT: fmt->type = PL_FMT_UINT; break;
+ case GL_SIGNED_NORMALIZED: fmt->type = PL_FMT_SNORM; break;
+ case GL_UNSIGNED_NORMALIZED: fmt->type = PL_FMT_UNORM; break;
+ default: fmt->type = PL_FMT_UNKNOWN; break;
+ }
+
+ gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj,
+ GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE, &fmt->component_depth[0]);
+ gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj,
+ GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &fmt->component_depth[1]);
+ gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj,
+ GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE, &fmt->component_depth[2]);
+ gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj,
+ GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE, &fmt->component_depth[3]);
+
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ gl_check_err(gpu, "gl_fb_query");
+
+ if (!fmt->component_depth[0]) {
+            PL_INFO(gpu, "OpenGL framebuffer did not export depth information, "
+                "assuming 8-bit framebuffer");
+ for (int i = 0; i < PL_ARRAY_SIZE(fmt->component_depth); i++)
+ fmt->component_depth[i] = 8;
+ }
+
+ // Strip missing components from component map
+ while (!fmt->component_depth[fmt->num_components - 1]) {
+ fmt->num_components--;
+ pl_assert(fmt->num_components);
+ }
+ }
+
+ int gpu_bits = 0;
+ for (int i = 0; i < 4; i++)
+ gpu_bits += fmt->component_depth[i];
+ fmt->internal_size = (gpu_bits + 7) / 8;
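+    // e.g. an RGB10_A2 framebuffer reports component_depth = {10, 10, 10, 2},
+    // so gpu_bits = 32 and internal_size = 4; being UNORM with gpu_bits <= 32,
+    // it is read back as 8-bit bytes below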
+
+ size_t host_size = 0;
+ switch (fmt->type) {
+ case PL_FMT_UNKNOWN:
+ fmt->opaque = true;
+ return true;
+ case PL_FMT_FLOAT:
+ glfmt->type = GL_FLOAT;
+ host_size = sizeof(float);
+ break;
+ case PL_FMT_UNORM:
+ case PL_FMT_UINT:
+ if (gpu_bits > 32) {
+ glfmt->type = GL_UNSIGNED_SHORT;
+ host_size = sizeof(uint16_t);
+ } else {
+ glfmt->type = GL_UNSIGNED_BYTE;
+ host_size = sizeof(uint8_t);
+ }
+ break;
+ case PL_FMT_SNORM:
+ case PL_FMT_SINT:
+ if (gpu_bits > 32) {
+ glfmt->type = GL_SHORT;
+ host_size = sizeof(int16_t);
+ } else {
+ glfmt->type = GL_BYTE;
+ host_size = sizeof(int8_t);
+ }
+ break;
+ case PL_FMT_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ fmt->texel_size = fmt->num_components * host_size;
+ for (int i = 0; i < fmt->num_components; i++)
+ fmt->host_bits[i] = 8 * host_size;
+ fmt->caps |= PL_FMT_CAP_HOST_READABLE;
+
+ return true;
+}
+
+pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return NULL;
+
+ struct pl_gl *p = PL_PRIV(gpu);
+ struct pl_tex_t *tex = pl_alloc_obj(NULL, tex, struct pl_tex_gl);
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ *tex = (struct pl_tex_t) {
+ .params = {
+ .w = params->width,
+ .h = params->height,
+ .d = params->depth,
+ },
+ };
+
+ pl_fmt fmt = NULL;
+ const struct gl_format *glfmt = NULL;
+
+ if (params->texture) {
+ // Wrapping texture: Require matching iformat
+ pl_assert(params->iformat);
+ for (int i = 0; i < gpu->num_formats; i++) {
+ const struct gl_format **glfmtp = PL_PRIV(gpu->formats[i]);
+ if ((*glfmtp)->ifmt == params->iformat) {
+ fmt = gpu->formats[i];
+ glfmt = *glfmtp;
+ break;
+ }
+ }
+
+ if (!fmt) {
+ PL_ERR(gpu, "Failed mapping iformat %d to any equivalent `pl_fmt`",
+ params->iformat);
+ goto error;
+ }
+ } else {
+ // Wrapping framebuffer: Allocate/infer generic FBO format
+ fmt = pl_alloc_obj((void *) gpu, fmt, const struct gl_format *);
+ glfmt = pl_alloc_ptr((void *) fmt, glfmt);
+ const struct gl_format **glfmtp = PL_PRIV(fmt);
+ *glfmtp = glfmt;
+ if (!gl_fb_query(gpu, params->framebuffer,
+ (struct pl_fmt_t *) fmt,
+ (struct gl_format *) glfmt))
+ {
+ PL_ERR(gpu, "Failed querying framebuffer specifics!");
+ pl_free((void *) fmt);
+ goto error;
+ }
+ }
+
+ *tex_gl = (struct pl_tex_gl) {
+ .target = params->target,
+ .texture = params->texture,
+ .fbo = params->framebuffer,
+ .wrapped_tex = !!params->texture,
+ .wrapped_fb = params->framebuffer || !params->texture,
+ .iformat = glfmt->ifmt,
+ .format = glfmt->fmt,
+ .type = glfmt->type,
+ .fd = -1,
+ };
+
+ int dims = pl_tex_params_dimension(tex->params);
+ if (!tex_gl->target) {
+ switch (dims) {
+ case 1: tex_gl->target = GL_TEXTURE_1D; break;
+ case 2: tex_gl->target = GL_TEXTURE_2D; break;
+ case 3: tex_gl->target = GL_TEXTURE_3D; break;
+ }
+ }
+
+ // Map texture-specific sampling metadata
+ if (params->texture) {
+ switch (params->target) {
+ case GL_TEXTURE_1D:
+            if (params->height || params->depth) {
+ PL_ERR(gpu, "Invalid texture dimensions for GL_TEXTURE_1D");
+ goto error;
+ }
+ // fall through
+ case GL_TEXTURE_2D:
+ if (params->depth) {
+ PL_ERR(gpu, "Invalid texture dimensions for GL_TEXTURE_2D");
+ goto error;
+ }
+ // fall through
+ case 0:
+ case GL_TEXTURE_3D:
+ tex->sampler_type = PL_SAMPLER_NORMAL;
+ break;
+
+ case GL_TEXTURE_RECTANGLE: tex->sampler_type = PL_SAMPLER_RECT; break;
+ case GL_TEXTURE_EXTERNAL_OES: tex->sampler_type = PL_SAMPLER_EXTERNAL; break;
+
+ default:
+ PL_ERR(gpu, "Failed mapping texture target %u to any equivalent "
+ "`pl_sampler_type`", params->target);
+ goto error;
+ }
+ }
+
+ // Create optional extra fbo if needed/possible
+ bool can_fbo = tex_gl->texture &&
+ (fmt->caps & PL_FMT_CAP_RENDERABLE) &&
+ tex->sampler_type != PL_SAMPLER_EXTERNAL &&
+ dims < 3;
+
+ if (can_fbo && !tex_gl->fbo) {
+ gl->GenFramebuffers(1, &tex_gl->fbo);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo);
+ switch (dims) {
+ case 1:
+ gl->FramebufferTexture1D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+ tex_gl->target, tex_gl->texture, 0);
+ break;
+ case 2:
+ gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+ tex_gl->target, tex_gl->texture, 0);
+ break;
+ }
+
+ GLenum err = gl->CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER);
+ if (err != GL_FRAMEBUFFER_COMPLETE) {
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ PL_ERR(gpu, "Failed creating framebuffer: error code %d", err);
+ goto error;
+ }
+
+ if (p->gles_ver) {
+ GLint read_type = 0, read_fmt = 0;
+ gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type);
+ gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt);
+ tex->params.host_readable = read_type == tex_gl->type &&
+ read_fmt == tex_gl->format;
+ } else {
+ tex->params.host_readable = true;
+ }
+
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ if (!gl_check_err(gpu, "pl_opengl_wrap: fbo"))
+ goto error;
+ }
+
+ // Complete the process of inferring the texture capabilities
+ tex->params.format = fmt;
+ if (tex_gl->texture) {
+ tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE;
+ tex->params.storable = fmt->caps & PL_FMT_CAP_STORABLE;
+ tex->params.host_writable = !fmt->opaque;
+ tex->params.host_readable |= fmt->caps & PL_FMT_CAP_HOST_READABLE;
+ }
+ if (tex_gl->fbo || tex_gl->wrapped_fb) {
+ tex->params.renderable = fmt->caps & PL_FMT_CAP_RENDERABLE;
+ tex->params.host_readable |= fmt->caps & PL_FMT_CAP_HOST_READABLE;
+ if (dims == 2 && (fmt->caps & PL_FMT_CAP_BLITTABLE)) {
+ tex->params.blit_src = true;
+ tex->params.blit_dst = true;
+ }
+ }
+
+ tex_gl->barrier = tex_barrier(tex);
+ RELEASE_CURRENT();
+ return tex;
+
+error:
+ gl_tex_destroy(gpu, tex);
+ RELEASE_CURRENT();
+ return NULL;
+}
+
+unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex,
+ unsigned int *out_target, int *out_iformat,
+ unsigned int *out_fbo)
+{
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ if (!tex_gl->texture) {
+ PL_ERR(gpu, "Trying to call `pl_opengl_unwrap` on a pseudo-texture "
+ "(perhaps obtained by `pl_swapchain_start_frame`?)");
+ return 0;
+ }
+
+ if (out_target)
+ *out_target = tex_gl->target;
+ if (out_iformat)
+ *out_iformat = tex_gl->iformat;
+ if (out_fbo)
+ *out_fbo = tex_gl->fbo;
+
+ return tex_gl->texture;
+}
+
+void gl_tex_invalidate(pl_gpu gpu, pl_tex tex)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_gl *p = PL_PRIV(gpu);
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ if (!MAKE_CURRENT())
+ return;
+
+ if (tex_gl->texture && p->has_invalidate_tex)
+ gl->InvalidateTexImage(tex_gl->texture, 0);
+
+ if ((tex_gl->wrapped_fb || tex_gl->fbo) && p->has_invalidate_fb) {
+ GLenum attachment = tex_gl->fbo ? GL_COLOR_ATTACHMENT0 : GL_COLOR;
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo);
+ gl->InvalidateFramebuffer(GL_DRAW_FRAMEBUFFER, 1, &attachment);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ }
+
+ gl_check_err(gpu, "gl_tex_invalidate");
+ RELEASE_CURRENT();
+}
+
+void gl_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return;
+
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ pl_assert(tex_gl->fbo || tex_gl->wrapped_fb);
+
+ switch (tex->params.format->type) {
+ case PL_FMT_UNKNOWN:
+ case PL_FMT_FLOAT:
+ case PL_FMT_UNORM:
+ case PL_FMT_SNORM:
+ gl->ClearColor(color.f[0], color.f[1], color.f[2], color.f[3]);
+ break;
+
+ case PL_FMT_UINT:
+ gl->ClearColorIuiEXT(color.u[0], color.u[1], color.u[2], color.u[3]);
+ break;
+
+ case PL_FMT_SINT:
+ gl->ClearColorIiEXT(color.i[0], color.i[1], color.i[2], color.i[3]);
+ break;
+
+ case PL_FMT_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo);
+ gl->Clear(GL_COLOR_BUFFER_BIT);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ gl_check_err(gpu, "gl_tex_clear");
+ RELEASE_CURRENT();
+}
+
+void gl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ if (!MAKE_CURRENT())
+ return;
+
+ struct pl_tex_gl *src_gl = PL_PRIV(params->src);
+ struct pl_tex_gl *dst_gl = PL_PRIV(params->dst);
+
+ pl_assert(src_gl->fbo || src_gl->wrapped_fb);
+ pl_assert(dst_gl->fbo || dst_gl->wrapped_fb);
+ gl->BindFramebuffer(GL_READ_FRAMEBUFFER, src_gl->fbo);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_gl->fbo);
+
+ static const GLint filters[PL_TEX_SAMPLE_MODE_COUNT] = {
+ [PL_TEX_SAMPLE_NEAREST] = GL_NEAREST,
+ [PL_TEX_SAMPLE_LINEAR] = GL_LINEAR,
+ };
+
+ pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc;
+ gl->BlitFramebuffer(src_rc.x0, src_rc.y0, src_rc.x1, src_rc.y1,
+ dst_rc.x0, dst_rc.y0, dst_rc.x1, dst_rc.y1,
+ GL_COLOR_BUFFER_BIT, filters[params->sample_mode]);
+
+ gl->BindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+ gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ gl_check_err(gpu, "gl_tex_blit");
+ RELEASE_CURRENT();
+}
+
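+// Returns the largest pixel-store alignment (8, 4, 2 or 1) that divides the
+// given pitch, e.g. get_alignment(1280) == 8, get_alignment(1918) == 2 and
+// get_alignment(27) == 1.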
+static int get_alignment(size_t pitch)
+{
+ if (pitch % 8 == 0)
+ return 8;
+ if (pitch % 4 == 0)
+ return 4;
+ if (pitch % 2 == 0)
+ return 2;
+ return 1;
+}
+
+bool gl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_gl *p = PL_PRIV(gpu);
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ pl_buf buf = params->buf;
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ struct pl_buf_gl *buf_gl = buf ? PL_PRIV(buf) : NULL;
+
+ // If the user requests asynchronous uploads, it's more efficient to do
+ // them via a PBO - this allows us to skip blocking the caller, especially
+ // when the host pointer can be imported directly.
+ if (params->callback && !buf) {
+ size_t buf_size = pl_tex_transfer_size(params);
+ const size_t min_size = 32*1024; // 32 KiB
+ if (buf_size >= min_size && buf_size <= gpu->limits.max_buf_size)
+ return pl_tex_upload_pbo(gpu, params);
+ }
+
+ if (!MAKE_CURRENT())
+ return false;
+
+ uintptr_t src = (uintptr_t) params->ptr;
+ if (buf) {
+ gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, buf_gl->buffer);
+ src = buf_gl->offset + params->buf_offset;
+ }
+
+ bool misaligned = params->row_pitch % fmt->texel_size;
+ int stride_w = params->row_pitch / fmt->texel_size;
+ int stride_h = params->depth_pitch / params->row_pitch;
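+    // Worked example: a 100x50 RGBA8 upload with row_pitch = 416 bytes has
+    // texel_size = 4, so stride_w = 104 texels, unpack alignment 8, and
+    // GL_UNPACK_ROW_LENGTH is set to 104 below; a row_pitch that is not a
+    // multiple of texel_size forces the one-row-at-a-time fallback instead.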
+
+ int dims = pl_tex_params_dimension(tex->params);
+ if (dims > 1)
+ gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(params->row_pitch));
+
+ int rows = pl_rect_h(params->rc);
+ if (misaligned) {
+ rows = 1;
+ } else if (stride_w != pl_rect_w(params->rc)) {
+ gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride_w);
+ }
+
+ int imgs = pl_rect_d(params->rc);
+ if (stride_h != pl_rect_h(params->rc) || rows < stride_h)
+ gl->PixelStorei(GL_UNPACK_IMAGE_HEIGHT, stride_h);
+
+ gl->BindTexture(tex_gl->target, tex_gl->texture);
+ gl_timer_begin(gpu, params->timer);
+
+ switch (dims) {
+ case 1:
+ gl->TexSubImage1D(tex_gl->target, 0, params->rc.x0, pl_rect_w(params->rc),
+ tex_gl->format, tex_gl->type, (void *) src);
+ break;
+ case 2:
+ for (int y = params->rc.y0; y < params->rc.y1; y += rows) {
+ gl->TexSubImage2D(tex_gl->target, 0, params->rc.x0, y,
+ pl_rect_w(params->rc), rows, tex_gl->format,
+ tex_gl->type, (void *) src);
+ src += params->row_pitch * rows;
+ }
+ break;
+ case 3:
+ for (int z = params->rc.z0; z < params->rc.z1; z += imgs) {
+ uintptr_t row_src = src;
+ for (int y = params->rc.y0; y < params->rc.y1; y += rows) {
+ gl->TexSubImage3D(tex_gl->target, 0, params->rc.x0, y, z,
+ pl_rect_w(params->rc), rows, imgs,
+ tex_gl->format, tex_gl->type, (void *) row_src);
+ row_src = (uintptr_t) row_src + params->row_pitch * rows;
+ }
+ src += params->depth_pitch * imgs;
+ }
+ break;
+ }
+
+ gl_timer_end(gpu, params->timer);
+ gl->BindTexture(tex_gl->target, 0);
+ gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4);
+ gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0);
+ gl->PixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0);
+
+ if (buf) {
+ gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+ if (buf->params.host_mapped) {
+ // Make sure the PBO is not reused until GL is done with it. If a
+ // previous operation is pending, "update" it by creating a new
+ // fence that will cover the previous operation as well.
+ gl->DeleteSync(buf_gl->fence);
+ buf_gl->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ }
+ }
+
+ if (params->callback) {
+ PL_ARRAY_APPEND(gpu, p->callbacks, (struct gl_cb) {
+ .sync = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0),
+ .callback = params->callback,
+ .priv = params->priv,
+ });
+ }
+
+ bool ok = gl_check_err(gpu, "gl_tex_upload");
+ RELEASE_CURRENT();
+ return ok;
+}
+
+bool gl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_gl *p = PL_PRIV(gpu);
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ pl_buf buf = params->buf;
+ struct pl_tex_gl *tex_gl = PL_PRIV(tex);
+ struct pl_buf_gl *buf_gl = buf ? PL_PRIV(buf) : NULL;
+ bool ok = true;
+
+ if (params->callback && !buf) {
+ size_t buf_size = pl_tex_transfer_size(params);
+ const size_t min_size = 32*1024; // 32 KiB
+ if (buf_size >= min_size && buf_size <= gpu->limits.max_buf_size)
+ return pl_tex_download_pbo(gpu, params);
+ }
+
+ if (!MAKE_CURRENT())
+ return false;
+
+ uintptr_t dst = (uintptr_t) params->ptr;
+ if (buf) {
+ gl->BindBuffer(GL_PIXEL_PACK_BUFFER, buf_gl->buffer);
+ dst = buf_gl->offset + params->buf_offset;
+ }
+
+ pl_rect3d full = {
+ 0, 0, 0,
+ tex->params.w,
+ PL_DEF(tex->params.h, 1),
+ PL_DEF(tex->params.d, 1),
+ };
+
+ bool misaligned = params->row_pitch % fmt->texel_size;
+ int stride_w = params->row_pitch / fmt->texel_size;
+ int stride_h = params->depth_pitch / params->row_pitch;
+
+ int dims = pl_tex_params_dimension(tex->params);
+ bool is_copy = pl_rect3d_eq(params->rc, full) &&
+ stride_w == tex->params.w &&
+ stride_h == PL_DEF(tex->params.h, 1) &&
+ !misaligned;
+
+ gl_timer_begin(gpu, params->timer);
+
+ if (tex_gl->fbo || tex_gl->wrapped_fb) {
+ // We can use a more efficient path when we have an FBO available
+ if (dims > 1)
+ gl->PixelStorei(GL_PACK_ALIGNMENT, get_alignment(params->row_pitch));
+
+ int rows = pl_rect_h(params->rc);
+ if (misaligned) {
+ rows = 1;
+ } else if (stride_w != tex->params.w) {
+ gl->PixelStorei(GL_PACK_ROW_LENGTH, stride_w);
+ }
+
+ // No 3D framebuffers
+ pl_assert(pl_rect_d(params->rc) == 1);
+
+ gl->BindFramebuffer(GL_READ_FRAMEBUFFER, tex_gl->fbo);
+ for (int y = params->rc.y0; y < params->rc.y1; y += rows) {
+ gl->ReadPixels(params->rc.x0, y, pl_rect_w(params->rc), rows,
+ tex_gl->format, tex_gl->type, (void *) dst);
+ dst += params->row_pitch * rows;
+ }
+ gl->BindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+ gl->PixelStorei(GL_PACK_ALIGNMENT, 4);
+ gl->PixelStorei(GL_PACK_ROW_LENGTH, 0);
+ } else if (is_copy) {
+ // We're downloading the entire texture
+ gl->BindTexture(tex_gl->target, tex_gl->texture);
+ gl->GetTexImage(tex_gl->target, 0, tex_gl->format, tex_gl->type, (void *) dst);
+ gl->BindTexture(tex_gl->target, 0);
+ } else {
+ PL_ERR(gpu, "Partial downloads of 3D textures not implemented!");
+ ok = false;
+ }
+
+ gl_timer_end(gpu, params->timer);
+
+ if (buf) {
+ gl->BindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+ if (ok && buf->params.host_mapped) {
+ gl->DeleteSync(buf_gl->fence);
+ buf_gl->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ }
+ }
+
+ if (params->callback) {
+ PL_ARRAY_APPEND(gpu, p->callbacks, (struct gl_cb) {
+ .sync = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0),
+ .callback = params->callback,
+ .priv = params->priv,
+ });
+ }
+
+ ok &= gl_check_err(gpu, "gl_tex_download");
+ RELEASE_CURRENT();
+ return ok;
+}
diff --git a/src/opengl/include/glad/meson.build b/src/opengl/include/glad/meson.build
new file mode 100644
index 0000000..05b3f02
--- /dev/null
+++ b/src/opengl/include/glad/meson.build
@@ -0,0 +1,29 @@
+glad_check = run_command([ python, '-c', 'import glad; print(glad.__version__)' ],
+ env: python_env,
+ capture: true,
+ check: false,
+)
+
+glad_ver = glad_check.returncode() == 0 ? glad_check.stdout().strip() : 'none'
+glad_req = '>= 2.0'
+
+if not glad_ver.version_compare(glad_req)
+ error(f'glad (required: @glad_req@, found: @glad_ver@) was not found in ' +
+ 'PYTHONPATH or `3rdparty`. Please run `git submodule update --init` ' +
+ 'followed by `meson --wipe`.')
+endif
+
+glad = custom_target('gl.h',
+ output: 'gl.h',
+ env: python_env,
+ command: [
+ python, '-m', 'glad', '--out-path=@OUTDIR@/../../',
+ '--reproducible', '--merge', '--api=gl:core,gles2,egl',
+ '--extensions=' + ','.join(gl_extensions), 'c', '--header-only', '--mx'
+ ] + (opengl_link.allowed() ? ['--loader'] : [])
+)
+
+glad_dep = declare_dependency(
+ include_directories: include_directories('..'),
+ sources: glad,
+)
diff --git a/src/opengl/loader_egl.c b/src/opengl/loader_egl.c
new file mode 100644
index 0000000..0e04c71
--- /dev/null
+++ b/src/opengl/loader_egl.c
@@ -0,0 +1,2 @@
+#define GLAD_EGL_IMPLEMENTATION
+#include "common.h"
diff --git a/src/opengl/loader_gl.c b/src/opengl/loader_gl.c
new file mode 100644
index 0000000..26b8bef
--- /dev/null
+++ b/src/opengl/loader_gl.c
@@ -0,0 +1,2 @@
+#define GLAD_GL_IMPLEMENTATION
+#include "common.h"
diff --git a/src/opengl/meson.build b/src/opengl/meson.build
new file mode 100644
index 0000000..59ba921
--- /dev/null
+++ b/src/opengl/meson.build
@@ -0,0 +1,76 @@
+opengl_build = get_option('opengl')
+opengl_link = get_option('gl-proc-addr')
+
+if host_machine.system() == 'windows' or host_machine.system().endswith('bsd') or \
+ host_machine.system() == 'dragonfly'
+ libdl = declare_dependency()
+else
+ libdl = cc.find_library('dl', required : opengl_link)
+endif
+opengl_link = opengl_link.require(libdl.found())
+components.set('opengl', opengl_build.allowed())
+components.set('gl-proc-addr', opengl_link.allowed())
+
+if opengl_build.allowed()
+ sources += [
+ 'opengl/context.c',
+ 'opengl/formats.c',
+ 'opengl/loader_gl.c',
+ 'opengl/loader_egl.c',
+ 'opengl/gpu.c',
+ 'opengl/gpu_tex.c',
+ 'opengl/gpu_pass.c',
+ 'opengl/swapchain.c',
+ 'opengl/utils.c',
+ ]
+
+ if opengl_link.allowed()
+ build_deps += libdl
+ tests += 'opengl_surfaceless.c'
+ endif
+
+ gl_extensions = [
+ 'GL_AMD_pinned_memory',
+ 'GL_ARB_buffer_storage',
+ 'GL_ARB_compute_shader',
+ 'GL_ARB_framebuffer_object',
+ 'GL_ARB_get_program_binary',
+ 'GL_ARB_invalidate_subdata',
+ 'GL_ARB_pixel_buffer_object',
+ 'GL_ARB_program_interface_query',
+ 'GL_ARB_shader_image_load_store',
+ 'GL_ARB_shader_storage_buffer_object',
+ 'GL_ARB_sync',
+ 'GL_ARB_texture_float',
+ 'GL_ARB_texture_gather',
+ 'GL_ARB_texture_rg',
+ 'GL_ARB_timer_query',
+ 'GL_ARB_uniform_buffer_object',
+ 'GL_ARB_vertex_array_object',
+ 'GL_EXT_EGL_image_storage',
+ 'GL_EXT_color_buffer_float',
+ 'GL_EXT_color_buffer_half_float',
+ 'GL_EXT_texture3D',
+ 'GL_EXT_texture_format_BGRA8888',
+ 'GL_EXT_texture_integer',
+ 'GL_EXT_texture_norm16',
+ 'GL_EXT_texture_rg',
+ 'GL_EXT_unpack_subimage',
+ 'GL_KHR_debug',
+ 'GL_OES_EGL_image',
+ 'GL_OES_EGL_image_external',
+ 'EGL_EXT_image_dma_buf_import',
+ 'EGL_EXT_image_dma_buf_import_modifiers',
+ 'EGL_EXT_platform_base',
+ 'EGL_KHR_debug',
+ 'EGL_KHR_image_base',
+ 'EGL_MESA_image_dma_buf_export',
+ 'EGL_MESA_platform_surfaceless',
+ ]
+
+ # Generate GL loader
+ subdir('include/glad')
+else
+ glad_dep = []
+ sources += 'opengl/stubs.c'
+endif
diff --git a/src/opengl/stubs.c b/src/opengl/stubs.c
new file mode 100644
index 0000000..20395f9
--- /dev/null
+++ b/src/opengl/stubs.c
@@ -0,0 +1,63 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../common.h"
+#include "log.h"
+
+#include <libplacebo/opengl.h>
+
+const struct pl_opengl_params pl_opengl_default_params = {0};
+
+pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params)
+{
+ pl_fatal(log, "libplacebo compiled without OpenGL support!");
+ return NULL;
+}
+
+void pl_opengl_destroy(pl_opengl *pgl)
+{
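+ // Stub build: pl_opengl_create() can never return a non-NULL handle here,
+ // so destroying anything other than NULL indicates caller error.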
+ pl_opengl gl = *pgl;
+ pl_assert(!gl);
+}
+
+pl_opengl pl_opengl_get(pl_gpu gpu)
+{
+ return NULL;
+}
+
+pl_swapchain pl_opengl_create_swapchain(pl_opengl gl,
+ const struct pl_opengl_swapchain_params *params)
+{
+ pl_unreachable();
+}
+
+void pl_opengl_swapchain_update_fb(pl_swapchain sw,
+ const struct pl_opengl_framebuffer *fb)
+{
+ pl_unreachable();
+}
+
+pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params)
+{
+ pl_unreachable();
+}
+
+unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex, unsigned int *out_target,
+ int *out_iformat, unsigned int *out_fbo)
+{
+ pl_unreachable();
+}
diff --git a/src/opengl/swapchain.c b/src/opengl/swapchain.c
new file mode 100644
index 0000000..46d5f9e
--- /dev/null
+++ b/src/opengl/swapchain.c
@@ -0,0 +1,278 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "formats.h"
+#include "gpu.h"
+#include "swapchain.h"
+#include "utils.h"
+#include "pl_thread.h"
+
+struct priv {
+ struct pl_sw_fns impl;
+
+ struct pl_opengl_swapchain_params params;
+ pl_opengl gl;
+ pl_mutex lock;
+ bool has_sync;
+
+ // current parameters
+ pl_tex fb;
+ bool frame_started;
+
+ // vsync fences
+ int swapchain_depth;
+ PL_ARRAY(GLsync) vsync_fences;
+};
+
+static const struct pl_sw_fns opengl_swapchain;
+
+pl_swapchain pl_opengl_create_swapchain(pl_opengl pl_gl,
+ const struct pl_opengl_swapchain_params *params)
+{
+ pl_gpu gpu = pl_gl->gpu;
+
+ if (params->max_swapchain_depth < 0) {
+ PL_ERR(gpu, "Tried specifying negative swapchain depth?");
+ return NULL;
+ }
+
+ if (!gl_make_current(pl_gl))
+ return NULL;
+
+ struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv);
+ sw->log = gpu->log;
+ sw->gpu = gpu;
+
+ struct priv *p = PL_PRIV(sw);
+ pl_mutex_init(&p->lock);
+ p->impl = opengl_swapchain;
+ p->params = *params;
+ p->has_sync = pl_opengl_has_ext(pl_gl, "GL_ARB_sync");
+ p->gl = pl_gl;
+
+ gl_release_current(pl_gl);
+ return sw;
+}
+
+static void gl_sw_destroy(pl_swapchain sw)
+{
+ pl_gpu gpu = sw->gpu;
+ struct priv *p = PL_PRIV(sw);
+
+ pl_gpu_flush(gpu);
+ pl_tex_destroy(gpu, &p->fb);
+ pl_mutex_destroy(&p->lock);
+ pl_free((void *) sw);
+}
+
+static int gl_sw_latency(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ return p->params.max_swapchain_depth;
+}
+
+static bool gl_sw_resize(pl_swapchain sw, int *width, int *height)
+{
+ struct priv *p = PL_PRIV(sw);
+ const int w = *width, h = *height;
+
+ pl_mutex_lock(&p->lock);
+ if (p->fb && w == p->fb->params.w && h == p->fb->params.h) {
+ pl_mutex_unlock(&p->lock);
+ return true;
+ }
+
+ if (p->frame_started && (w || h)) {
+ PL_ERR(sw, "Tried resizing the swapchain while a frame was in progress! "
+ "Please submit the current frame first.");
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ if (w && h) {
+ pl_tex_destroy(sw->gpu, &p->fb);
+ p->fb = pl_opengl_wrap(sw->gpu, pl_opengl_wrap_params(
+ .framebuffer = p->params.framebuffer.id,
+ .width = w,
+ .height = h,
+ ));
+ if (!p->fb) {
+ PL_ERR(sw, "Failed wrapping OpenGL framebuffer!");
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+ }
+
+ if (!p->fb) {
+ PL_ERR(sw, "Tried calling `pl_swapchain_resize` with unknown size! "
+ "This is forbidden for OpenGL. The first call to "
+ "`pl_swapchain_resize` must include the width and height of the "
+ "swapchain, because there's no way to figure this out from "
+ "within the API.");
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ *width = p->fb->params.w;
+ *height = p->fb->params.h;
+ pl_mutex_unlock(&p->lock);
+ return true;
+}
+
+void pl_opengl_swapchain_update_fb(pl_swapchain sw,
+ const struct pl_opengl_framebuffer *fb)
+{
+ struct priv *p = PL_PRIV(sw);
+ pl_mutex_lock(&p->lock);
+ if (p->frame_started) {
+ PL_ERR(sw,"Tried calling `pl_opengl_swapchain_update_fb` while a frame "
+ "was in progress! Please submit the current frame first.");
+ pl_mutex_unlock(&p->lock);
+ return;
+ }
+
+ if (p->params.framebuffer.id != fb->id)
+ pl_tex_destroy(sw->gpu, &p->fb);
+
+ p->params.framebuffer = *fb;
+ pl_mutex_unlock(&p->lock);
+}
+
+static bool gl_sw_start_frame(pl_swapchain sw,
+ struct pl_swapchain_frame *out_frame)
+{
+ struct priv *p = PL_PRIV(sw);
+ pl_mutex_lock(&p->lock);
+ bool ok = false;
+
+ if (!p->fb) {
+ PL_ERR(sw, "Unknown framebuffer size. Please call `pl_swapchain_resize` "
+ "before `pl_swapchain_start_frame` for OpenGL swapchains!");
+ goto error;
+ }
+
+ if (p->frame_started) {
+ PL_ERR(sw, "Attempted calling `pl_swapchain_start` while a frame was "
+ "already in progress! Call `pl_swapchain_submit_frame` first.");
+ goto error;
+ }
+
+ if (!gl_make_current(p->gl))
+ goto error;
+
+ *out_frame = (struct pl_swapchain_frame) {
+ .fbo = p->fb,
+ .flipped = !p->params.framebuffer.flipped,
+ .color_repr = {
+ .sys = PL_COLOR_SYSTEM_RGB,
+ .levels = PL_COLOR_LEVELS_FULL,
+ .alpha = p->fb->params.format->num_components == 4
+ ? PL_ALPHA_PREMULTIPLIED
+ : PL_ALPHA_UNKNOWN,
+ .bits = {
+ // Just use the red channel in the absence of anything more
+ // sane to do, because the red channel is both guaranteed to
+ // exist and also typically has the minimum number of bits
+ // (which is arguably what matters for dithering)
+ .sample_depth = p->fb->params.format->component_depth[0],
+ .color_depth = p->fb->params.format->component_depth[0],
+ },
+ },
+ .color_space = pl_color_space_monitor,
+ };
+
+ p->frame_started = gl_check_err(sw->gpu, "gl_sw_start_frame");
+ if (!p->frame_started)
+ goto error;
+
+ // keep p->lock held
+ gl_release_current(p->gl);
+ return true;
+
+error:
+ gl_release_current(p->gl);
+ pl_mutex_unlock(&p->lock);
+ return ok;
+}
+
+static bool gl_sw_submit_frame(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct gl_ctx *glctx = PL_PRIV(p->gl);
+ const gl_funcs *gl = &glctx->func;
+ if (!gl_make_current(p->gl)) {
+ p->frame_started = false;
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ pl_assert(p->frame_started);
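+ // Queue a fence for this frame; gl_sw_swap_buffers() waits on the oldest
+ // queued fence so that at most max_swapchain_depth frames remain in flight.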
+ if (p->has_sync && p->params.max_swapchain_depth) {
+ GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ if (fence)
+ PL_ARRAY_APPEND(sw, p->vsync_fences, fence);
+ }
+
+ gl->Flush();
+ p->frame_started = false;
+ bool ok = gl_check_err(sw->gpu, "gl_sw_submit_frame");
+ gl_release_current(p->gl);
+ pl_mutex_unlock(&p->lock);
+
+ return ok;
+}
+
+static void gl_sw_swap_buffers(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct gl_ctx *glctx = PL_PRIV(p->gl);
+ const gl_funcs *gl = &glctx->func;
+ if (!p->params.swap_buffers) {
+ PL_ERR(sw, "`pl_swapchain_swap_buffers` called but no "
+ "`params.swap_buffers` callback set!");
+ return;
+ }
+
+ pl_mutex_lock(&p->lock);
+ if (!gl_make_current(p->gl)) {
+ pl_mutex_unlock(&p->lock);
+ return;
+ }
+
+ p->params.swap_buffers(p->params.priv);
+
+ const int max_depth = p->params.max_swapchain_depth;
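+ // Throttle: wait for the oldest in-flight frame(s) to finish before
+ // returning, enforcing the configured swapchain depth.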
+ while (max_depth && p->vsync_fences.num >= max_depth) {
+ gl->ClientWaitSync(p->vsync_fences.elem[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9);
+ gl->DeleteSync(p->vsync_fences.elem[0]);
+ PL_ARRAY_REMOVE_AT(p->vsync_fences, 0);
+ }
+
+ gl_check_err(sw->gpu, "gl_sw_swap_buffers");
+ gl_release_current(p->gl);
+ pl_mutex_unlock(&p->lock);
+}
+
+static const struct pl_sw_fns opengl_swapchain = {
+ .destroy = gl_sw_destroy,
+ .latency = gl_sw_latency,
+ .resize = gl_sw_resize,
+ .start_frame = gl_sw_start_frame,
+ .submit_frame = gl_sw_submit_frame,
+ .swap_buffers = gl_sw_swap_buffers,
+};
diff --git a/src/opengl/utils.c b/src/opengl/utils.c
new file mode 100644
index 0000000..d96a3e7
--- /dev/null
+++ b/src/opengl/utils.c
@@ -0,0 +1,158 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "gpu.h"
+#include "utils.h"
+
+const char *gl_err_str(GLenum err)
+{
+ switch (err) {
+#define CASE(name) case name: return #name
+ CASE(GL_NO_ERROR);
+ CASE(GL_INVALID_ENUM);
+ CASE(GL_INVALID_VALUE);
+ CASE(GL_INVALID_OPERATION);
+ CASE(GL_INVALID_FRAMEBUFFER_OPERATION);
+ CASE(GL_OUT_OF_MEMORY);
+ CASE(GL_STACK_UNDERFLOW);
+ CASE(GL_STACK_OVERFLOW);
+#undef CASE
+
+ default: return "unknown error";
+ }
+}
+
+void gl_poll_callbacks(pl_gpu gpu)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_gl *p = PL_PRIV(gpu);
+ while (p->callbacks.num) {
+ struct gl_cb cb = p->callbacks.elem[0];
+ GLenum res = gl->ClientWaitSync(cb.sync, 0, 0);
+ switch (res) {
+ case GL_ALREADY_SIGNALED:
+ case GL_CONDITION_SATISFIED:
+ PL_ARRAY_REMOVE_AT(p->callbacks, 0);
+ cb.callback(cb.priv);
+ continue;
+
+ case GL_WAIT_FAILED:
+ PL_ARRAY_REMOVE_AT(p->callbacks, 0);
+ gl->DeleteSync(cb.sync);
+ p->failed = true;
+ gl_check_err(gpu, "gl_poll_callbacks"); // NOTE: will recurse!
+ return;
+
+ case GL_TIMEOUT_EXPIRED:
+ return;
+
+ default:
+ pl_unreachable();
+ }
+ }
+}
+
+bool gl_check_err(pl_gpu gpu, const char *fun)
+{
+ const gl_funcs *gl = gl_funcs_get(gpu);
+ struct pl_gl *p = PL_PRIV(gpu);
+ bool ret = true;
+
+ while (true) {
+ GLenum error = gl->GetError();
+ if (error == GL_NO_ERROR)
+ break;
+ PL_ERR(gpu, "%s: OpenGL error: %s", fun, gl_err_str(error));
+ ret = false;
+ p->failed = true;
+ }
+
+ gl_poll_callbacks(gpu);
+ return ret;
+}
+
+bool gl_is_software(pl_opengl pl_gl)
+{
+ struct gl_ctx *glctx = PL_PRIV(pl_gl);
+ const gl_funcs *gl = &glctx->func;
+ const char *renderer = (char *) gl->GetString(GL_RENDERER);
+ return !renderer ||
+ strcmp(renderer, "Software Rasterizer") == 0 ||
+ strstr(renderer, "llvmpipe") ||
+ strstr(renderer, "softpipe") ||
+ strcmp(renderer, "Mesa X11") == 0 ||
+ strcmp(renderer, "Apple Software Renderer") == 0;
+}
+
+bool gl_is_gles(pl_opengl pl_gl)
+{
+ struct gl_ctx *glctx = PL_PRIV(pl_gl);
+ const gl_funcs *gl = &glctx->func;
+ const char *version = (char *) gl->GetString(GL_VERSION);
+ return pl_str_startswith0(pl_str0(version), "OpenGL ES");
+}
+
+bool gl_test_ext(pl_gpu gpu, const char *ext, int gl_ver, int gles_ver)
+{
+ struct pl_gl *p = PL_PRIV(gpu);
+ if (gl_ver && p->gl_ver >= gl_ver)
+ return true;
+ if (gles_ver && p->gles_ver >= gles_ver)
+ return true;
+
+ return ext ? pl_opengl_has_ext(p->gl, ext) : false;
+}
+
+const char *egl_err_str(EGLenum err)
+{
+ switch (err) {
+#define CASE(name) case name: return #name
+ CASE(EGL_SUCCESS);
+ CASE(EGL_NOT_INITIALIZED);
+ CASE(EGL_BAD_ACCESS);
+ CASE(EGL_BAD_ALLOC);
+ CASE(EGL_BAD_ATTRIBUTE);
+ CASE(EGL_BAD_CONFIG);
+ CASE(EGL_BAD_CONTEXT);
+ CASE(EGL_BAD_CURRENT_SURFACE);
+ CASE(EGL_BAD_DISPLAY);
+ CASE(EGL_BAD_MATCH);
+ CASE(EGL_BAD_NATIVE_PIXMAP);
+ CASE(EGL_BAD_NATIVE_WINDOW);
+ CASE(EGL_BAD_PARAMETER);
+ CASE(EGL_BAD_SURFACE);
+#undef CASE
+
+ default: return "unknown error";
+ }
+}
+
+bool egl_check_err(pl_gpu gpu, const char *fun)
+{
+ struct pl_gl *p = PL_PRIV(gpu);
+ bool ret = true;
+
+ while (true) {
+ GLenum error = eglGetError();
+ if (error == EGL_SUCCESS)
+ return ret;
+ PL_ERR(gpu, "%s: EGL error: %s", fun, egl_err_str(error));
+ ret = false;
+ p->failed = true;
+ }
+}
diff --git a/src/opengl/utils.h b/src/opengl/utils.h
new file mode 100644
index 0000000..0be229d
--- /dev/null
+++ b/src/opengl/utils.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Iterate through callbacks attached to the `pl_gl` and execute all of the
+// ones that have completed.
+//
+// Thread-safety: Unsafe
+void gl_poll_callbacks(pl_gpu gpu);
+
+// Return a human-readable name for various OpenGL errors
+//
+// Thread-safety: Safe
+const char *gl_err_str(GLenum err);
+
+// Check for errors and log them + return false if detected
+//
+// Thread-safety: Unsafe
+bool gl_check_err(pl_gpu gpu, const char *fun);
+
+// Returns true if the context is a suspected software rasterizer
+//
+// Thread-safety: Unsafe
+bool gl_is_software(pl_opengl gl);
+
+// Returns true if the context is detected as OpenGL ES
+//
+// Thread-safety: Unsafe
+bool gl_is_gles(pl_opengl gl);
+
+// Check for presence of an extension, alternatively a minimum GL version
+//
+// Thread-safety: Unsafe
+bool gl_test_ext(pl_gpu gpu, const char *ext, int gl_ver, int gles_ver);
+
+// Thread-safety: Safe
+const char *egl_err_str(EGLenum err);
+
+// Thread-safety: Unsafe
+bool egl_check_err(pl_gpu gpu, const char *fun);
diff --git a/src/options.c b/src/options.c
new file mode 100644
index 0000000..1db53bf
--- /dev/null
+++ b/src/options.c
@@ -0,0 +1,1166 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "common.h"
+#include "log.h"
+
+#include <libplacebo/options.h>
+
+struct priv {
+ pl_log log;
+
+ // for pl_options_get
+ struct pl_opt_data_t data;
+ pl_str data_text;
+
+ // for pl_options_save
+ pl_str saved;
+
+ // internally managed hooks array
+ PL_ARRAY(const struct pl_hook *) hooks;
+};
+
+static const struct pl_options_t defaults = {
+ .params = { PL_RENDER_DEFAULTS },
+ .deband_params = { PL_DEBAND_DEFAULTS },
+ .sigmoid_params = { PL_SIGMOID_DEFAULTS },
+ .color_adjustment = { PL_COLOR_ADJUSTMENT_NEUTRAL },
+ .peak_detect_params = { PL_PEAK_DETECT_DEFAULTS },
+ .color_map_params = { PL_COLOR_MAP_DEFAULTS },
+ .dither_params = { PL_DITHER_DEFAULTS },
+ .icc_params = { PL_ICC_DEFAULTS },
+ .cone_params = { PL_CONE_NONE, 1.0 },
+ .deinterlace_params = { PL_DEINTERLACE_DEFAULTS },
+ .distort_params = { PL_DISTORT_DEFAULTS },
+ .upscaler = {
+ .name = "custom",
+ .description = "Custom upscaler",
+ .allowed = PL_FILTER_UPSCALING,
+ },
+ .downscaler = {
+ .name = "custom",
+ .description = "Custom downscaler",
+ .allowed = PL_FILTER_DOWNSCALING,
+ },
+ .plane_upscaler = {
+ .name = "custom",
+ .description = "Custom plane upscaler",
+ .allowed = PL_FILTER_UPSCALING,
+ },
+ .plane_downscaler = {
+ .name = "custom",
+ .description = "Custom plane downscaler",
+ .allowed = PL_FILTER_DOWNSCALING,
+ },
+ .frame_mixer = {
+ .name = "custom",
+ .description = "Custom frame mixer",
+ .allowed = PL_FILTER_FRAME_MIXING,
+ },
+};
+
+// Copies only whitelisted fields
+static inline void copy_filter(struct pl_filter_config *dst,
+ const struct pl_filter_config *src)
+{
+ dst->kernel = src->kernel;
+ dst->window = src->window;
+ dst->radius = src->radius;
+ dst->clamp = src->clamp;
+ dst->blur = src->blur;
+ dst->taper = src->taper;
+ dst->polar = src->polar;
+ for (int i = 0; i < PL_FILTER_MAX_PARAMS; i++) {
+ dst->params[i] = src->params[i];
+ dst->wparams[i] = src->wparams[i];
+ }
+}
+
+static inline void redirect_params(pl_options opts)
+{
+ // Copy all non-NULL params structs into pl_options and redirect them
+#define REDIRECT_PARAMS(field) do \
+{ \
+ if (opts->params.field) { \
+ opts->field = *opts->params.field; \
+ opts->params.field = &opts->field; \
+ } \
+} while (0)
+
+ REDIRECT_PARAMS(deband_params);
+ REDIRECT_PARAMS(sigmoid_params);
+ REDIRECT_PARAMS(color_adjustment);
+ REDIRECT_PARAMS(peak_detect_params);
+ REDIRECT_PARAMS(color_map_params);
+ REDIRECT_PARAMS(dither_params);
+ REDIRECT_PARAMS(icc_params);
+ REDIRECT_PARAMS(cone_params);
+ REDIRECT_PARAMS(deinterlace_params);
+ REDIRECT_PARAMS(distort_params);
+}
+
+void pl_options_reset(pl_options opts, const struct pl_render_params *preset)
+{
+ *opts = defaults;
+ if (preset)
+ opts->params = *preset;
+ redirect_params(opts);
+
+ // Make a copy of all scaler configurations that aren't built-in filters
+ struct {
+ bool upscaler;
+ bool downscaler;
+ bool plane_upscaler;
+ bool plane_downscaler;
+ bool frame_mixer;
+ } fixed = {0};
+
+ for (int i = 0; i < pl_num_filter_configs; i++) {
+ const struct pl_filter_config *f = pl_filter_configs[i];
+ fixed.upscaler |= f == opts->params.upscaler;
+ fixed.downscaler |= f == opts->params.downscaler;
+ fixed.plane_upscaler |= f == opts->params.plane_upscaler;
+ fixed.plane_downscaler |= f == opts->params.plane_downscaler;
+ fixed.frame_mixer |= f == opts->params.frame_mixer;
+ }
+
+#define REDIRECT_SCALER(scaler) do \
+{ \
+ if (opts->params.scaler && !fixed.scaler) { \
+ copy_filter(&opts->scaler, opts->params.scaler); \
+ opts->params.scaler = &opts->scaler; \
+ } \
+} while (0)
+
+ REDIRECT_SCALER(upscaler);
+ REDIRECT_SCALER(downscaler);
+ REDIRECT_SCALER(plane_upscaler);
+ REDIRECT_SCALER(plane_downscaler);
+ REDIRECT_SCALER(frame_mixer);
+}
+
+pl_options pl_options_alloc(pl_log log)
+{
+ struct pl_options_t *opts = pl_zalloc_obj(NULL, opts, struct priv);
+ struct priv *p = PL_PRIV(opts);
+ pl_options_reset(opts, NULL);
+ p->log = log;
+ return opts;
+}
+
+void pl_options_free(pl_options *popts)
+{
+ pl_free_ptr((void **) popts);
+}
+
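+ // Ensure `params.hooks` points at an internally owned copy of the hooks
+ // array, so the add/insert/remove helpers below can mutate it safely even
+ // if the user originally supplied a static or stack-allocated array.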
+static void make_hooks_internal(pl_options opts)
+{
+ struct priv *p = PL_PRIV(opts);
+ struct pl_render_params *params = &opts->params;
+ if (params->num_hooks && params->hooks != p->hooks.elem) {
+ PL_ARRAY_MEMDUP(opts, p->hooks, params->hooks, params->num_hooks);
+ params->hooks = p->hooks.elem;
+ }
+}
+
+void pl_options_add_hook(pl_options opts, const struct pl_hook *hook)
+{
+ struct priv *p = PL_PRIV(opts);
+ make_hooks_internal(opts);
+ PL_ARRAY_APPEND(opts, p->hooks, hook);
+ opts->params.hooks = p->hooks.elem;
+}
+
+void pl_options_insert_hook(pl_options opts, const struct pl_hook *hook, int idx)
+{
+ struct priv *p = PL_PRIV(opts);
+ make_hooks_internal(opts);
+ PL_ARRAY_INSERT_AT(opts, p->hooks, idx, hook);
+ opts->params.hooks = p->hooks.elem;
+}
+
+void pl_options_remove_hook_at(pl_options opts, int idx)
+{
+ struct priv *p = PL_PRIV(opts);
+ make_hooks_internal(opts);
+ PL_ARRAY_REMOVE_AT(p->hooks, idx);
+ opts->params.hooks = p->hooks.elem;
+}
+
+// Options printing/parsing context
+typedef const struct opt_ctx_t {
+ pl_log log; // as a convenience, only needed when parsing
+ pl_opt opt;
+ void *alloc; // for printing only
+ pl_options opts; // current base ptr
+} *opt_ctx;
+
+struct enum_val {
+ const char *name;
+ unsigned val;
+};
+
+struct preset {
+ const char *name;
+ const void *val;
+};
+
+struct named {
+ const char *name;
+};
+
+typedef const struct opt_priv_t {
+ int (*compare)(opt_ctx p, const void *a, const void *b); // optional
+ void (*print)(opt_ctx p, pl_str *out, const void *val); // appends to `out`
+ bool (*parse)(opt_ctx p, pl_str str, void *out_val);
+ const struct enum_val *values; // for enums, terminated by {0}
+ const struct preset *presets; // for preset lists, terminated by {0}
+ const struct named * const *names; // for array-backed options, terminated by NULL
+
+ // Offset and size of option in `struct pl_options_t`
+ size_t offset;
+ size_t size;
+ size_t offset_params; // offset of actual struct (for params toggles)
+} *opt_priv;
+
+static pl_opt_data get_opt_data(opt_ctx ctx)
+{
+ pl_options opts = ctx->opts;
+ struct priv *p = PL_PRIV(opts);
+ opt_priv priv = ctx->opt->priv;
+ const void *val = (void *) ((uintptr_t) opts + priv->offset);
+
+ p->data_text.len = 0;
+ priv->print(ctx, &p->data_text, val);
+ p->data = (struct pl_opt_data_t) {
+ .opts = opts,
+ .opt = ctx->opt,
+ .value = val,
+ .text = (char *) p->data_text.buf,
+ };
+
+ return &p->data;
+}
+
+pl_opt_data pl_options_get(pl_options opts, const char *key)
+{
+ struct priv *p = PL_PRIV(opts);
+
+ pl_opt opt = pl_find_option(key);
+ if (!opt || opt->preset) {
+ PL_ERR(p, "Unrecognized or invalid option '%s'", key);
+ return NULL;
+ }
+
+ return get_opt_data(&(struct opt_ctx_t) {
+ .alloc = opts,
+ .opts = opts,
+ .opt = opt,
+ });
+}
+
+void pl_options_iterate(pl_options opts,
+ void (*cb)(void *priv, pl_opt_data data),
+ void *cb_priv)
+{
+ for (pl_opt opt = pl_option_list; opt->key; opt++) {
+ if (opt->preset)
+ continue;
+
+ struct opt_ctx_t ctx = {
+ .alloc = opts,
+ .opts = opts,
+ .opt = opt,
+ };
+
+ opt_priv priv = opt->priv;
+ const void *val = (void *) ((uintptr_t) opts + priv->offset);
+ const void *ref = (void *) ((uintptr_t) &defaults + priv->offset);
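+ // Only report options whose value differs from the built-in defaults, so
+ // that e.g. pl_options_save() produces a minimal string.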
+ int cmp = priv->compare ? priv->compare(&ctx, val, ref)
+ : memcmp(val, ref, priv->size);
+ if (cmp != 0)
+ cb(cb_priv, get_opt_data(&ctx));
+ }
+}
+
+static void save_cb(void *priv, pl_opt_data data)
+{
+ pl_opt opt = data->opt;
+ void *alloc = data->opts;
+ pl_str *out = priv;
+
+ if (out->len)
+ pl_str_append_raw(alloc, out, ",", 1);
+ pl_str_append_raw(alloc, out, opt->key, strlen(opt->key));
+ pl_str_append_raw(alloc, out, "=", 1);
+ pl_str_append(alloc, out, pl_str0(data->text));
+}
+
+const char *pl_options_save(pl_options opts)
+{
+ struct priv *p = PL_PRIV(opts);
+
+ p->saved.len = 0;
+ pl_options_iterate(opts, save_cb, &p->saved);
+ return p->saved.len ? (char *) p->saved.buf : "";
+}
+
+static bool option_set_raw(pl_options opts, pl_str k, pl_str v)
+{
+ struct priv *p = PL_PRIV(opts);
+ k = pl_str_strip(k);
+ v = pl_str_strip(v);
+
+ pl_opt opt;
+ for (opt = pl_option_list; opt->key; opt++) {
+ if (pl_str_equals0(k, opt->key))
+ goto found;
+ }
+
+ PL_ERR(p, "Unrecognized option '%.*s', in '%.*s=%.*s'",
+ PL_STR_FMT(k), PL_STR_FMT(k), PL_STR_FMT(v));
+ return false;
+
+found:
+ PL_TRACE(p, "Parsing option '%s' = '%.*s'", opt->key, PL_STR_FMT(v));
+ if (opt->deprecated)
+ PL_WARN(p, "Option '%s' is deprecated", opt->key);
+
+ struct opt_ctx_t ctx = {
+ .log = p->log,
+ .opts = opts,
+ .opt = opt,
+ };
+
+ opt_priv priv = opt->priv;
+ void *val = (void *) ((uintptr_t) opts + priv->offset);
+ return priv->parse(&ctx, v, val);
+}
+
+bool pl_options_set_str(pl_options opts, const char *key, const char *value)
+{
+ return option_set_raw(opts, pl_str0(key), pl_str0(value));
+}
+
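+ // Accepts a list of key=value pairs separated by commas, spaces, semicolons,
+ // colons or newlines, e.g. "upscaler=lanczos,deband=yes" (example string is
+ // illustrative).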
+bool pl_options_load(pl_options opts, const char *str)
+{
+ bool ret = true;
+
+ pl_str rest = pl_str0(str);
+ while (rest.len) {
+ pl_str kv = pl_str_strip(pl_str_split_chars(rest, " ,;:\n", &rest));
+ if (!kv.len)
+ continue;
+ pl_str v, k = pl_str_split_char(kv, '=', &v);
+ ret &= option_set_raw(opts, k, v);
+ }
+
+ return ret;
+}
+
+// Individual option types
+
+static void print_bool(opt_ctx p, pl_str *out, const void *ptr)
+{
+ const bool *val = ptr;
+ if (*val) {
+ pl_str_append(p->alloc, out, pl_str0("yes"));
+ } else {
+ pl_str_append(p->alloc, out, pl_str0("no"));
+ }
+}
+
+static bool parse_bool(opt_ctx p, pl_str str, void *out)
+{
+ bool *res = out;
+ if (pl_str_equals0(str, "yes") ||
+ pl_str_equals0(str, "y") ||
+ pl_str_equals0(str, "on") ||
+ pl_str_equals0(str, "true") ||
+ pl_str_equals0(str, "enabled") ||
+ !str.len) // accept naked option name as well
+ {
+ *res = true;
+ return true;
+ } else if (pl_str_equals0(str, "no") ||
+ pl_str_equals0(str, "n") ||
+ pl_str_equals0(str, "off") ||
+ pl_str_equals0(str, "false") ||
+ pl_str_equals0(str, "disabled"))
+ {
+ *res = false;
+ return true;
+ }
+
+ PL_ERR(p, "Invalid value '%.*s' for option '%s', expected boolean",
+ PL_STR_FMT(str), p->opt->key);
+ return false;
+}
+
+static void print_int(opt_ctx p, pl_str *out, const void *ptr)
+{
+ pl_opt opt = p->opt;
+ const int *val = ptr;
+ pl_assert(opt->min == opt->max || (*val >= opt->min && *val <= opt->max));
+ pl_str_append_asprintf_c(p->alloc, out, "%d", *val);
+}
+
+static bool parse_int(opt_ctx p, pl_str str, void *out)
+{
+ pl_opt opt = p->opt;
+ int val;
+ if (!pl_str_parse_int(str, &val)) {
+ PL_ERR(p, "Invalid value '%.*s' for option '%s', expected integer",
+ PL_STR_FMT(str), opt->key);
+ return false;
+ }
+
+ if (opt->min != opt->max) {
+ if (val < opt->min || val > opt->max) {
+ PL_ERR(p, "Value of %d out of range for option '%s': [%d, %d]",
+ val, opt->key, (int) opt->min, (int) opt->max);
+ return false;
+ }
+ }
+
+ *(int *) out = val;
+ return true;
+}
+
+static void print_float(opt_ctx p, pl_str *out, const void *ptr)
+{
+ pl_opt opt = p->opt;
+ const float *val = ptr;
+ pl_assert(opt->min == opt->max || (*val >= opt->min && *val <= opt->max));
+ pl_str_append_asprintf_c(p->alloc, out, "%f", *val);
+}
+
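+ // Parses a string of the form "num/denom", e.g. "24000/1001", as a float
+ // (example value is illustrative).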
+static bool parse_fraction(pl_str str, float *val)
+{
+ pl_str denom, num = pl_str_split_char(str, '/', &denom);
+ float n, d;
+ bool ok = denom.buf && denom.len && pl_str_parse_float(num, &n) &&
+ pl_str_parse_float(denom, &d);
+ if (ok)
+ *val = n / d;
+ return ok;
+}
+
+static bool parse_float(opt_ctx p, pl_str str, void *out)
+{
+ pl_opt opt = p->opt;
+ float val;
+ if (!parse_fraction(str, &val) && !pl_str_parse_float(str, &val)) {
+ PL_ERR(p, "Invalid value '%.*s' for option '%s', expected floating point "
+ "or fraction", PL_STR_FMT(str), opt->key);
+ return false;
+ }
+
+ switch (fpclassify(val)) {
+ case FP_NAN:
+ case FP_INFINITE:
+ case FP_SUBNORMAL:
+ PL_ERR(p, "Invalid value '%f' for option '%s', non-normal float",
+ val, opt->key);
+ return false;
+
+ case FP_ZERO:
+ case FP_NORMAL:
+ break;
+ }
+
+ if (opt->min != opt->max) {
+ if (val < opt->min || val > opt->max) {
+ PL_ERR(p, "Value of %.3f out of range for option '%s': [%.2f, %.2f]",
+ val, opt->key, opt->min, opt->max);
+ return false;
+ }
+ }
+
+ *(float *) out = val;
+ return true;
+}
+
+static int compare_params(opt_ctx p, const void *pa, const void *pb)
+{
+ const bool a = *(const void * const *) pa;
+ const bool b = *(const void * const *) pb;
+ return PL_CMP(a, b);
+}
+
+static void print_params(opt_ctx p, pl_str *out, const void *ptr)
+{
+ const bool value = *(const void * const *) ptr;
+ print_bool(p, out, &value);
+}
+
+static bool parse_params(opt_ctx p, pl_str str, void *out)
+{
+ pl_opt opt = p->opt;
+ opt_priv priv = opt->priv;
+ const void **res = out;
+ bool set;
+ if (!parse_bool(p, str, &set))
+ return false;
+ if (set) {
+ *res = (const void *) ((uintptr_t) p->opts + priv->offset_params);
+ } else {
+ *res = NULL;
+ }
+ return true;
+}
+
+static void print_enum(opt_ctx p, pl_str *out, const void *ptr)
+{
+ pl_opt opt = p->opt;
+ opt_priv priv = opt->priv;
+ const unsigned value = *(const unsigned *) ptr;
+ for (int i = 0; priv->values[i].name; i++) {
+ if (priv->values[i].val == value) {
+ pl_str_append(p->alloc, out, pl_str0(priv->values[i].name));
+ return;
+ }
+ }
+
+ pl_unreachable();
+}
+
+static bool parse_enum(opt_ctx p, pl_str str, void *out)
+{
+ pl_opt opt = p->opt;
+ opt_priv priv = opt->priv;
+ for (int i = 0; priv->values[i].name; i++) {
+ if (pl_str_equals0(str, priv->values[i].name)) {
+ *(unsigned *) out = priv->values[i].val;
+ return true;
+ }
+ }
+
+ PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:",
+ PL_STR_FMT(str), opt->key);
+ for (int i = 0; priv->values[i].name; i++)
+ PL_ERR(p, " %s", priv->values[i].name);
+ return false;
+}
+
+static bool parse_preset(opt_ctx p, pl_str str, void *out)
+{
+ pl_opt opt = p->opt;
+ opt_priv priv = opt->priv;
+ for (int i = 0; priv->presets[i].name; i++) {
+ if (pl_str_equals0(str, priv->presets[i].name)) {
+ if (priv->offset == offsetof(struct pl_options_t, params)) {
+ const struct pl_render_params *preset = priv->presets[i].val;
+ pl_assert(priv->size == sizeof(*preset));
+
+ // Redirect params structs into internal system after loading
+ struct pl_render_params *params = out, prev = *params;
+ *params = *preset;
+ redirect_params(p->opts);
+
+ // Re-apply excluded options
+ params->lut = prev.lut;
+ params->hooks = prev.hooks;
+ params->num_hooks = prev.num_hooks;
+ params->info_callback = prev.info_callback;
+ params->info_priv = prev.info_priv;
+ } else {
+ memcpy(out, priv->presets[i].val, priv->size);
+ }
+ return true;
+ }
+ }
+
+ PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:",
+ PL_STR_FMT(str), opt->key);
+ for (int i = 0; priv->presets[i].name; i++)
+ PL_ERR(p, " %s", priv->presets[i].name);
+ return false;
+}
+
+static void print_named(opt_ctx p, pl_str *out, const void *ptr)
+{
+ const struct named *value = *(const struct named **) ptr;
+ if (value) {
+ pl_str_append(p->alloc, out, pl_str0(value->name));
+ } else {
+ pl_str_append(p->alloc, out, pl_str0("none"));
+ }
+}
+
+static bool parse_named(opt_ctx p, pl_str str, void *out)
+{
+ pl_opt opt = p->opt;
+ opt_priv priv = opt->priv;
+ const struct named **res = out;
+ if (pl_str_equals0(str, "none")) {
+ *res = NULL;
+ return true;
+ }
+
+ for (int i = 0; priv->names[i]; i++) {
+ if (pl_str_equals0(str, priv->names[i]->name)) {
+ *res = priv->names[i];
+ return true;
+ }
+ }
+
+ PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:",
+ PL_STR_FMT(str), opt->key);
+ PL_ERR(p, " none");
+ for (int i = 0; priv->names[i]; i++)
+ PL_ERR(p, " %s", priv->names[i]->name);
+ return false;
+}
+
+static void print_scaler(opt_ctx p, pl_str *out, const void *ptr)
+{
+ const struct pl_filter_config *f = *(const struct pl_filter_config **) ptr;
+ if (f) {
+ pl_assert(f->name); // this is either a built-in scaler or ptr to custom
+ pl_str_append(p->alloc, out, pl_str0(f->name));
+ } else {
+ pl_str_append(p->alloc, out, pl_str0("none"));
+ }
+}
+
+static enum pl_filter_usage scaler_usage(pl_opt opt)
+{
+ opt_priv priv = opt->priv;
+ switch (priv->offset) {
+ case offsetof(struct pl_options_t, params.upscaler):
+ case offsetof(struct pl_options_t, params.plane_upscaler):
+ case offsetof(struct pl_options_t, upscaler):
+ case offsetof(struct pl_options_t, plane_upscaler):
+ return PL_FILTER_UPSCALING;
+
+ case offsetof(struct pl_options_t, params.downscaler):
+ case offsetof(struct pl_options_t, params.plane_downscaler):
+ case offsetof(struct pl_options_t, downscaler):
+ case offsetof(struct pl_options_t, plane_downscaler):
+ return PL_FILTER_DOWNSCALING;
+
+ case offsetof(struct pl_options_t, params.frame_mixer):
+ case offsetof(struct pl_options_t, frame_mixer):
+ return PL_FILTER_FRAME_MIXING;
+ }
+
+ pl_unreachable();
+}
+
+static bool parse_scaler(opt_ctx p, pl_str str, void *out)
+{
+ pl_opt opt = p->opt;
+ opt_priv priv = opt->priv;
+ const struct pl_filter_config **res = out;
+ if (pl_str_equals0(str, "none")) {
+ *res = NULL;
+ return true;
+ } else if (pl_str_equals0(str, "custom")) {
+ *res = (void *) ((uintptr_t) p->opts + priv->offset_params);
+ return true;
+ }
+
+ const enum pl_filter_usage usage = scaler_usage(opt);
+ for (int i = 0; i < pl_num_filter_configs; i++) {
+ if (!(pl_filter_configs[i]->allowed & usage))
+ continue;
+ if (pl_str_equals0(str, pl_filter_configs[i]->name)) {
+ *res = pl_filter_configs[i];
+ return true;
+ }
+ }
+
+ PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:",
+ PL_STR_FMT(str), opt->key);
+ PL_ERR(p, " none");
+ PL_ERR(p, " custom");
+ for (int i = 0; i < pl_num_filter_configs; i++) {
+ if (pl_filter_configs[i]->allowed & usage)
+ PL_ERR(p, " %s", pl_filter_configs[i]->name);
+ }
+ return false;
+}
+
+static bool parse_scaler_preset(opt_ctx p, pl_str str, void *out)
+{
+ pl_opt opt = p->opt;
+ struct pl_filter_config *res = out;
+ if (pl_str_equals0(str, "none")) {
+ *res = (struct pl_filter_config) { .name = "custom" };
+ return true;
+ }
+
+ const enum pl_filter_usage usage = scaler_usage(opt);
+ for (int i = 0; i < pl_num_filter_configs; i++) {
+ if (!(pl_filter_configs[i]->allowed & usage))
+ continue;
+ if (pl_str_equals0(str, pl_filter_configs[i]->name)) {
+ copy_filter(res, pl_filter_configs[i]);
+ return true;
+ }
+ }
+
+ PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:",
+ PL_STR_FMT(str), opt->key);
+ PL_ERR(p, " none");
+ for (int i = 0; i < pl_num_filter_configs; i++) {
+ if (pl_filter_configs[i]->allowed & usage)
+ PL_ERR(p, " %s", pl_filter_configs[i]->name);
+ }
+ return false;
+}
+
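+ // Note on the `.size` initializers below: wrapping pl_static_assert inside
+ // an anonymous struct lets each macro check at compile time that the target
+ // field has the expected size, while the surrounding sizeof still evaluates
+ // to that same size.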
+#define OPT_BOOL(KEY, NAME, FIELD, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_BOOL, \
+ .priv = &(struct opt_priv_t) { \
+ .print = print_bool, \
+ .parse = parse_bool, \
+ .offset = offsetof(struct pl_options_t, FIELD), \
+ .size = sizeof(struct { \
+ bool dummy; \
+ pl_static_assert(sizeof(defaults.FIELD) == sizeof(bool)); \
+ }), \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define OPT_INT(KEY, NAME, FIELD, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_INT, \
+ .priv = &(struct opt_priv_t) { \
+ .print = print_int, \
+ .parse = parse_int, \
+ .offset = offsetof(struct pl_options_t, FIELD), \
+ .size = sizeof(struct { \
+ int dummy; \
+ pl_static_assert(sizeof(defaults.FIELD) == sizeof(int)); \
+ }), \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define OPT_FLOAT(KEY, NAME, FIELD, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_FLOAT, \
+ .priv = &(struct opt_priv_t) { \
+ .print = print_float, \
+ .parse = parse_float, \
+ .offset = offsetof(struct pl_options_t, FIELD), \
+ .size = sizeof(struct { \
+ float dummy; \
+ pl_static_assert(sizeof(defaults.FIELD) == sizeof(float)); \
+ }), \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define OPT_ENABLE_PARAMS(KEY, NAME, PARAMS, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_BOOL, \
+ .priv = &(struct opt_priv_t) { \
+ .compare = compare_params, \
+ .print = print_params, \
+ .parse = parse_params, \
+ .offset = offsetof(struct pl_options_t, params.PARAMS), \
+ .offset_params = offsetof(struct pl_options_t, PARAMS), \
+ .size = sizeof(struct { \
+ void *dummy; \
+ pl_static_assert(sizeof(defaults.params.PARAMS) == sizeof(void*));\
+ }), \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define OPT_ENUM(KEY, NAME, FIELD, VALUES, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_STRING, \
+ .priv = &(struct opt_priv_t) { \
+ .print = print_enum, \
+ .parse = parse_enum, \
+ .offset = offsetof(struct pl_options_t, FIELD), \
+ .size = sizeof(struct { \
+ unsigned dummy; \
+ pl_static_assert(sizeof(defaults.FIELD) == sizeof(unsigned)); \
+ }), \
+ .values = (struct enum_val[]) { VALUES } \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define OPT_PRESET(KEY, NAME, PARAMS, PRESETS, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_STRING, \
+ .preset = true, \
+ .priv = &(struct opt_priv_t) { \
+ .parse = parse_preset, \
+ .offset = offsetof(struct pl_options_t, PARAMS), \
+ .size = sizeof(defaults.PARAMS), \
+ .presets = (struct preset[]) { PRESETS }, \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define OPT_NAMED(KEY, NAME, FIELD, NAMES, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_STRING, \
+ .priv = &(struct opt_priv_t) { \
+ .print = print_named, \
+ .parse = parse_named, \
+ .offset = offsetof(struct pl_options_t, FIELD), \
+ .names = (const struct named * const * ) NAMES, \
+ .size = sizeof(struct { \
+ const struct named *dummy; \
+ pl_static_assert(offsetof(__typeof__(*NAMES[0]), name) == 0); \
+ pl_static_assert(sizeof(defaults.FIELD) == \
+ sizeof(const struct named *)); \
+ }), \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define OPT_SCALER(KEY, NAME, SCALER, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_STRING, \
+ .priv = &(struct opt_priv_t) { \
+ .print = print_scaler, \
+ .parse = parse_scaler, \
+ .offset = offsetof(struct pl_options_t, params.SCALER), \
+ .offset_params = offsetof(struct pl_options_t, SCALER), \
+ .size = sizeof(struct { \
+ const struct pl_filter_config *dummy; \
+ pl_static_assert(sizeof(defaults.SCALER) == \
+ sizeof(struct pl_filter_config)); \
+ }), \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define OPT_SCALER_PRESET(KEY, NAME, SCALER, ...) \
+ { \
+ .key = KEY, \
+ .name = NAME, \
+ .type = PL_OPT_STRING, \
+ .preset = true, \
+ .priv = &(struct opt_priv_t) { \
+ .parse = parse_scaler_preset, \
+ .offset = offsetof(struct pl_options_t, SCALER), \
+ .size = sizeof(struct { \
+ struct pl_filter_config dummy; \
+ pl_static_assert(sizeof(defaults.SCALER) == \
+ sizeof(struct pl_filter_config)); \
+ }), \
+ }, \
+ __VA_ARGS__ \
+ }
+
+#define LIST(...) __VA_ARGS__, {0}
+
+#define SCALE_OPTS(PREFIX, NAME, FIELD) \
+ OPT_SCALER(PREFIX, NAME, FIELD), \
+ OPT_SCALER_PRESET(PREFIX"_preset", NAME" preset", FIELD), \
+ OPT_NAMED(PREFIX"_kernel", NAME" kernel", FIELD.kernel, pl_filter_functions), \
+ OPT_NAMED(PREFIX"_window", NAME" window", FIELD.window, pl_filter_functions), \
+ OPT_FLOAT(PREFIX"_radius", NAME" radius", FIELD.radius, .min = 0.0, .max = 16.0), \
+ OPT_FLOAT(PREFIX"_clamp", NAME" clamping", FIELD.clamp, .max = 1.0), \
+ OPT_FLOAT(PREFIX"_blur", NAME" blur factor", FIELD.blur, .max = 100.0), \
+ OPT_FLOAT(PREFIX"_taper", NAME" taper factor", FIELD.taper, .max = 1.0), \
+ OPT_FLOAT(PREFIX"_antiring", NAME" antiringing", FIELD.antiring, .max = 1.0), \
+ OPT_FLOAT(PREFIX"_param1", NAME" parameter 1", FIELD.params[0]), \
+ OPT_FLOAT(PREFIX"_param2", NAME" parameter 2", FIELD.params[1]), \
+ OPT_FLOAT(PREFIX"_wparam1", NAME" window parameter 1", FIELD.wparams[0]), \
+ OPT_FLOAT(PREFIX"_wparam2", NAME" window parameter 2", FIELD.wparams[1]), \
+ OPT_BOOL(PREFIX"_polar", NAME" polar", FIELD.polar)
+
+const struct pl_opt_t pl_option_list[] = {
+ OPT_PRESET("preset", "Global preset", params, LIST(
+ {"default", &pl_render_default_params},
+ {"fast", &pl_render_fast_params},
+ {"high_quality", &pl_render_high_quality_params})),
+
+ // Scalers
+ SCALE_OPTS("upscaler", "Upscaler", upscaler),
+ SCALE_OPTS("downscaler", "Downscaler", downscaler),
+ SCALE_OPTS("plane_upscaler", "Plane upscaler", plane_upscaler),
+ SCALE_OPTS("plane_downscaler", "Plane downscaler", plane_downscaler),
+ SCALE_OPTS("frame_mixer", "Frame mixer", frame_mixer),
+ OPT_FLOAT("antiringing_strength", "Anti-ringing strength", params.antiringing_strength, .max = 1.0),
+
+ // Debanding
+ OPT_ENABLE_PARAMS("deband", "Enable debanding", deband_params),
+ OPT_PRESET("deband_preset", "Debanding preset", deband_params, LIST(
+ {"default", &pl_deband_default_params})),
+ OPT_INT("deband_iterations", "Debanding iterations", deband_params.iterations, .max = 16),
+ OPT_FLOAT("deband_threshold", "Debanding threshold", deband_params.threshold, .max = 1000.0),
+ OPT_FLOAT("deband_radius", "Debanding radius", deband_params.radius, .max = 1000.0),
+ OPT_FLOAT("deband_grain", "Debanding grain", deband_params.grain, .max = 1000.0),
+ OPT_FLOAT("deband_grain_neutral_r", "Debanding grain neutral R", deband_params.grain_neutral[0]),
+ OPT_FLOAT("deband_grain_neutral_g", "Debanding grain neutral G", deband_params.grain_neutral[1]),
+ OPT_FLOAT("deband_grain_neutral_b", "Debanding grain neutral B", deband_params.grain_neutral[2]),
+
+ // Sigmoidization
+ OPT_ENABLE_PARAMS("sigmoid", "Enable sigmoidization", sigmoid_params),
+ OPT_PRESET("sigmoid_preset", "Sigmoidization preset", sigmoid_params, LIST(
+ {"default", &pl_sigmoid_default_params})),
+ OPT_FLOAT("sigmoid_center", "Sigmoidization center", sigmoid_params.center, .max = 1.0),
+ OPT_FLOAT("sigmoid_slope", "Sigmoidization slope", sigmoid_params.slope, .min = 1.0, .max = 20.0),
+
+ // Color adjustment
+ OPT_ENABLE_PARAMS("color_adjustment", "Enable color adjustment", color_adjustment),
+ OPT_PRESET("color_adjustment_preset", "Color adjustment preset", color_adjustment, LIST(
+ {"neutral", &pl_color_adjustment_neutral})),
+ OPT_FLOAT("brightness", "Brightness boost", color_adjustment.brightness, .min = -1.0, .max = 1.0),
+ OPT_FLOAT("contrast", "Contrast boost", color_adjustment.contrast, .max = 100.0),
+ OPT_FLOAT("saturation", "Saturation gain", color_adjustment.saturation, .max = 100.0),
+ OPT_FLOAT("hue", "Hue shift", color_adjustment.hue),
+ OPT_FLOAT("gamma", "Gamma adjustment", color_adjustment.gamma, .max = 100.0),
+ OPT_FLOAT("temperature", "Color temperature shift", color_adjustment.temperature,
+ .min = (2500 - 6500) / 3500.0, // see `pl_white_from_temp`
+ .max = (25000 - 6500) / 3500.0),
+
+ // Peak detection
+ OPT_ENABLE_PARAMS("peak_detect", "Enable peak detection", peak_detect_params),
+ OPT_PRESET("peak_detect_preset", "Peak detection preset", peak_detect_params, LIST(
+ {"default", &pl_peak_detect_default_params},
+ {"high_quality", &pl_peak_detect_high_quality_params})),
+ OPT_FLOAT("peak_smoothing_period", "Peak detection smoothing coefficient", peak_detect_params.smoothing_period, .max = 1000.0),
+ OPT_FLOAT("scene_threshold_low", "Scene change threshold low", peak_detect_params.scene_threshold_low, .max = 100.0),
+ OPT_FLOAT("scene_threshold_high", "Scene change threshold high", peak_detect_params.scene_threshold_high, .max = 100.0),
+ OPT_FLOAT("minimum_peak", "Minimum detected peak", peak_detect_params.minimum_peak, .max = 100.0, .deprecated = true),
+ OPT_FLOAT("peak_percentile", "Peak detection percentile", peak_detect_params.percentile, .max = 100.0),
+ OPT_BOOL("allow_delayed_peak", "Allow delayed peak detection", peak_detect_params.allow_delayed),
+
+ // Color mapping
+ OPT_ENABLE_PARAMS("color_map", "Enable color mapping", color_map_params),
+ OPT_PRESET("color_map_preset", "Color mapping preset", color_map_params, LIST(
+ {"default", &pl_color_map_default_params},
+ {"high_quality", &pl_color_map_high_quality_params})),
+ OPT_NAMED("gamut_mapping", "Gamut mapping function", color_map_params.gamut_mapping,
+ pl_gamut_map_functions),
+ OPT_FLOAT("perceptual_deadzone", "Gamut mapping perceptual deadzone", color_map_params.gamut_constants.perceptual_deadzone, .max = 1.0f),
+ OPT_FLOAT("perceptual_strength", "Gamut mapping perceptual strength", color_map_params.gamut_constants.perceptual_strength, .max = 1.0f),
+ OPT_FLOAT("colorimetric_gamma", "Gamut mapping colorimetric gamma", color_map_params.gamut_constants.colorimetric_gamma, .max = 10.0f),
+ OPT_FLOAT("softclip_knee", "Gamut mapping softclip knee point", color_map_params.gamut_constants.softclip_knee, .max = 1.0f),
+ OPT_FLOAT("softclip_desat", "Gamut mapping softclip desaturation strength", color_map_params.gamut_constants.softclip_desat, .max = 1.0f),
+ OPT_INT("lut3d_size_I", "Gamut 3DLUT size I", color_map_params.lut3d_size[0], .max = 1024),
+ OPT_INT("lut3d_size_C", "Gamut 3DLUT size C", color_map_params.lut3d_size[1], .max = 1024),
+ OPT_INT("lut3d_size_h", "Gamut 3DLUT size h", color_map_params.lut3d_size[2], .max = 1024),
+ OPT_BOOL("lut3d_tricubic", "Gamut 3DLUT tricubic interpolation", color_map_params.lut3d_tricubic),
+ OPT_BOOL("gamut_expansion", "Gamut expansion", color_map_params.gamut_expansion),
+ OPT_NAMED("tone_mapping", "Tone mapping function", color_map_params.tone_mapping_function,
+ pl_tone_map_functions),
+ OPT_FLOAT("knee_adaptation", "Tone mapping knee point adaptation", color_map_params.tone_constants.knee_adaptation, .max = 1.0f),
+ OPT_FLOAT("knee_minimum", "Tone mapping knee point minimum", color_map_params.tone_constants.knee_minimum, .max = 0.5f),
+ OPT_FLOAT("knee_maximum", "Tone mapping knee point maximum", color_map_params.tone_constants.knee_maximum, .min = 0.5f, .max = 1.0f),
+ OPT_FLOAT("knee_default", "Tone mapping knee point default", color_map_params.tone_constants.knee_default, .max = 1.0f),
+ OPT_FLOAT("knee_offset", "BT.2390 knee point offset", color_map_params.tone_constants.knee_offset, .min = 0.5f, .max = 2.0f),
+ OPT_FLOAT("slope_tuning", "Spline slope tuning strength", color_map_params.tone_constants.slope_tuning, .max = 10.0f),
+ OPT_FLOAT("slope_offset", "Spline slope tuning offset", color_map_params.tone_constants.slope_offset, .max = 1.0f),
+ OPT_FLOAT("spline_contrast", "Spline slope contrast", color_map_params.tone_constants.spline_contrast, .max = 1.5f),
+ OPT_FLOAT("reinhard_contrast", "Reinhard contrast", color_map_params.tone_constants.reinhard_contrast, .max = 1.0f),
+ OPT_FLOAT("linear_knee", "Tone mapping linear knee point", color_map_params.tone_constants.linear_knee, .max = 1.0f),
+ OPT_FLOAT("exposure", "Tone mapping linear exposure", color_map_params.tone_constants.exposure, .max = 10.0f),
+ OPT_BOOL("inverse_tone_mapping", "Inverse tone mapping", color_map_params.inverse_tone_mapping),
+ OPT_ENUM("tone_map_metadata", "Source of HDR metadata to use", color_map_params.metadata, LIST(
+ {"any", PL_HDR_METADATA_ANY},
+ {"none", PL_HDR_METADATA_NONE},
+ {"hdr10", PL_HDR_METADATA_HDR10},
+ {"hdr10plus", PL_HDR_METADATA_HDR10PLUS},
+ {"cie_y", PL_HDR_METADATA_CIE_Y})),
+ OPT_INT("tone_lut_size", "Tone mapping LUT size", color_map_params.lut_size, .max = 4096),
+ OPT_FLOAT("contrast_recovery", "HDR contrast recovery strength", color_map_params.contrast_recovery, .max = 2.0),
+ OPT_FLOAT("contrast_smoothness", "HDR contrast recovery smoothness", color_map_params.contrast_smoothness, .min = 1.0, .max = 32.0),
+ OPT_BOOL("force_tone_mapping_lut", "Force tone mapping LUT", color_map_params.force_tone_mapping_lut),
+ OPT_BOOL("visualize_lut", "Visualize tone mapping LUTs", color_map_params.visualize_lut),
+ OPT_FLOAT("visualize_lut_x0", "Visualization rect x0", color_map_params.visualize_rect.x0),
+ OPT_FLOAT("visualize_lut_y0", "Visualization rect y0", color_map_params.visualize_rect.y0),
+ OPT_FLOAT("visualize_lut_x1", "Visualization rect x0", color_map_params.visualize_rect.x1),
+ OPT_FLOAT("visualize_lut_y1", "Visualization rect y0", color_map_params.visualize_rect.y1),
+ OPT_FLOAT("visualize_hue", "Visualization hue slice", color_map_params.visualize_hue),
+ OPT_FLOAT("visualize_theta", "Visualization rotation", color_map_params.visualize_theta),
+ OPT_BOOL("show_clipping", "Highlight clipped pixels", color_map_params.show_clipping),
+ OPT_FLOAT("tone_mapping_param", "Tone mapping function parameter", color_map_params.tone_mapping_param, .deprecated = true),
+
+ // Dithering
+ OPT_ENABLE_PARAMS("dither", "Enable dithering", dither_params),
+ OPT_PRESET("dither_preset", "Dithering preset", dither_params, LIST(
+ {"default", &pl_dither_default_params})),
+ OPT_ENUM("dither_method", "Dither method", dither_params.method, LIST(
+ {"blue", PL_DITHER_BLUE_NOISE},
+ {"ordered_lut", PL_DITHER_ORDERED_LUT},
+ {"ordered", PL_DITHER_ORDERED_FIXED},
+ {"white", PL_DITHER_WHITE_NOISE})),
+ OPT_INT("dither_lut_size", "Dither LUT size", dither_params.lut_size, .min = 1, .max = 8),
+ OPT_BOOL("dither_temporal", "Temporal dithering", dither_params.temporal),
+
+ // ICC
+ OPT_ENABLE_PARAMS("icc", "Enable ICC settings", icc_params, .deprecated = true),
+ OPT_PRESET("icc_preset", "ICC preset", icc_params, LIST(
+ {"default", &pl_icc_default_params}), .deprecated = true),
+ OPT_ENUM("icc_intent", "ICC rendering intent", icc_params.intent, LIST(
+ {"auto", PL_INTENT_AUTO},
+ {"perceptual", PL_INTENT_PERCEPTUAL},
+ {"relative", PL_INTENT_RELATIVE_COLORIMETRIC},
+ {"saturation", PL_INTENT_SATURATION},
+ {"absolute", PL_INTENT_ABSOLUTE_COLORIMETRIC}), .deprecated = true),
+ OPT_INT("icc_size_r", "ICC 3DLUT size R", icc_params.size_r, .max = 256, .deprecated = true),
+ OPT_INT("icc_size_g", "ICC 3DLUT size G", icc_params.size_g, .max = 256, .deprecated = true),
+ OPT_INT("icc_size_b", "ICC 3DLUT size G", icc_params.size_b, .max = 256, .deprecated = true),
+ OPT_FLOAT("icc_max_luma", "ICC profile luma override", icc_params.max_luma, .max = 10000, .deprecated = true),
+ OPT_BOOL("icc_force_bpc", "Force ICC black point compensation", icc_params.force_bpc, .deprecated = true),
+
+ // Cone distortion
+ OPT_ENABLE_PARAMS("cone", "Enable cone distortion", cone_params),
+ OPT_PRESET("cone_preset", "Cone distortion preset", cone_params, LIST(
+ {"normal", &pl_vision_normal},
+ {"protanomaly", &pl_vision_protanomaly},
+ {"protanopia", &pl_vision_protanopia},
+ {"deuteranomaly", &pl_vision_deuteranomaly},
+ {"deuteranopia", &pl_vision_deuteranopia},
+ {"tritanomaly", &pl_vision_tritanomaly},
+ {"tritanopia", &pl_vision_tritanopia},
+ {"monochromacy", &pl_vision_monochromacy},
+ {"achromatopsia", &pl_vision_achromatopsia})),
+ OPT_ENUM("cones", "Cone selection", cone_params.cones, LIST(
+ {"none", PL_CONE_NONE},
+ {"l", PL_CONE_L},
+ {"m", PL_CONE_M},
+ {"s", PL_CONE_S},
+ {"lm", PL_CONE_LM},
+ {"ms", PL_CONE_MS},
+ {"ls", PL_CONE_LS},
+ {"lms", PL_CONE_LMS})),
+ OPT_FLOAT("cone_strength", "Cone distortion gain", cone_params.strength),
+
+ // Blending
+#define BLEND_VALUES LIST( \
+ {"zero", PL_BLEND_ZERO}, \
+ {"one", PL_BLEND_ONE}, \
+ {"alpha", PL_BLEND_SRC_ALPHA}, \
+ {"one_minus_alpha", PL_BLEND_ONE_MINUS_SRC_ALPHA})
+
+ OPT_ENABLE_PARAMS("blend", "Enable output blending", blend_params),
+ OPT_PRESET("blend_preset", "Output blending preset", blend_params, LIST(
+ {"alpha_overlay", &pl_alpha_overlay})),
+ OPT_ENUM("blend_src_rgb", "Source RGB blend mode", blend_params.src_rgb, BLEND_VALUES),
+ OPT_ENUM("blend_src_alpha", "Source alpha blend mode", blend_params.src_alpha, BLEND_VALUES),
+ OPT_ENUM("blend_dst_rgb", "Target RGB blend mode", blend_params.dst_rgb, BLEND_VALUES),
+ OPT_ENUM("blend_dst_alpha", "Target alpha blend mode", blend_params.dst_alpha, BLEND_VALUES),
+
+ // Deinterlacing
+ OPT_ENABLE_PARAMS("deinterlace", "Enable deinterlacing", deinterlace_params),
+ OPT_PRESET("deinterlace_preset", "Deinterlacing preset", deinterlace_params, LIST(
+ {"default", &pl_deinterlace_default_params})),
+ OPT_ENUM("deinterlace_algo", "Deinterlacing algorithm", deinterlace_params.algo, LIST(
+ {"weave", PL_DEINTERLACE_WEAVE},
+ {"bob", PL_DEINTERLACE_BOB},
+ {"yadif", PL_DEINTERLACE_YADIF})),
+ OPT_BOOL("deinterlace_skip_spatial", "Skip spatial interlacing check", deinterlace_params.skip_spatial_check),
+
+ // Distortion
+ OPT_ENABLE_PARAMS("distort", "Enable distortion", distort_params),
+ OPT_PRESET("distort_preset", "Distortion preset", distort_params, LIST(
+ {"default", &pl_distort_default_params})),
+ OPT_FLOAT("distort_scale_x", "Distortion X scale", distort_params.transform.mat.m[0][0]),
+ OPT_FLOAT("distort_scale_y", "Distortion Y scale", distort_params.transform.mat.m[1][1]),
+ OPT_FLOAT("distort_shear_x", "Distortion X shear", distort_params.transform.mat.m[0][1]),
+ OPT_FLOAT("distort_shear_y", "Distortion Y shear", distort_params.transform.mat.m[1][0]),
+ OPT_FLOAT("distort_offset_x", "Distortion X offset", distort_params.transform.c[0]),
+ OPT_FLOAT("distort_offset_y", "Distortion Y offset", distort_params.transform.c[1]),
+ OPT_BOOL("distort_unscaled", "Distortion unscaled", distort_params.unscaled),
+ OPT_BOOL("distort_constrain", "Constrain distortion", distort_params.constrain),
+ OPT_BOOL("distort_bicubic", "Distortion bicubic interpolation", distort_params.bicubic),
+ OPT_ENUM("distort_address_mode", "Distortion texture address mode", distort_params.address_mode, LIST(
+ {"clamp", PL_TEX_ADDRESS_CLAMP},
+ {"repeat", PL_TEX_ADDRESS_REPEAT},
+ {"mirror", PL_TEX_ADDRESS_MIRROR})),
+ OPT_ENUM("distort_alpha_mode", "Distortion alpha blending mode", distort_params.alpha_mode, LIST(
+ {"none", PL_ALPHA_UNKNOWN},
+ {"independent", PL_ALPHA_INDEPENDENT},
+ {"premultiplied", PL_ALPHA_PREMULTIPLIED})),
+
+ // Misc renderer settings
+ OPT_NAMED("error_diffusion", "Error diffusion kernel", params.error_diffusion,
+ pl_error_diffusion_kernels),
+ OPT_ENUM("lut_type", "Color mapping LUT type", params.lut_type, LIST(
+ {"unknown", PL_LUT_UNKNOWN},
+ {"native", PL_LUT_NATIVE},
+ {"normalized", PL_LUT_NORMALIZED},
+ {"conversion", PL_LUT_CONVERSION})),
+ OPT_FLOAT("background_r", "Background color R", params.background_color[0], .max = 1.0),
+ OPT_FLOAT("background_g", "Background color G", params.background_color[1], .max = 1.0),
+ OPT_FLOAT("background_b", "Background color B", params.background_color[2], .max = 1.0),
+ OPT_FLOAT("background_transparency", "Background color transparency", params.background_transparency, .max = 1),
+ OPT_BOOL("skip_target_clearing", "Skip target clearing", params.skip_target_clearing),
+ OPT_FLOAT("corner_rounding", "Corner rounding", params.corner_rounding, .max = 1.0),
+ OPT_BOOL("blend_against_tiles", "Blend against tiles", params.blend_against_tiles),
+ OPT_FLOAT("tile_color_hi_r", "Bright tile R", params.tile_colors[0][0], .max = 1.0),
+ OPT_FLOAT("tile_color_hi_g", "Bright tile G", params.tile_colors[0][1], .max = 1.0),
+ OPT_FLOAT("tile_color_hi_b", "Bright tile B", params.tile_colors[0][2], .max = 1.0),
+ OPT_FLOAT("tile_color_lo_r", "Dark tile R", params.tile_colors[1][0], .max = 1.0),
+ OPT_FLOAT("tile_color_lo_g", "Dark tile G", params.tile_colors[1][1], .max = 1.0),
+ OPT_FLOAT("tile_color_lo_b", "Dark tile B", params.tile_colors[1][2], .max = 1.0),
+ OPT_INT("tile_size", "Tile size", params.tile_size, .min = 2, .max = 256),
+
+ // Performance / quality trade-offs and debugging options
+ OPT_BOOL("skip_anti_aliasing", "Skip anti-aliasing", params.skip_anti_aliasing),
+ OPT_INT("lut_entries", "Scaler LUT entries", params.lut_entries, .max = 256, .deprecated = true),
+ OPT_FLOAT("polar_cutoff", "Polar LUT cutoff", params.polar_cutoff, .max = 1.0, .deprecated = true),
+ OPT_BOOL("preserve_mixing_cache", "Preserve mixing cache", params.preserve_mixing_cache),
+ OPT_BOOL("skip_caching_single_frame", "Skip caching single frame", params.skip_caching_single_frame),
+ OPT_BOOL("disable_linear_scaling", "Disable linear scaling", params.disable_linear_scaling),
+ OPT_BOOL("disable_builtin_scalers", "Disable built-in scalers", params.disable_builtin_scalers),
+ OPT_BOOL("correct_subpixel_offset", "Correct subpixel offsets", params.correct_subpixel_offsets),
+ OPT_BOOL("ignore_icc_profiles", "Ignore ICC profiles", params.ignore_icc_profiles, .deprecated = true),
+ OPT_BOOL("force_dither", "Force-enable dithering", params.force_dither),
+ OPT_BOOL("disable_dither_gamma_correction", "Disable gamma-correct dithering", params.disable_dither_gamma_correction),
+ OPT_BOOL("disable_fbos", "Disable FBOs", params.disable_fbos),
+ OPT_BOOL("force_low_bit_depth_fbos", "Force 8-bit FBOs", params.force_low_bit_depth_fbos),
+ OPT_BOOL("dynamic_constants", "Dynamic constants", params.dynamic_constants),
+ {0},
+};
+
+const int pl_option_count = PL_ARRAY_SIZE(pl_option_list) - 1;
+
+pl_opt pl_find_option(const char *key)
+{
+ for (int i = 0; i < pl_option_count; i++) {
+ if (!strcmp(key, pl_option_list[i].key))
+ return &pl_option_list[i];
+ }
+
+ return NULL;
+}
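+
+// Hedged example (a sketch, not part of this patch): resolving an option
+// descriptor by its key. `pl_find_option` returns NULL for unknown keys.
+//
+//     pl_opt opt = pl_find_option("deinterlace_algo");
+//     assert(opt != NULL);
+//     assert(pl_find_option("no_such_option") == NULL);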
diff --git a/src/os.h b/src/os.h
new file mode 100644
index 0000000..386f0cb
--- /dev/null
+++ b/src/os.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#ifdef __unix__
+#define PL_HAVE_UNIX
+#endif
+
+#ifdef _WIN32
+#define PL_HAVE_WIN32
+#endif
+
+#ifdef __APPLE__
+#define PL_HAVE_APPLE
+#endif
diff --git a/src/pl_alloc.c b/src/pl_alloc.c
new file mode 100644
index 0000000..64eeda7
--- /dev/null
+++ b/src/pl_alloc.c
@@ -0,0 +1,313 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+struct header {
+#ifndef NDEBUG
+#define MAGIC 0x20210119LU
+ uint32_t magic;
+#endif
+ size_t size;
+ struct header *parent;
+ struct ext *ext;
+
+ // Pointer to actual data, for alignment purposes
+ max_align_t data[];
+};
+
+// Lazily allocated, to save space for leaf allocations and other allocations
+// that never acquire any children
+struct ext {
+ size_t num_children;
+ size_t children_size; // total allocated size of `children`
+ struct header *children[];
+};
+
+#define PTR_OFFSET offsetof(struct header, data)
+#define MAX_ALLOC (SIZE_MAX - PTR_OFFSET)
+#define MINIMUM_CHILDREN 4
+
+static inline struct header *get_header(void *ptr)
+{
+ if (!ptr)
+ return NULL;
+
+ struct header *hdr = (struct header *) ((uintptr_t) ptr - PTR_OFFSET);
+#ifndef NDEBUG
+ assert(hdr->magic == MAGIC);
+#endif
+
+ return hdr;
+}
+
+static inline void *oom(void)
+{
+ fprintf(stderr, "out of memory\n");
+ abort();
+}
+
+static inline struct ext *alloc_ext(struct header *h)
+{
+ if (!h)
+ return NULL;
+
+ if (!h->ext) {
+ h->ext = malloc(sizeof(struct ext) + MINIMUM_CHILDREN * sizeof(void *));
+ if (!h->ext)
+ oom();
+ h->ext->num_children = 0;
+ h->ext->children_size = MINIMUM_CHILDREN;
+ }
+
+ return h->ext;
+}
+
+static inline void attach_child(struct header *parent, struct header *child)
+{
+ child->parent = parent;
+ if (!parent)
+ return;
+
+ struct ext *ext = alloc_ext(parent);
+ if (ext->num_children == ext->children_size) {
+ size_t new_size = ext->children_size * 2;
+ ext = realloc(ext, sizeof(struct ext) + new_size * sizeof(void *));
+ if (!ext)
+ oom();
+ ext->children_size = new_size;
+ parent->ext = ext;
+ }
+
+ ext->children[ext->num_children++] = child;
+}
+
+static inline void unlink_child(struct header *parent, struct header *child)
+{
+ child->parent = NULL;
+ if (!parent)
+ return;
+
+ struct ext *ext = parent->ext;
+ for (size_t i = 0; i < ext->num_children; i++) {
+ if (ext->children[i] == child) {
+ memmove(&ext->children[i], &ext->children[i + 1],
+ (--ext->num_children - i) * sizeof(ext->children[0]));
+ return;
+ }
+ }
+
+ assert(!"unlinking orphaned child?");
+}
+
+void *pl_alloc(void *parent, size_t size)
+{
+ if (size >= MAX_ALLOC)
+ return oom();
+
+ struct header *h = malloc(PTR_OFFSET + size);
+ if (!h)
+ return oom();
+
+#ifndef NDEBUG
+ h->magic = MAGIC;
+#endif
+ h->size = size;
+ h->ext = NULL;
+
+ attach_child(get_header(parent), h);
+ return h->data;
+}
+
+void *pl_zalloc(void *parent, size_t size)
+{
+ if (size >= MAX_ALLOC)
+ return oom();
+
+ struct header *h = calloc(1, PTR_OFFSET + size);
+ if (!h)
+ return oom();
+
+#ifndef NDEBUG
+ h->magic = MAGIC;
+#endif
+ h->size = size;
+
+ attach_child(get_header(parent), h);
+ return h->data;
+}
+
+void *pl_realloc(void *parent, void *ptr, size_t size)
+{
+ if (size >= MAX_ALLOC)
+ return oom();
+ if (!ptr)
+ return pl_alloc(parent, size);
+
+ struct header *h = get_header(ptr);
+ assert(get_header(parent) == h->parent);
+ if (h->size == size)
+ return ptr;
+
+ struct header *old_h = h;
+ h = realloc(h, PTR_OFFSET + size);
+ if (!h)
+ return oom();
+
+ h->size = size;
+
+ if (h != old_h) {
+ if (h->parent) {
+ struct ext *ext = h->parent->ext;
+ for (size_t i = 0; i < ext->num_children; i++) {
+ if (ext->children[i] == old_h) {
+ ext->children[i] = h;
+ goto done_reparenting;
+ }
+ }
+ assert(!"reallocating orphaned child?");
+ }
+done_reparenting:
+
+ if (h->ext) {
+ for (size_t i = 0; i < h->ext->num_children; i++)
+ h->ext->children[i]->parent = h;
+ }
+ }
+
+ return h->data;
+}
+
+void pl_free(void *ptr)
+{
+ struct header *h = get_header(ptr);
+ if (!h)
+ return;
+
+ pl_free_children(ptr);
+ unlink_child(h->parent, h);
+
+ free(h->ext);
+ free(h);
+}
+
+void pl_free_children(void *ptr)
+{
+ struct header *h = get_header(ptr);
+ if (!h || !h->ext)
+ return;
+
+#ifndef NDEBUG
+ // this detects recursive hierarchies
+ h->magic = 0;
+#endif
+
+ for (size_t i = 0; i < h->ext->num_children; i++) {
+ h->ext->children[i]->parent = NULL; // prevent recursive access
+ pl_free(h->ext->children[i]->data);
+ }
+ h->ext->num_children = 0;
+
+#ifndef NDEBUG
+ h->magic = MAGIC;
+#endif
+}
+
+size_t pl_get_size(const void *ptr)
+{
+ const struct header *h = get_header((void *) ptr);
+ return h ? h->size : 0;
+}
+
+void *pl_steal(void *parent, void *ptr)
+{
+ struct header *h = get_header(ptr);
+ if (!h)
+ return NULL;
+
+ struct header *new_par = get_header(parent);
+ if (new_par != h->parent) {
+ unlink_child(h->parent, h);
+ attach_child(new_par, h);
+ }
+
+ return h->data;
+}
+
+void *pl_memdup(void *parent, const void *ptr, size_t size)
+{
+ if (!size)
+ return NULL;
+
+ void *new = pl_alloc(parent, size);
+ if (!new)
+ return oom();
+
+ assert(ptr);
+ memcpy(new, ptr, size);
+ return new;
+}
+
+char *pl_str0dup0(void *parent, const char *str)
+{
+ if (!str)
+ return NULL;
+
+ return pl_memdup(parent, str, strlen(str) + 1);
+}
+
+char *pl_strndup0(void *parent, const char *str, size_t size)
+{
+ if (!str)
+ return NULL;
+
+ size_t str_size = strnlen(str, size);
+ char *new = pl_alloc(parent, str_size + 1);
+ if (!new)
+ return oom();
+ memcpy(new, str, str_size);
+ new[str_size] = '\0';
+ return new;
+}
+
+char *pl_asprintf(void *parent, const char *fmt, ...)
+{
+ char *str;
+ va_list ap;
+ va_start(ap, fmt);
+ str = pl_vasprintf(parent, fmt, ap);
+ va_end(ap);
+ return str;
+}
+
+char *pl_vasprintf(void *parent, const char *fmt, va_list ap)
+{
+ // First, we need to determine the size that will be required for
+ // printing the entire string. Do this by making a copy of the va_list
+ // and printing it to a null buffer.
+ va_list copy;
+ va_copy(copy, ap);
+ int size = vsnprintf(NULL, 0, fmt, copy);
+ va_end(copy);
+ if (size < 0)
+ return NULL;
+
+ char *str = pl_alloc(parent, size + 1);
+ vsnprintf(str, size + 1, fmt, ap);
+ return str;
+}
diff --git a/src/pl_alloc.h b/src/pl_alloc.h
new file mode 100644
index 0000000..78df08a
--- /dev/null
+++ b/src/pl_alloc.h
@@ -0,0 +1,191 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdalign.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+// Unlike standard malloc, `size` may be 0, in which case this returns an empty
+// allocation which can still be used as a parent for other allocations.
+void *pl_alloc(void *parent, size_t size);
+void *pl_zalloc(void *parent, size_t size);
+void *pl_realloc(void *parent, void *ptr, size_t size);
+
+static inline void *pl_calloc(void *parent, size_t count, size_t size)
+{
+ return pl_zalloc(parent, count * size);
+}
+
+#define pl_tmp(parent) pl_alloc(parent, 0)
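+
+// Illustrative sketch (not part of the API itself): typical use of the
+// parent/child hierarchy. Freeing the root frees every descendant allocation.
+//
+//     struct ctx *ctx = pl_zalloc_ptr(NULL, ctx);      // root allocation
+//     ctx->name = pl_str0dup0(ctx, "example");         // child of `ctx`
+//     float *tmp = pl_calloc(ctx, 16, sizeof(float));  // another child
+//     pl_free(ctx);                                    // frees ctx, name, tmp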
+
+// Variants of the above which resolve to sizeof(*ptr)
+#define pl_alloc_ptr(parent, ptr) \
+ (__typeof__(ptr)) pl_alloc(parent, sizeof(*(ptr)))
+#define pl_zalloc_ptr(parent, ptr) \
+ (__typeof__(ptr)) pl_zalloc(parent, sizeof(*(ptr)))
+#define pl_calloc_ptr(parent, num, ptr) \
+ (__typeof__(ptr)) pl_calloc(parent, num, sizeof(*(ptr)))
+
+// Helper function to allocate a struct and immediately assign it
+#define pl_alloc_struct(parent, type, ...) \
+ (type *) pl_memdup(parent, &(type) __VA_ARGS__, sizeof(type))
+
+// Free an allocation and its children (recursively)
+void pl_free(void *ptr);
+void pl_free_children(void *ptr);
+
+#define pl_free_ptr(ptr) \
+ do { \
+ pl_free(*(ptr)); \
+ *(ptr) = NULL; \
+ } while (0)
+
+// Get the current size of an allocation.
+size_t pl_get_size(const void *ptr);
+
+#define pl_grow(parent, ptr, size) \
+ do { \
+ size_t _size = (size); \
+ if (_size > pl_get_size(*(ptr))) \
+ *(ptr) = pl_realloc(parent, *(ptr), _size); \
+ } while (0)
+
+// Reparent an allocation onto a new parent
+void *pl_steal(void *parent, void *ptr);
+
+// Wrapper functions around common string utilities
+void *pl_memdup(void *parent, const void *ptr, size_t size);
+char *pl_str0dup0(void *parent, const char *str);
+char *pl_strndup0(void *parent, const char *str, size_t size);
+
+#define pl_memdup_ptr(parent, ptr) \
+ (__typeof__(ptr)) pl_memdup(parent, ptr, sizeof(*(ptr)))
+
+// Helper functions for allocating public/private pairs, done by allocating
+// `priv` at the address of `pub` + sizeof(pub), rounded up to the maximum
+// alignment requirements.
+
+#define PL_ALIGN_MEM(size) PL_ALIGN2(size, alignof(max_align_t))
+
+#define PL_PRIV(pub) \
+ (void *) ((uintptr_t) (pub) + PL_ALIGN_MEM(sizeof(*(pub))))
+
+#define pl_alloc_obj(parent, ptr, priv) \
+ (__typeof__(ptr)) pl_alloc(parent, PL_ALIGN_MEM(sizeof(*(ptr))) + sizeof(priv))
+
+#define pl_zalloc_obj(parent, ptr, priv) \
+ (__typeof__(ptr)) pl_zalloc(parent, PL_ALIGN_MEM(sizeof(*(ptr))) + sizeof(priv))
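+
+// Hedged usage sketch for the pub/priv helpers above; `pub_t` and `priv_t`
+// are hypothetical types, not part of libplacebo:
+//
+//     struct pub_t *pub = pl_zalloc_obj(NULL, pub, struct priv_t);
+//     struct priv_t *priv = PL_PRIV(pub);  // placed directly after *pub
+//     pl_free(pub);                        // frees both at once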
+
+// Helper functions for dealing with arrays
+
+#define PL_ARRAY(type) struct { type *elem; int num; }
+
+#define PL_ARRAY_REALLOC(parent, arr, len) \
+ do { \
+ size_t _new_size = (len) * sizeof((arr).elem[0]); \
+ (arr).elem = pl_realloc((void *) parent, (arr).elem, _new_size); \
+ } while (0)
+
+#define PL_ARRAY_RESIZE(parent, arr, len) \
+ do { \
+ size_t _avail = pl_get_size((arr).elem) / sizeof((arr).elem[0]); \
+ size_t _min_len = (len); \
+ if (_avail < _min_len) \
+ PL_ARRAY_REALLOC(parent, arr, _min_len); \
+ } while (0)
+
+#define PL_ARRAY_MEMDUP(parent, arr, ptr, len) \
+ do { \
+ size_t _len = (len); \
+ PL_ARRAY_RESIZE(parent, arr, _len); \
+ memcpy((arr).elem, ptr, _len * sizeof((arr).elem[0])); \
+ (arr).num = _len; \
+ } while (0)
+
+#define PL_ARRAY_GROW(parent, arr) \
+ do { \
+ size_t _avail = pl_get_size((arr).elem) / sizeof((arr).elem[0]); \
+ if (_avail < 10) { \
+ PL_ARRAY_REALLOC(parent, arr, 10); \
+ } else if ((arr).num == _avail) { \
+ PL_ARRAY_REALLOC(parent, arr, (arr).num * 1.5); \
+ } else { \
+ assert((arr).elem); \
+ } \
+ } while (0)
+
+#define PL_ARRAY_APPEND(parent, arr, ...) \
+ do { \
+ PL_ARRAY_GROW(parent, arr); \
+ (arr).elem[(arr).num++] = __VA_ARGS__; \
+ } while (0)
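+
+// Minimal sketch of how the array helpers above compose (identifiers are
+// illustrative only; `parent` is any allocation created with pl_alloc):
+//
+//     PL_ARRAY(int) values = {0};
+//     PL_ARRAY_APPEND(parent, values, 42);
+//     PL_ARRAY_APPEND(parent, values, 123);
+//     for (int i = 0; i < values.num; i++)
+//         printf("%d\n", values.elem[i]);  // `values.elem` is owned by `parent`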
+
+#define PL_ARRAY_CONCAT(parent, to, from) \
+ do { \
+ if ((from).num) { \
+ PL_ARRAY_RESIZE(parent, to, (to).num + (from).num); \
+ memmove(&(to).elem[(to).num], (from).elem, \
+ (from).num * sizeof((from).elem[0])); \
+ (to).num += (from).num; \
+ } \
+ } while (0)
+
+#define PL_ARRAY_REMOVE_RANGE(arr, idx, count) \
+ do { \
+ ptrdiff_t _idx = (idx); \
+ if (_idx < 0) \
+ _idx += (arr).num; \
+ size_t _count = (count); \
+ assert(_idx >= 0 && _idx + _count <= (arr).num); \
+ memmove(&(arr).elem[_idx], &(arr).elem[_idx + _count], \
+ ((arr).num - _idx - _count) * sizeof((arr).elem[0])); \
+ (arr).num -= _count; \
+ } while (0)
+
+#define PL_ARRAY_REMOVE_AT(arr, idx) PL_ARRAY_REMOVE_RANGE(arr, idx, 1)
+
+#define PL_ARRAY_INSERT_AT(parent, arr, idx, ...) \
+ do { \
+ ptrdiff_t _idx = (idx); \
+ if (_idx < 0) \
+ _idx += (arr).num + 1; \
+ assert(_idx >= 0 && _idx <= (arr).num); \
+ PL_ARRAY_GROW(parent, arr); \
+ memmove(&(arr).elem[_idx + 1], &(arr).elem[_idx], \
+ ((arr).num++ - _idx) * sizeof((arr).elem[0])); \
+ (arr).elem[_idx] = __VA_ARGS__; \
+ } while (0)
+
+// Returns whether or not there was any element to pop
+#define PL_ARRAY_POP(arr, out) \
+ ((arr).num > 0 \
+ ? (*(out) = (arr).elem[--(arr).num], true) \
+ : false \
+ )
+
+// Wrapper for dealing with non-PL_ARRAY arrays
+#define PL_ARRAY_APPEND_RAW(parent, arr, idxvar, ...) \
+ do { \
+ PL_ARRAY(__typeof__((arr)[0])) _arr = { (arr), (idxvar) }; \
+ PL_ARRAY_APPEND(parent, _arr, __VA_ARGS__); \
+ (arr) = _arr.elem; \
+ (idxvar) = _arr.num; \
+ } while (0)
diff --git a/src/pl_assert.h b/src/pl_assert.h
new file mode 100644
index 0000000..b4c6656
--- /dev/null
+++ b/src/pl_assert.h
@@ -0,0 +1,37 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <assert.h>
+
+#ifndef NDEBUG
+# define pl_assert assert
+#else
+# define pl_assert(expr) \
+ do { \
+ if (!(expr)) { \
+ fprintf(stderr, "Assertion failed: %s in %s:%d\n", \
+ #expr, __FILE__, __LINE__); \
+ abort(); \
+ } \
+ } while (0)
+#endif
+
+// In C11, static asserts must have a string message
+#define pl_static_assert(expr) static_assert(expr, #expr)
diff --git a/src/pl_clock.h b/src/pl_clock.h
new file mode 100644
index 0000000..541ef0b
--- /dev/null
+++ b/src/pl_clock.h
@@ -0,0 +1,98 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <time.h>
+#include <stdint.h>
+
+#include "os.h"
+
+#ifdef PL_HAVE_WIN32
+# include <windows.h>
+# define PL_CLOCK_QPC
+#elif defined(PL_HAVE_APPLE)
+# include <Availability.h>
+# if (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101200) || \
+ (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 100000) || \
+ (defined(__TV_OS_VERSION_MIN_REQUIRED) && __TV_OS_VERSION_MIN_REQUIRED < 100000) || \
+ (defined(__WATCH_OS_VERSION_MIN_REQUIRED) && __WATCH_OS_VERSION_MIN_REQUIRED < 30000) || \
+ !defined(CLOCK_MONOTONIC_RAW)
+# include <mach/mach_time.h>
+# define PL_CLOCK_MACH
+# else
+# define PL_CLOCK_MONOTONIC_RAW
+# endif
+#elif defined(CLOCK_MONOTONIC_RAW)
+# define PL_CLOCK_MONOTONIC_RAW
+#elif defined(TIME_UTC)
+# define PL_CLOCK_TIMESPEC_GET
+#else
+# warning "pl_clock not implemented for this platform!"
+#endif
+
+typedef uint64_t pl_clock_t;
+
+static inline pl_clock_t pl_clock_now(void)
+{
+#if defined(PL_CLOCK_QPC)
+
+ LARGE_INTEGER counter;
+ QueryPerformanceCounter(&counter);
+ return counter.QuadPart;
+
+#elif defined(PL_CLOCK_MACH)
+
+ return mach_absolute_time();
+
+#else
+
+ struct timespec tp = { .tv_sec = 0, .tv_nsec = 0 };
+#if defined(PL_CLOCK_MONOTONIC_RAW)
+ clock_gettime(CLOCK_MONOTONIC_RAW, &tp);
+#elif defined(PL_CLOCK_TIMESPEC_GET)
+ timespec_get(&tp, TIME_UTC);
+#endif
+ return tp.tv_sec * UINT64_C(1000000000) + tp.tv_nsec;
+
+#endif
+}
+
+static inline double pl_clock_diff(pl_clock_t a, pl_clock_t b)
+{
+ double frequency = 1e9;
+
+#if defined(PL_CLOCK_QPC)
+
+ LARGE_INTEGER freq;
+ QueryPerformanceFrequency(&freq);
+ frequency = freq.QuadPart;
+
+#elif defined(PL_CLOCK_MACH)
+
+ mach_timebase_info_data_t time_base;
+ if (mach_timebase_info(&time_base) != KERN_SUCCESS)
+ return 0;
+ frequency = (time_base.denom * 1e9) / time_base.numer;
+
+#endif
+
+ if (b > a)
+ return (b - a) / -frequency;
+ else
+ return (a - b) / frequency;
+}
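+
+// Example (a sketch, assuming only the functions above): measuring a span of
+// wall-clock time in seconds, independent of the platform backend.
+//
+//     pl_clock_t start = pl_clock_now();
+//     do_work();                                        // hypothetical workload
+//     double seconds = pl_clock_diff(pl_clock_now(), start);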
diff --git a/src/pl_string.c b/src/pl_string.c
new file mode 100644
index 0000000..ba25971
--- /dev/null
+++ b/src/pl_string.c
@@ -0,0 +1,418 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "hash.h"
+
+static void grow_str(void *alloc, pl_str *str, size_t len)
+{
+ // Like pl_grow, but with some extra headroom
+ if (len > pl_get_size(str->buf))
+ str->buf = pl_realloc(alloc, str->buf, len * 1.5);
+}
+
+void pl_str_append(void *alloc, pl_str *str, pl_str append)
+{
+ // Also append an extra \0 for convenience, since a lot of the time
+ // this function will be used to generate a string buffer
+ grow_str(alloc, str, str->len + append.len + 1);
+ if (append.len) {
+ memcpy(str->buf + str->len, append.buf, append.len);
+ str->len += append.len;
+ }
+ str->buf[str->len] = '\0';
+}
+
+void pl_str_append_raw(void *alloc, pl_str *str, const void *ptr, size_t size)
+{
+ if (!size)
+ return;
+ grow_str(alloc, str, str->len + size);
+ memcpy(str->buf + str->len, ptr, size);
+ str->len += size;
+}
+
+void pl_str_append_asprintf(void *alloc, pl_str *str, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ pl_str_append_vasprintf(alloc, str, fmt, ap);
+ va_end(ap);
+}
+
+void pl_str_append_vasprintf(void *alloc, pl_str *str, const char *fmt, va_list ap)
+{
+ // First, we need to determine the size that will be required for
+ // printing the entire string. Do this by making a copy of the va_list
+ // and printing it to a null buffer.
+ va_list copy;
+ va_copy(copy, ap);
+ int size = vsnprintf(NULL, 0, fmt, copy);
+ va_end(copy);
+ if (size < 0)
+ return;
+
+ // Make room in `str` and format to there directly
+ grow_str(alloc, str, str->len + size + 1);
+ str->len += vsnprintf((char *) (str->buf + str->len), size + 1, fmt, ap);
+}
+
+int pl_str_sscanf(pl_str str, const char *fmt, ...)
+{
+ char *tmp = pl_strdup0(NULL, str);
+ va_list va;
+ va_start(va, fmt);
+ int ret = vsscanf(tmp, fmt, va);
+ va_end(va);
+ pl_free(tmp);
+ return ret;
+}
+
+int pl_strchr(pl_str str, int c)
+{
+ if (!str.len)
+ return -1;
+
+ void *pos = memchr(str.buf, c, str.len);
+ if (pos)
+ return (intptr_t) pos - (intptr_t) str.buf;
+ return -1;
+}
+
+size_t pl_strspn(pl_str str, const char *accept)
+{
+ for (size_t i = 0; i < str.len; i++) {
+ if (!strchr(accept, str.buf[i]))
+ return i;
+ }
+
+ return str.len;
+}
+
+size_t pl_strcspn(pl_str str, const char *reject)
+{
+ for (size_t i = 0; i < str.len; i++) {
+ if (strchr(reject, str.buf[i]))
+ return i;
+ }
+
+ return str.len;
+}
+
+static inline bool pl_isspace(char c)
+{
+ switch (c) {
+ case ' ':
+ case '\n':
+ case '\r':
+ case '\t':
+ case '\v':
+ case '\f':
+ return true;
+ default:
+ return false;
+ }
+}
+
+pl_str pl_str_strip(pl_str str)
+{
+ while (str.len && pl_isspace(str.buf[0])) {
+ str.buf++;
+ str.len--;
+ }
+ while (str.len && pl_isspace(str.buf[str.len - 1]))
+ str.len--;
+ return str;
+}
+
+int pl_str_find(pl_str haystack, pl_str needle)
+{
+ if (!needle.len)
+ return 0;
+
+ for (size_t i = 0; i + needle.len <= haystack.len; i++) {
+ if (memcmp(&haystack.buf[i], needle.buf, needle.len) == 0)
+ return i;
+ }
+
+ return -1;
+}
+
+pl_str pl_str_split_char(pl_str str, char sep, pl_str *out_rest)
+{
+ int pos = pl_strchr(str, sep);
+ if (pos < 0) {
+ if (out_rest)
+ *out_rest = (pl_str) {0};
+ return str;
+ } else {
+ if (out_rest)
+ *out_rest = pl_str_drop(str, pos + 1);
+ return pl_str_take(str, pos);
+ }
+}
+
+pl_str pl_str_split_chars(pl_str str, const char *seps, pl_str *out_rest)
+{
+    size_t pos = pl_strcspn(str, seps);
+    if (pos == str.len) {
+ if (out_rest)
+ *out_rest = (pl_str) {0};
+ return str;
+ } else {
+ if (out_rest)
+ *out_rest = pl_str_drop(str, pos + 1);
+ return pl_str_take(str, pos);
+ }
+}
+
+pl_str pl_str_split_str(pl_str str, pl_str sep, pl_str *out_rest)
+{
+ int pos = pl_str_find(str, sep);
+ if (pos < 0) {
+ if (out_rest)
+ *out_rest = (pl_str) {0};
+ return str;
+ } else {
+ if (out_rest)
+ *out_rest = pl_str_drop(str, pos + sep.len);
+ return pl_str_take(str, pos);
+ }
+}
+
+static bool get_hexdigit(pl_str *str, int *digit)
+{
+ while (str->len && pl_isspace(str->buf[0])) {
+ str->buf++;
+ str->len--;
+ }
+
+ if (!str->len) {
+ *digit = -1; // EOF
+ return true;
+ }
+
+ char c = str->buf[0];
+ str->buf++;
+ str->len--;
+
+ if (c >= '0' && c <= '9') {
+ *digit = c - '0';
+ } else if (c >= 'a' && c <= 'f') {
+ *digit = c - 'a' + 10;
+ } else if (c >= 'A' && c <= 'F') {
+ *digit = c - 'A' + 10;
+ } else {
+ return false; // invalid char
+ }
+
+ return true;
+}
+
+bool pl_str_decode_hex(void *alloc, pl_str hex, pl_str *out)
+{
+ if (!out)
+ return false;
+
+ uint8_t *buf = pl_alloc(alloc, hex.len / 2);
+ int len = 0;
+
+ while (hex.len) {
+ int a, b;
+ if (!get_hexdigit(&hex, &a) || !get_hexdigit(&hex, &b))
+ goto error; // invalid char
+ if (a < 0) // EOF
+ break;
+ if (b < 0) // only one digit
+ goto error;
+
+ buf[len++] = (a << 4) | b;
+ }
+
+ *out = (pl_str) { buf, len };
+ return true;
+
+error:
+ pl_free(buf);
+ return false;
+}
+
+struct pl_str_builder_t {
+ PL_ARRAY(pl_str_template) templates;
+ pl_str args;
+ pl_str output;
+};
+
+pl_str_builder pl_str_builder_alloc(void *alloc)
+{
+ pl_str_builder b = pl_zalloc_ptr(alloc, b);
+ return b;
+}
+
+void pl_str_builder_free(pl_str_builder *b)
+{
+ if (*b)
+ pl_free_ptr(b);
+}
+
+void pl_str_builder_reset(pl_str_builder b)
+{
+ *b = (struct pl_str_builder_t) {
+ .templates.elem = b->templates.elem,
+ .args.buf = b->args.buf,
+ .output.buf = b->output.buf,
+ };
+}
+
+uint64_t pl_str_builder_hash(const pl_str_builder b)
+{
+ size_t size = b->templates.num * sizeof(b->templates.elem[0]);
+ uint64_t hash = pl_mem_hash(b->templates.elem, size);
+ pl_hash_merge(&hash, pl_str_hash(b->args));
+ return hash;
+}
+
+pl_str pl_str_builder_exec(pl_str_builder b)
+{
+ pl_str args = b->args;
+
+ b->output.len = 0;
+ for (int i = 0; i < b->templates.num; i++) {
+ size_t consumed = b->templates.elem[i](b, &b->output, args.buf);
+ pl_assert(consumed <= args.len);
+ args = pl_str_drop(args, consumed);
+ }
+
+ // Terminate with an extra \0 byte for convenience
+ grow_str(b, &b->output, b->output.len + 1);
+ b->output.buf[b->output.len] = '\0';
+ return b->output;
+}
+
+void pl_str_builder_append(pl_str_builder b, pl_str_template tmpl,
+ const void *args, size_t size)
+{
+ PL_ARRAY_APPEND(b, b->templates, tmpl);
+ pl_str_append_raw(b, &b->args, args, size);
+}
+
+void pl_str_builder_concat(pl_str_builder b, const pl_str_builder append)
+{
+ PL_ARRAY_CONCAT(b, b->templates, append->templates);
+ pl_str_append_raw(b, &b->args, append->args.buf, append->args.len);
+}
+
+static size_t template_str_ptr(void *alloc, pl_str *buf, const uint8_t *args)
+{
+ const char *str;
+ memcpy(&str, args, sizeof(str));
+ pl_str_append_raw(alloc, buf, str, strlen(str));
+ return sizeof(str);
+}
+
+void pl_str_builder_const_str(pl_str_builder b, const char *str)
+{
+ pl_str_builder_append(b, template_str_ptr, &str, sizeof(str));
+}
+
+static size_t template_str(void *alloc, pl_str *buf, const uint8_t *args)
+{
+ pl_str str;
+ memcpy(&str.len, args, sizeof(str.len));
+ pl_str_append_raw(alloc, buf, args + sizeof(str.len), str.len);
+ return sizeof(str.len) + str.len;
+}
+
+void pl_str_builder_str(pl_str_builder b, const pl_str str)
+{
+ pl_str_builder_append(b, template_str, &str.len, sizeof(str.len));
+ pl_str_append_raw(b, &b->args, str.buf, str.len);
+}
+
+void pl_str_builder_printf_c(pl_str_builder b, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ pl_str_builder_vprintf_c(b, fmt, ap);
+ va_end(ap);
+}
+
+static size_t template_printf(void *alloc, pl_str *str, const uint8_t *args)
+{
+ const char *fmt;
+ memcpy(&fmt, args, sizeof(fmt));
+ args += sizeof(fmt);
+
+ return sizeof(fmt) + pl_str_append_memprintf_c(alloc, str, fmt, args);
+}
+
+void pl_str_builder_vprintf_c(pl_str_builder b, const char *fmt, va_list ap)
+{
+ pl_str_builder_append(b, template_printf, &fmt, sizeof(fmt));
+
+ // Push all of the variadic arguments directly onto `b->args`
+ for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) {
+ c++;
+ switch (c[0]) {
+#define WRITE(T, x) pl_str_append_raw(b, &b->args, &(T) {x}, sizeof(T))
+ case '%': continue;
+ case 'c': WRITE(char, va_arg(ap, int)); break;
+ case 'd': WRITE(int, va_arg(ap, int)); break;
+ case 'u': WRITE(unsigned, va_arg(ap, unsigned)); break;
+ case 'f': WRITE(double, va_arg(ap, double)); break;
+ case 'h':
+ assert(c[1] == 'x');
+ WRITE(unsigned short, va_arg(ap, unsigned));
+ c++;
+ break;
+ case 'l':
+ assert(c[1] == 'l');
+ switch (c[2]) {
+ case 'u': WRITE(long long unsigned, va_arg(ap, long long unsigned)); break;
+ case 'd': WRITE(long long int, va_arg(ap, long long int)); break;
+ default: abort();
+ }
+ c += 2;
+ break;
+ case 'z':
+ assert(c[1] == 'u');
+ WRITE(size_t, va_arg(ap, size_t));
+ c++;
+ break;
+ case 's': {
+ pl_str str = pl_str0(va_arg(ap, const char *));
+ pl_str_append(b, &b->args, str);
+ b->args.len++; // expand to include \0 byte (from pl_str_append)
+ break;
+ }
+ case '.': {
+ assert(c[1] == '*');
+ assert(c[2] == 's');
+ int len = va_arg(ap, int);
+ const char *str = va_arg(ap, const char *);
+ WRITE(int, len);
+ pl_str_append_raw(b, &b->args, str, len);
+ c += 2;
+ break;
+ }
+ default:
+ fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]);
+ abort();
+ }
+#undef WRITE
+ }
+}
diff --git a/src/pl_string.h b/src/pl_string.h
new file mode 100644
index 0000000..7a0005c
--- /dev/null
+++ b/src/pl_string.h
@@ -0,0 +1,318 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+PL_API_BEGIN
+
+typedef struct pl_str {
+ uint8_t *buf;
+ size_t len;
+} pl_str;
+
+// For formatting with "%.*s"
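+// For example (illustrative only), a pl_str named `str` can be printed
+// without copying: printf("%.*s\n", PL_STR_FMT(str));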
+#define PL_STR_FMT(str) (int)((str).len), ((str).buf ? (char *)((str).buf) : "")
+
+static inline pl_str pl_str0(const char *str)
+{
+ return (pl_str) {
+ .buf = (uint8_t *) str,
+ .len = str ? strlen(str) : 0,
+ };
+}
+
+// Macro version of pl_str0, for constants
+#define PL_STR0(str) ((pl_str) { (uint8_t *) (str), (str) ? strlen(str) : 0 })
+
+static inline pl_str pl_strdup(void *alloc, pl_str str)
+{
+ return (pl_str) {
+ .buf = (uint8_t *) (str.len ? pl_memdup(alloc, str.buf, str.len) : NULL),
+ .len = str.len,
+ };
+}
+
+// Always returns a valid string
+static inline char *pl_strdup0(void *alloc, pl_str str)
+{
+ return pl_strndup0(alloc, str.len ? (char *) str.buf : "", str.len);
+}
+
+// Adds a trailing \0 for convenience, even if `append` is an empty string
+void pl_str_append(void *alloc, pl_str *str, pl_str append);
+
+// Like `pl_str_append` but for raw memory, omits trailing \0
+void pl_str_append_raw(void *alloc, pl_str *str, const void *ptr, size_t size);
+
+// Locale-sensitive string functions
+char *pl_asprintf(void *parent, const char *fmt, ...)
+ PL_PRINTF(2, 3);
+char *pl_vasprintf(void *parent, const char *fmt, va_list ap)
+ PL_PRINTF(2, 0);
+void pl_str_append_asprintf(void *alloc, pl_str *str, const char *fmt, ...)
+ PL_PRINTF(3, 4);
+void pl_str_append_vasprintf(void *alloc, pl_str *str, const char *fmt, va_list va)
+ PL_PRINTF(3, 0);
+int pl_str_sscanf(pl_str str, const char *fmt, ...);
+
+// Locale-invariant versions of append_(v)asprintf
+//
+// NOTE: These only support a small handful of modifiers. Check `format.c`
+// for a list. Calling them with an unsupported format string will abort!
+void pl_str_append_asprintf_c(void *alloc, pl_str *str, const char *fmt, ...)
+ PL_PRINTF(3, 4);
+void pl_str_append_vasprintf_c(void *alloc, pl_str *str, const char *fmt, va_list va)
+ PL_PRINTF(3, 0);
+
+// Variant of the above which takes arguments directly from a pointer in memory,
+// reading them incrementally (tightly packed). Returns the amount of bytes
+// read from `args`, as determined by the following table:
+//
+// %c: sizeof(char)
+// %d, %u: sizeof(int)
+// %f: sizeof(double)
+// %lld, %llu: sizeof(long long int)
+// %zu: sizeof(size_t)
+// %s: \0 terminated string
+// %.*s: sizeof(int) + that many bytes (no \0 terminator)
+size_t pl_str_append_memprintf_c(void *alloc, pl_str *str, const char *fmt,
+ const void *args)
+ PL_PRINTF(3, 0);
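+
+// Worked illustration of the packed-argument layout above (an assumption for
+// clarity, not a separate API): for fmt = "%d %.*s", `args` holds sizeof(int)
+// bytes for the integer, then sizeof(int) bytes for the length, followed by
+// that many raw string bytes, tightly packed with no padding or terminator.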
+
+// Locale-invariant number printing
+int pl_str_print_hex(char *buf, size_t len, unsigned short n);
+int pl_str_print_int(char *buf, size_t len, int n);
+int pl_str_print_uint(char *buf, size_t len, unsigned int n);
+int pl_str_print_int64(char *buf, size_t len, int64_t n);
+int pl_str_print_uint64(char *buf, size_t len, uint64_t n);
+int pl_str_print_float(char *buf, size_t len, float n);
+int pl_str_print_double(char *buf, size_t len, double n);
+
+// Locale-invariant number parsing
+bool pl_str_parse_hex(pl_str str, unsigned short *out);
+bool pl_str_parse_int(pl_str str, int *out);
+bool pl_str_parse_uint(pl_str str, unsigned int *out);
+bool pl_str_parse_int64(pl_str str, int64_t *out);
+bool pl_str_parse_uint64(pl_str str, uint64_t *out);
+bool pl_str_parse_float(pl_str str, float *out);
+bool pl_str_parse_double(pl_str str, double *out);
+
+// Variants of string.h functions
+int pl_strchr(pl_str str, int c);
+size_t pl_strspn(pl_str str, const char *accept);
+size_t pl_strcspn(pl_str str, const char *reject);
+
+// Strip leading/trailing whitespace
+pl_str pl_str_strip(pl_str str);
+
+// Generic functions for cutting up strings
+static inline pl_str pl_str_take(pl_str str, size_t len)
+{
+ if (len < str.len)
+ str.len = len;
+ return str;
+}
+
+static inline pl_str pl_str_drop(pl_str str, size_t len)
+{
+ if (len >= str.len)
+ return (pl_str) { .buf = NULL, .len = 0 };
+
+ str.buf += len;
+ str.len -= len;
+ return str;
+}
+
+// Find a substring in another string, and return its index (or -1)
+int pl_str_find(pl_str haystack, pl_str needle);
+
+// String splitting functions. These return the part of the string before
+// the separator, and optionally the rest (in `out_rest`).
+//
+// Note that the separator is not included as part of either string.
+pl_str pl_str_split_char(pl_str str, char sep, pl_str *out_rest);
+pl_str pl_str_split_str(pl_str str, pl_str sep, pl_str *out_rest);
+
+// Like `pl_str_split_char`, but splits on any char in `seps`
+pl_str pl_str_split_chars(pl_str str, const char *seps, pl_str *out_rest);
+
+static inline pl_str pl_str_getline(pl_str str, pl_str *out_rest)
+{
+ return pl_str_split_char(str, '\n', out_rest);
+}
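+
+// Illustrative sketch (not part of the API itself): iterating over the lines
+// of a buffer using the splitting helpers above.
+//
+//     pl_str rest = data;  // `data` is a hypothetical pl_str
+//     while (rest.len) {
+//         pl_str line = pl_str_getline(rest, &rest);
+//         handle_line(pl_str_strip(line));  // hypothetical callback
+//     }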
+
+// Decode a string containing hexadecimal data. All whitespace will be silently
+// ignored. When successful, this allocates a new array to store the output.
+bool pl_str_decode_hex(void *alloc, pl_str hex, pl_str *out);
+
+static inline bool pl_str_equals(pl_str str1, pl_str str2)
+{
+ if (str1.len != str2.len)
+ return false;
+ if (str1.buf == str2.buf || !str1.len)
+ return true;
+ return memcmp(str1.buf, str2.buf, str1.len) == 0;
+}
+
+static inline bool pl_str_startswith(pl_str str, pl_str prefix)
+{
+ if (!prefix.len)
+ return true;
+ if (str.len < prefix.len)
+ return false;
+ return memcmp(str.buf, prefix.buf, prefix.len) == 0;
+}
+
+static inline bool pl_str_endswith(pl_str str, pl_str suffix)
+{
+ if (!suffix.len)
+ return true;
+ if (str.len < suffix.len)
+ return false;
+ return memcmp(str.buf + str.len - suffix.len, suffix.buf, suffix.len) == 0;
+}
+
+static inline bool pl_str_eatstart(pl_str *str, pl_str prefix)
+{
+ if (!pl_str_startswith(*str, prefix))
+ return false;
+
+ str->buf += prefix.len;
+ str->len -= prefix.len;
+ return true;
+}
+
+static inline bool pl_str_eatend(pl_str *str, pl_str suffix)
+{
+ if (!pl_str_endswith(*str, suffix))
+ return false;
+
+ str->len -= suffix.len;
+ return true;
+}
+
+// Convenience wrappers for the above which save the use of a pl_str0
+static inline pl_str pl_str_split_str0(pl_str str, const char *sep, pl_str *out_rest)
+{
+ return pl_str_split_str(str, pl_str0(sep), out_rest);
+}
+
+static inline bool pl_str_startswith0(pl_str str, const char *prefix)
+{
+ return pl_str_startswith(str, pl_str0(prefix));
+}
+
+static inline bool pl_str_endswith0(pl_str str, const char *suffix)
+{
+ return pl_str_endswith(str, pl_str0(suffix));
+}
+
+static inline bool pl_str_equals0(pl_str str1, const char *str2)
+{
+ return pl_str_equals(str1, pl_str0(str2));
+}
+
+static inline bool pl_str_eatstart0(pl_str *str, const char *prefix)
+{
+ return pl_str_eatstart(str, pl_str0(prefix));
+}
+
+static inline bool pl_str_eatend0(pl_str *str, const char *prefix)
+{
+ return pl_str_eatend(str, pl_str0(prefix));
+}
+
+// String building helpers, used to lazily construct a string by appending a
+// series of string templates which can be executed on-demand into a final
+// output buffer.
+typedef struct pl_str_builder_t *pl_str_builder;
+
+// Returns the number of bytes consumed from `args`. Be warned that the pointer
+// given will not necessarily be aligned to the type you need it as, so make
+// sure to use `memcpy` or some other method of safely loading arbitrary data
+// from memory.
+typedef size_t (*pl_str_template)(void *alloc, pl_str *buf, const uint8_t *args);
+
+pl_str_builder pl_str_builder_alloc(void *alloc);
+void pl_str_builder_free(pl_str_builder *builder);
+
+// Resets string builder without destroying buffer
+void pl_str_builder_reset(pl_str_builder builder);
+
+// Returns a representative hash of the string builder's output, without
+// actually executing it. Note that this is *not* the same as a pl_str_hash of
+// the string builder's output.
+//
+// Note also that the output of this may not survive a process restart because
+// of position-independent code and address randomization moving around the
+// locations of template functions, so special care must be taken not to
+// compare such hashes across process invocations.
+uint64_t pl_str_builder_hash(const pl_str_builder builder);
+
+// Executes a string builder, dispatching all templates. The resulting string
+// is guaranteed to be \0-terminated, as a minor convenience.
+//
+// Calling any other `pl_str_builder_*` function on this builder causes the
+// contents of the returned string to become undefined.
+pl_str pl_str_builder_exec(pl_str_builder builder);
+
+// Append a template and its arguments to a string builder
+void pl_str_builder_append(pl_str_builder builder, pl_str_template tmpl,
+ const void *args, size_t args_size);
+
+// Append an entire other `pl_str_builder` onto `builder`
+void pl_str_builder_concat(pl_str_builder builder, const pl_str_builder append);
+
+// Append a constant string. This will only record &str into the buffer, which
+// may have a number of unwanted consequences if the memory pointed at by
+// `str` mutates at any point in time in the future, or if `str` is not
+// at a stable location in memory.
+//
+// This is intended for strings which are compile-time constants.
+void pl_str_builder_const_str(pl_str_builder builder, const char *str);
+
+// Append a string. This will make a full copy of `str`
+void pl_str_builder_str(pl_str_builder builder, const pl_str str);
+#define pl_str_builder_str0(b, str) pl_str_builder_str(b, pl_str0(str))
+
+// Append a string printf-style. This will preprocess `fmt` to determine the
+// number and type of arguments. Supports the same format conversion characters
+// as `pl_str_append_asprintf_c`.
+void pl_str_builder_printf_c(pl_str_builder builder, const char *fmt, ...)
+ PL_PRINTF(2, 3);
+
+void pl_str_builder_vprintf_c(pl_str_builder builder, const char *fmt, va_list ap)
+ PL_PRINTF(2, 0);
+
+// Helper macro to specialize `pl_str_builder_printf_c` to
+// `pl_str_builder_const_str` if it contains no format characters.
+#define pl_str_builder_addf(builder, ...) do \
+{ \
+ if (_contains_fmt_chars(__VA_ARGS__)) { \
+ pl_str_builder_printf_c(builder, __VA_ARGS__); \
+ } else { \
+ pl_str_builder_const_str(builder, _get_fmt(__VA_ARGS__)); \
+ } \
+} while (0)
+
+// Helper macros to deal with the non-portability of __VA_OPT__(,)
+#define _contains_fmt_chars(fmt, ...) (strchr(fmt, '%'))
+#define _get_fmt(fmt, ...) fmt
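+
+// Minimal usage sketch for the string builder (identifiers are illustrative):
+//
+//     pl_str_builder b = pl_str_builder_alloc(NULL);
+//     pl_str_builder_addf(b, "vec%d color = %s;\n", 4, "vec4(1.0)");
+//     uint64_t key = pl_str_builder_hash(b);  // cheap cache key, see note above
+//     pl_str out = pl_str_builder_exec(b);    // actually formats the output
+//     pl_str_builder_free(&b);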
+
+PL_API_END
diff --git a/src/pl_thread.h b/src/pl_thread.h
new file mode 100644
index 0000000..7a5ae47
--- /dev/null
+++ b/src/pl_thread.h
@@ -0,0 +1,73 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "os.h"
+
+enum pl_mutex_type {
+ PL_MUTEX_NORMAL = 0,
+ PL_MUTEX_RECURSIVE,
+};
+
+#define pl_mutex_init(mutex) \
+ pl_mutex_init_type(mutex, PL_MUTEX_NORMAL)
+
+// Note: This is never compiled, and only documents the API. The actual
+// implementations of these prototypes may be macros.
+#ifdef PL_API_REFERENCE
+
+typedef void pl_mutex;
+void pl_mutex_init_type(pl_mutex *mutex, enum pl_mutex_type mtype);
+int pl_mutex_destroy(pl_mutex *mutex);
+int pl_mutex_lock(pl_mutex *mutex);
+int pl_mutex_unlock(pl_mutex *mutex);
+
+typedef void pl_cond;
+int pl_cond_init(pl_cond *cond);
+int pl_cond_destroy(pl_cond *cond);
+int pl_cond_broadcast(pl_cond *cond);
+int pl_cond_signal(pl_cond *cond);
+
+// `timeout` is in nanoseconds, or UINT64_MAX to block forever
+int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout);
+int pl_cond_wait(pl_cond *cond, pl_mutex *mutex);
+
+typedef void pl_static_mutex;
+#define PL_STATIC_MUTEX_INITIALIZER
+int pl_static_mutex_lock(pl_static_mutex *mutex);
+int pl_static_mutex_unlock(pl_static_mutex *mutex);
+
+typedef void pl_thread;
+#define PL_THREAD_VOID void
+#define PL_THREAD_RETURN() return
+int pl_thread_create(pl_thread *thread, PL_THREAD_VOID (*fun)(void *), void *arg);
+int pl_thread_join(pl_thread thread);
+
+// Returns true if slept the full time, false otherwise
+bool pl_thread_sleep(double t);
+
+#endif
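+
+// Rough usage sketch against the documented API above (everything other than
+// the pl_* calls is hypothetical):
+//
+//     pl_mutex lock;
+//     pl_mutex_init(&lock);
+//     pl_mutex_lock(&lock);
+//     shared_state++;              // hypothetical shared data
+//     pl_mutex_unlock(&lock);
+//     pl_mutex_destroy(&lock);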
+
+// Actual platform-specific implementation
+#ifdef PL_HAVE_WIN32
+#include "pl_thread_win32.h"
+#elif defined(PL_HAVE_PTHREAD)
+#include "pl_thread_pthread.h"
+#else
+#error No threading implementation available!
+#endif
diff --git a/src/pl_thread_pthread.h b/src/pl_thread_pthread.h
new file mode 100644
index 0000000..5910650
--- /dev/null
+++ b/src/pl_thread_pthread.h
@@ -0,0 +1,137 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <errno.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include <pl_assert.h>
+
+typedef pthread_mutex_t pl_mutex;
+typedef pthread_cond_t pl_cond;
+typedef pthread_mutex_t pl_static_mutex;
+typedef pthread_t pl_thread;
+#define PL_STATIC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+
+static inline int pl_mutex_init_type_internal(pl_mutex *mutex, enum pl_mutex_type mtype)
+{
+ int mutex_type;
+ switch (mtype) {
+ case PL_MUTEX_RECURSIVE:
+ mutex_type = PTHREAD_MUTEX_RECURSIVE;
+ break;
+ case PL_MUTEX_NORMAL:
+ default:
+ #ifndef NDEBUG
+ mutex_type = PTHREAD_MUTEX_ERRORCHECK;
+ #else
+ mutex_type = PTHREAD_MUTEX_DEFAULT;
+ #endif
+ break;
+ }
+
+ int ret = 0;
+ pthread_mutexattr_t attr;
+ ret = pthread_mutexattr_init(&attr);
+ if (ret != 0)
+ return ret;
+
+ pthread_mutexattr_settype(&attr, mutex_type);
+ ret = pthread_mutex_init(mutex, &attr);
+ pthread_mutexattr_destroy(&attr);
+ return ret;
+}
+
+#define pl_mutex_init_type(mutex, mtype) \
+ pl_assert(!pl_mutex_init_type_internal(mutex, mtype))
+
+#define pl_mutex_destroy pthread_mutex_destroy
+#define pl_mutex_lock pthread_mutex_lock
+#define pl_mutex_unlock pthread_mutex_unlock
+
+static inline int pl_cond_init(pl_cond *cond)
+{
+ int ret = 0;
+ pthread_condattr_t attr;
+ ret = pthread_condattr_init(&attr);
+ if (ret != 0)
+ return ret;
+
+#ifdef PTHREAD_HAS_SETCLOCK
+ pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
+#endif
+ ret = pthread_cond_init(cond, &attr);
+ pthread_condattr_destroy(&attr);
+ return ret;
+}
+
+#define pl_cond_destroy pthread_cond_destroy
+#define pl_cond_broadcast pthread_cond_broadcast
+#define pl_cond_signal pthread_cond_signal
+#define pl_cond_wait pthread_cond_wait
+
+static inline int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout)
+{
+ if (timeout == UINT64_MAX)
+ return pthread_cond_wait(cond, mutex);
+
+ struct timespec ts;
+#ifdef PTHREAD_HAS_SETCLOCK
+ if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
+ return errno;
+#else
+ struct timeval tv;
+ if (gettimeofday(&tv, NULL) < 0) // equivalent to CLOCK_REALTIME
+ return errno;
+ ts.tv_sec = tv.tv_sec;
+ ts.tv_nsec = tv.tv_usec * 1000;
+#endif
+
+ ts.tv_sec += timeout / 1000000000LLU;
+ ts.tv_nsec += timeout % 1000000000LLU;
+
+    if (ts.tv_nsec >= 1000000000L) {
+ ts.tv_nsec -= 1000000000L;
+ ts.tv_sec++;
+ }
+
+ return pthread_cond_timedwait(cond, mutex, &ts);
+}
+
+#define pl_static_mutex_lock pthread_mutex_lock
+#define pl_static_mutex_unlock pthread_mutex_unlock
+
+#define PL_THREAD_VOID void *
+#define PL_THREAD_RETURN() return NULL
+
+#define pl_thread_create(t, f, a) pthread_create(t, NULL, f, a)
+#define pl_thread_join(t) pthread_join(t, NULL)
+
+static inline bool pl_thread_sleep(double t)
+{
+ if (t <= 0.0)
+ return true;
+
+ struct timespec ts;
+ ts.tv_sec = (time_t) t;
+ ts.tv_nsec = (t - ts.tv_sec) * 1e9;
+
+ return nanosleep(&ts, NULL) == 0;
+}
diff --git a/src/pl_thread_win32.h b/src/pl_thread_win32.h
new file mode 100644
index 0000000..ef68d50
--- /dev/null
+++ b/src/pl_thread_win32.h
@@ -0,0 +1,182 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <windows.h>
+#include <process.h>
+#include <stdint.h>
+#include <errno.h>
+
+#include <pl_assert.h>
+
+typedef CRITICAL_SECTION pl_mutex;
+typedef CONDITION_VARIABLE pl_cond;
+
+static inline int pl_mutex_init_type_internal(pl_mutex *mutex, enum pl_mutex_type mtype)
+{
+ (void) mtype;
+ return !InitializeCriticalSectionEx(mutex, 0, 0);
+}
+
+#define pl_mutex_init_type(mutex, mtype) \
+ pl_assert(!pl_mutex_init_type_internal(mutex, mtype))
+
+static inline int pl_mutex_destroy(pl_mutex *mutex)
+{
+ DeleteCriticalSection(mutex);
+ return 0;
+}
+
+static inline int pl_mutex_lock(pl_mutex *mutex)
+{
+ EnterCriticalSection(mutex);
+ return 0;
+}
+
+static inline int pl_mutex_unlock(pl_mutex *mutex)
+{
+ LeaveCriticalSection(mutex);
+ return 0;
+}
+
+static inline int pl_cond_init(pl_cond *cond)
+{
+ InitializeConditionVariable(cond);
+ return 0;
+}
+
+static inline int pl_cond_destroy(pl_cond *cond)
+{
+    // Win32 condition variables have no destroy function; nothing to do here
+ (void) cond;
+ return 0;
+}
+
+static inline int pl_cond_broadcast(pl_cond *cond)
+{
+ WakeAllConditionVariable(cond);
+ return 0;
+}
+
+static inline int pl_cond_signal(pl_cond *cond)
+{
+ WakeConditionVariable(cond);
+ return 0;
+}
+
+static inline int pl_cond_wait(pl_cond *cond, pl_mutex *mutex)
+{
+ return !SleepConditionVariableCS(cond, mutex, INFINITE);
+}
+
+static inline int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout)
+{
+ if (timeout == UINT64_MAX)
+ return pl_cond_wait(cond, mutex);
+
+ timeout /= UINT64_C(1000000);
+ if (timeout > INFINITE - 1)
+ timeout = INFINITE - 1;
+
+ BOOL bRet = SleepConditionVariableCS(cond, mutex, timeout);
+ if (bRet == FALSE)
+ {
+ if (GetLastError() == ERROR_TIMEOUT)
+ return ETIMEDOUT;
+ else
+ return EINVAL;
+ }
+ return 0;
+}
+
+typedef SRWLOCK pl_static_mutex;
+#define PL_STATIC_MUTEX_INITIALIZER SRWLOCK_INIT
+
+static inline int pl_static_mutex_lock(pl_static_mutex *mutex)
+{
+ AcquireSRWLockExclusive(mutex);
+ return 0;
+}
+
+static inline int pl_static_mutex_unlock(pl_static_mutex *mutex)
+{
+ ReleaseSRWLockExclusive(mutex);
+ return 0;
+}
+
+typedef HANDLE pl_thread;
+#define PL_THREAD_VOID unsigned __stdcall
+#define PL_THREAD_RETURN() return 0
+
+static inline int pl_thread_create(pl_thread *thread,
+ PL_THREAD_VOID (*fun)(void *),
+ void *__restrict arg)
+{
+ *thread = (HANDLE) _beginthreadex(NULL, 0, fun, arg, 0, NULL);
+ return *thread ? 0 : -1;
+}
+
+static inline int pl_thread_join(pl_thread thread)
+{
+ DWORD ret = WaitForSingleObject(thread, INFINITE);
+ if (ret != WAIT_OBJECT_0)
+ return ret == WAIT_ABANDONED ? EINVAL : EDEADLK;
+ CloseHandle(thread);
+ return 0;
+}
+
+static inline bool pl_thread_sleep(double t)
+{
+ // Time is expected in 100 nanosecond intervals.
+ // Negative values indicate relative time.
+ LARGE_INTEGER time = { .QuadPart = -(LONGLONG) (t * 1e7) };
+
+ if (time.QuadPart >= 0)
+ return true;
+
+ bool ret = false;
+
+#ifndef CREATE_WAITABLE_TIMER_HIGH_RESOLUTION
+# define CREATE_WAITABLE_TIMER_HIGH_RESOLUTION 0x2
+#endif
+
+ HANDLE timer = CreateWaitableTimerEx(NULL, NULL,
+ CREATE_WAITABLE_TIMER_HIGH_RESOLUTION,
+ TIMER_ALL_ACCESS);
+
+    // CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is only supported on Windows 10
+    // 1803 and newer; if creation failed, retry without it.
+ if (!timer)
+ timer = CreateWaitableTimerEx(NULL, NULL, 0, TIMER_ALL_ACCESS);
+
+ if (!timer)
+ goto end;
+
+ if (!SetWaitableTimer(timer, &time, 0, NULL, NULL, 0))
+ goto end;
+
+ if (WaitForSingleObject(timer, INFINITE) != WAIT_OBJECT_0)
+ goto end;
+
+ ret = true;
+
+end:
+ if (timer)
+ CloseHandle(timer);
+ return ret;
+}
diff --git a/src/renderer.c b/src/renderer.c
new file mode 100644
index 0000000..cc56b6f
--- /dev/null
+++ b/src/renderer.c
@@ -0,0 +1,3815 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "common.h"
+#include "filters.h"
+#include "hash.h"
+#include "shaders.h"
+#include "dispatch.h"
+
+#include <libplacebo/renderer.h>
+
+struct cached_frame {
+ uint64_t signature;
+ uint64_t params_hash; // for detecting `pl_render_params` changes
+ struct pl_color_space color;
+ struct pl_icc_profile profile;
+ pl_rect2df crop;
+ pl_tex tex;
+ int comps;
+ bool evict; // for garbage collection
+};
+
+struct sampler {
+ pl_shader_obj upscaler_state;
+ pl_shader_obj downscaler_state;
+};
+
+struct osd_vertex {
+ float pos[2];
+ float coord[2];
+ float color[4];
+};
+
+struct icc_state {
+ pl_icc_object icc;
+ uint64_t error; // set to profile signature on failure
+};
+
+struct pl_renderer_t {
+ pl_gpu gpu;
+ pl_dispatch dp;
+ pl_log log;
+
+ // Cached feature checks (inverted)
+ enum pl_render_error errors;
+
+ // List containing signatures of disabled hooks
+ PL_ARRAY(uint64_t) disabled_hooks;
+
+ // Shader resource objects and intermediate textures (FBOs)
+ pl_shader_obj tone_map_state;
+ pl_shader_obj dither_state;
+ pl_shader_obj grain_state[4];
+ pl_shader_obj lut_state[3];
+ pl_shader_obj icc_state[2];
+ PL_ARRAY(pl_tex) fbos;
+ struct sampler sampler_main;
+ struct sampler sampler_contrast;
+ struct sampler samplers_src[4];
+ struct sampler samplers_dst[4];
+
+ // Temporary storage for vertex/index data
+ PL_ARRAY(struct osd_vertex) osd_vertices;
+ PL_ARRAY(uint16_t) osd_indices;
+ struct pl_vertex_attrib osd_attribs[3];
+
+ // Frame cache (for frame mixing / interpolation)
+ PL_ARRAY(struct cached_frame) frames;
+ PL_ARRAY(pl_tex) frame_fbos;
+
+ // For debugging / logging purposes
+ int prev_dither;
+
+ // For backwards compatibility
+ struct icc_state icc_fallback[2];
+};
+
+enum {
+ // Index into `lut_state`
+ LUT_IMAGE,
+ LUT_TARGET,
+ LUT_PARAMS,
+};
+
+enum {
+ // Index into `icc_state`
+ ICC_IMAGE,
+ ICC_TARGET
+};
+
+pl_renderer pl_renderer_create(pl_log log, pl_gpu gpu)
+{
+ pl_renderer rr = pl_alloc_ptr(NULL, rr);
+ *rr = (struct pl_renderer_t) {
+ .gpu = gpu,
+ .log = log,
+ .dp = pl_dispatch_create(log, gpu),
+ .osd_attribs = {
+ {
+ .name = "pos",
+ .offset = offsetof(struct osd_vertex, pos),
+ .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
+ }, {
+ .name = "coord",
+ .offset = offsetof(struct osd_vertex, coord),
+ .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
+ }, {
+ .name = "osd_color",
+ .offset = offsetof(struct osd_vertex, color),
+ .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 4),
+ }
+ },
+ };
+
+ assert(rr->dp);
+ return rr;
+}
+
+static void sampler_destroy(pl_renderer rr, struct sampler *sampler)
+{
+ pl_shader_obj_destroy(&sampler->upscaler_state);
+ pl_shader_obj_destroy(&sampler->downscaler_state);
+}
+
+void pl_renderer_destroy(pl_renderer *p_rr)
+{
+ pl_renderer rr = *p_rr;
+ if (!rr)
+ return;
+
+ // Free all intermediate FBOs
+ for (int i = 0; i < rr->fbos.num; i++)
+ pl_tex_destroy(rr->gpu, &rr->fbos.elem[i]);
+ for (int i = 0; i < rr->frames.num; i++)
+ pl_tex_destroy(rr->gpu, &rr->frames.elem[i].tex);
+ for (int i = 0; i < rr->frame_fbos.num; i++)
+ pl_tex_destroy(rr->gpu, &rr->frame_fbos.elem[i]);
+
+ // Free all shader resource objects
+ pl_shader_obj_destroy(&rr->tone_map_state);
+ pl_shader_obj_destroy(&rr->dither_state);
+ for (int i = 0; i < PL_ARRAY_SIZE(rr->lut_state); i++)
+ pl_shader_obj_destroy(&rr->lut_state[i]);
+ for (int i = 0; i < PL_ARRAY_SIZE(rr->grain_state); i++)
+ pl_shader_obj_destroy(&rr->grain_state[i]);
+ for (int i = 0; i < PL_ARRAY_SIZE(rr->icc_state); i++)
+ pl_shader_obj_destroy(&rr->icc_state[i]);
+
+ // Free all samplers
+ sampler_destroy(rr, &rr->sampler_main);
+ sampler_destroy(rr, &rr->sampler_contrast);
+ for (int i = 0; i < PL_ARRAY_SIZE(rr->samplers_src); i++)
+ sampler_destroy(rr, &rr->samplers_src[i]);
+ for (int i = 0; i < PL_ARRAY_SIZE(rr->samplers_dst); i++)
+ sampler_destroy(rr, &rr->samplers_dst[i]);
+
+ // Free fallback ICC profiles
+ for (int i = 0; i < PL_ARRAY_SIZE(rr->icc_fallback); i++)
+ pl_icc_close(&rr->icc_fallback[i].icc);
+
+ pl_dispatch_destroy(&rr->dp);
+ pl_free_ptr(p_rr);
+}
+
+size_t pl_renderer_save(pl_renderer rr, uint8_t *out)
+{
+ return pl_cache_save(pl_gpu_cache(rr->gpu), out, out ? SIZE_MAX : 0);
+}
+
+void pl_renderer_load(pl_renderer rr, const uint8_t *cache)
+{
+ pl_cache_load(pl_gpu_cache(rr->gpu), cache, SIZE_MAX);
+}
+
+void pl_renderer_flush_cache(pl_renderer rr)
+{
+ for (int i = 0; i < rr->frames.num; i++)
+ pl_tex_destroy(rr->gpu, &rr->frames.elem[i].tex);
+ rr->frames.num = 0;
+
+ pl_reset_detected_peak(rr->tone_map_state);
+}
+
+const struct pl_render_params pl_render_fast_params = { PL_RENDER_DEFAULTS };
+const struct pl_render_params pl_render_default_params = {
+ PL_RENDER_DEFAULTS
+ .upscaler = &pl_filter_lanczos,
+ .downscaler = &pl_filter_hermite,
+ .frame_mixer = &pl_filter_oversample,
+ .sigmoid_params = &pl_sigmoid_default_params,
+ .dither_params = &pl_dither_default_params,
+ .peak_detect_params = &pl_peak_detect_default_params,
+};
+
+const struct pl_render_params pl_render_high_quality_params = {
+ PL_RENDER_DEFAULTS
+ .upscaler = &pl_filter_ewa_lanczossharp,
+ .downscaler = &pl_filter_hermite,
+ .frame_mixer = &pl_filter_oversample,
+ .sigmoid_params = &pl_sigmoid_default_params,
+ .peak_detect_params = &pl_peak_detect_high_quality_params,
+ .color_map_params = &pl_color_map_high_quality_params,
+ .dither_params = &pl_dither_default_params,
+ .deband_params = &pl_deband_default_params,
+};
+
+const struct pl_filter_preset pl_frame_mixers[] = {
+ { "none", NULL, "No frame mixing" },
+ { "linear", &pl_filter_bilinear, "Linear frame mixing" },
+ { "oversample", &pl_filter_oversample, "Oversample (AKA SmoothMotion)" },
+ { "mitchell_clamp", &pl_filter_mitchell_clamp, "Clamped Mitchell spline" },
+ { "hermite", &pl_filter_hermite, "Cubic spline (Hermite)" },
+ {0}
+};
+
+const int pl_num_frame_mixers = PL_ARRAY_SIZE(pl_frame_mixers) - 1;
+
+const struct pl_filter_preset pl_scale_filters[] = {
+ {"none", NULL, "Built-in sampling"},
+ {"oversample", &pl_filter_oversample, "Oversample (Aspect-preserving NN)"},
+ COMMON_FILTER_PRESETS,
+ {0}
+};
+
+const int pl_num_scale_filters = PL_ARRAY_SIZE(pl_scale_filters) - 1;
+
+// Represents an "in-flight" image, which is either a shader that's in the
+// process of producing some sort of image, or a texture that needs to be
+// sampled from
+struct img {
+ // Effective texture size, always set
+ int w, h;
+
+ // Recommended format (falls back to fbofmt otherwise), only for shaders
+ pl_fmt fmt;
+
+ // Exactly *one* of these two is set:
+ pl_shader sh;
+ pl_tex tex;
+
+ // If true, created shaders will be set to unique
+ bool unique;
+
+ // Information about what to log/disable/fall back to if the shader fails
+ const char *err_msg;
+ enum pl_render_error err_enum;
+ pl_tex err_tex;
+
+ // Current effective source area, will be sampled by the main scaler
+ pl_rect2df rect;
+
+ // The current effective colorspace
+ struct pl_color_repr repr;
+ struct pl_color_space color;
+ int comps;
+};
+
+// Plane 'type', ordered by incrementing priority
+enum plane_type {
+ PLANE_INVALID = 0,
+ PLANE_ALPHA,
+ PLANE_CHROMA,
+ PLANE_LUMA,
+ PLANE_RGB,
+ PLANE_XYZ,
+};
+
+static inline enum plane_type detect_plane_type(const struct pl_plane *plane,
+ const struct pl_color_repr *repr)
+{
+ if (pl_color_system_is_ycbcr_like(repr->sys)) {
+ int t = PLANE_INVALID;
+ for (int c = 0; c < plane->components; c++) {
+ switch (plane->component_mapping[c]) {
+ case PL_CHANNEL_Y: t = PL_MAX(t, PLANE_LUMA); continue;
+ case PL_CHANNEL_A: t = PL_MAX(t, PLANE_ALPHA); continue;
+
+ case PL_CHANNEL_CB:
+ case PL_CHANNEL_CR:
+ t = PL_MAX(t, PLANE_CHROMA);
+ continue;
+
+ default: continue;
+ }
+ }
+
+ pl_assert(t);
+ return t;
+ }
+
+ // Extra test for exclusive / separated alpha plane
+ if (plane->components == 1 && plane->component_mapping[0] == PL_CHANNEL_A)
+ return PLANE_ALPHA;
+
+ switch (repr->sys) {
+ case PL_COLOR_SYSTEM_UNKNOWN: // fall through to RGB
+ case PL_COLOR_SYSTEM_RGB: return PLANE_RGB;
+ case PL_COLOR_SYSTEM_XYZ: return PLANE_XYZ;
+
+ // For the switch completeness check
+ case PL_COLOR_SYSTEM_BT_601:
+ case PL_COLOR_SYSTEM_BT_709:
+ case PL_COLOR_SYSTEM_SMPTE_240M:
+ case PL_COLOR_SYSTEM_BT_2020_NC:
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ case PL_COLOR_SYSTEM_BT_2100_PQ:
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ case PL_COLOR_SYSTEM_DOLBYVISION:
+ case PL_COLOR_SYSTEM_YCGCO:
+ case PL_COLOR_SYSTEM_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+struct pass_state {
+ void *tmp;
+ pl_renderer rr;
+ const struct pl_render_params *params;
+ struct pl_render_info info; // for info callback
+
+ // Represents the "current" image which we're in the process of rendering.
+ // This is initially set by pass_read_image, and all of the subsequent
+ // rendering steps will mutate this in-place.
+ struct img img;
+
+ // Represents the "reference rect". Canonically, this is functionally
+ // equivalent to `image.crop`, but also updates as the reference plane evolves
+ // (e.g. due to user hook prescalers)
+ pl_rect2df ref_rect;
+
+ // Integer version of `target.crop`. Semantically identical.
+ pl_rect2d dst_rect;
+
+ // Logical end-to-end rotation
+ pl_rotation rotation;
+
+ // Cached copies of the `image` / `target` for this rendering pass,
+ // corrected to make sure all rects etc. are properly defaulted/inferred.
+ struct pl_frame image;
+ struct pl_frame target;
+
+ // Cached copies of the `prev` / `next` frames, for deinterlacing.
+ struct pl_frame prev, next;
+
+ // Some extra plane metadata, inferred from `planes`
+ enum plane_type src_type[4];
+ int src_ref, dst_ref; // index into `planes`
+
+ // Metadata for `rr->fbos`
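+ // Usable FBO format per component count (indices 1-4); fbofmt[4] also
+ // serves as the indicator for whether FBOs are available at all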
+ pl_fmt fbofmt[5];
+ bool *fbos_used;
+ bool need_peak_fbo; // need indirection for peak detection
+
+ // Map of acquired frames
+ struct {
+ bool target, image, prev, next;
+ } acquired;
+};
+
+static void find_fbo_format(struct pass_state *pass)
+{
+ const struct pl_render_params *params = pass->params;
+ pl_renderer rr = pass->rr;
+ if (params->disable_fbos || (rr->errors & PL_RENDER_ERR_FBO) || pass->fbofmt[4])
+ return;
+
+ struct {
+ enum pl_fmt_type type;
+ int depth;
+ enum pl_fmt_caps caps;
+ } configs[] = {
+ // Prefer floating point formats first
+ {PL_FMT_FLOAT, 16, PL_FMT_CAP_LINEAR},
+ {PL_FMT_FLOAT, 16, PL_FMT_CAP_SAMPLEABLE},
+
+ // Otherwise, fall back to unorm/snorm, preferring linearly sampleable
+ {PL_FMT_UNORM, 16, PL_FMT_CAP_LINEAR},
+ {PL_FMT_SNORM, 16, PL_FMT_CAP_LINEAR},
+ {PL_FMT_UNORM, 16, PL_FMT_CAP_SAMPLEABLE},
+ {PL_FMT_SNORM, 16, PL_FMT_CAP_SAMPLEABLE},
+
+ // As a final fallback, allow 8-bit FBO formats (for UNORM only)
+ {PL_FMT_UNORM, 8, PL_FMT_CAP_LINEAR},
+ {PL_FMT_UNORM, 8, PL_FMT_CAP_SAMPLEABLE},
+ };
+
+ pl_fmt fmt = NULL;
+ for (int i = 0; i < PL_ARRAY_SIZE(configs); i++) {
+ if (params->force_low_bit_depth_fbos && configs[i].depth > 8)
+ continue;
+
+ fmt = pl_find_fmt(rr->gpu, configs[i].type, 4, configs[i].depth, 0,
+ PL_FMT_CAP_RENDERABLE | configs[i].caps);
+ if (!fmt)
+ continue;
+
+ pass->fbofmt[4] = fmt;
+
+ // Probe the right variant for each number of channels, falling
+ // back to the next biggest format
+ for (int c = 1; c < 4; c++) {
+ pass->fbofmt[c] = pl_find_fmt(rr->gpu, configs[i].type, c,
+ configs[i].depth, 0, fmt->caps);
+ pass->fbofmt[c] = PL_DEF(pass->fbofmt[c], pass->fbofmt[c+1]);
+ }
+ return;
+ }
+
+ PL_WARN(rr, "Found no renderable FBO format! Most features disabled");
+ rr->errors |= PL_RENDER_ERR_FBO;
+}
+
+static void info_callback(void *priv, const struct pl_dispatch_info *dinfo)
+{
+ struct pass_state *pass = priv;
+ const struct pl_render_params *params = pass->params;
+ if (!params->info_callback)
+ return;
+
+ pass->info.pass = dinfo;
+ params->info_callback(params->info_priv, &pass->info);
+ pass->info.index++;
+}
+
+static pl_tex get_fbo(struct pass_state *pass, int w, int h, pl_fmt fmt,
+ int comps, pl_debug_tag debug_tag)
+{
+ pl_renderer rr = pass->rr;
+ comps = PL_DEF(comps, 4);
+ fmt = PL_DEF(fmt, pass->fbofmt[comps]);
+ if (!fmt)
+ return NULL;
+
+ struct pl_tex_params params = {
+ .w = w,
+ .h = h,
+ .format = fmt,
+ .sampleable = true,
+ .renderable = true,
+ .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE,
+ .storable = fmt->caps & PL_FMT_CAP_STORABLE,
+ .debug_tag = debug_tag,
+ };
+
+ int best_idx = -1;
+ int best_diff = 0;
+
+ // Find the best-fitting texture out of rr->fbos
+ for (int i = 0; i < rr->fbos.num; i++) {
+ if (pass->fbos_used[i])
+ continue;
+
+ // Orthogonal distance, with penalty for format mismatches
+ int diff = abs(rr->fbos.elem[i]->params.w - w) +
+ abs(rr->fbos.elem[i]->params.h - h) +
+ ((rr->fbos.elem[i]->params.format != fmt) ? 1000 : 0);
+
+ if (best_idx < 0 || diff < best_diff) {
+ best_idx = i;
+ best_diff = diff;
+ }
+ }
+
+ // No texture found at all, add a new one
+ if (best_idx < 0) {
+ best_idx = rr->fbos.num;
+ PL_ARRAY_APPEND(rr, rr->fbos, NULL);
+ pl_grow(pass->tmp, &pass->fbos_used, rr->fbos.num * sizeof(bool));
+ pass->fbos_used[best_idx] = false;
+ }
+
+ if (!pl_tex_recreate(rr->gpu, &rr->fbos.elem[best_idx], &params))
+ return NULL;
+
+ pass->fbos_used[best_idx] = true;
+ return rr->fbos.elem[best_idx];
+}
+
+// Forcibly convert an img to `tex`, dispatching where necessary
+static pl_tex _img_tex(struct pass_state *pass, struct img *img, pl_debug_tag tag)
+{
+ if (img->tex) {
+ pl_assert(!img->sh);
+ return img->tex;
+ }
+
+ pl_renderer rr = pass->rr;
+ pl_tex tex = get_fbo(pass, img->w, img->h, img->fmt, img->comps, tag);
+ img->fmt = NULL;
+
+ if (!tex) {
+ PL_ERR(rr, "Failed creating FBO texture! Disabling advanced rendering..");
+ memset(pass->fbofmt, 0, sizeof(pass->fbofmt));
+ pl_dispatch_abort(rr->dp, &img->sh);
+ rr->errors |= PL_RENDER_ERR_FBO;
+ return img->err_tex;
+ }
+
+ pl_assert(img->sh);
+ bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params(
+ .shader = &img->sh,
+ .target = tex,
+ ));
+
+ const char *err_msg = img->err_msg;
+ enum pl_render_error err_enum = img->err_enum;
+ pl_tex err_tex = img->err_tex;
+ img->err_msg = NULL;
+ img->err_enum = PL_RENDER_ERR_NONE;
+ img->err_tex = NULL;
+
+ if (!ok) {
+ PL_ERR(rr, "%s", PL_DEF(err_msg, "Failed dispatching intermediate pass!"));
+ rr->errors |= err_enum;
+ img->sh = pl_dispatch_begin(rr->dp);
+ img->tex = err_tex;
+ return img->tex;
+ }
+
+ img->tex = tex;
+ return img->tex;
+}
+
+#define img_tex(pass, img) _img_tex(pass, img, PL_DEBUG_TAG)
+
+// Forcibly convert an img to `sh`, sampling where necessary
+static pl_shader img_sh(struct pass_state *pass, struct img *img)
+{
+ if (img->sh) {
+ pl_assert(!img->tex);
+ return img->sh;
+ }
+
+ pl_assert(img->tex);
+ img->sh = pl_dispatch_begin_ex(pass->rr->dp, img->unique);
+ pl_shader_sample_direct(img->sh, pl_sample_src( .tex = img->tex ));
+
+ img->tex = NULL;
+ return img->sh;
+}
+
+enum sampler_type {
+ SAMPLER_DIRECT, // pick based on texture caps
+ SAMPLER_NEAREST, // direct sampling, force nearest
+ SAMPLER_BICUBIC, // fast bicubic scaling
+ SAMPLER_HERMITE, // fast hermite scaling
+ SAMPLER_GAUSSIAN, // fast gaussian scaling
+ SAMPLER_COMPLEX, // complex custom filters
+ SAMPLER_OVERSAMPLE,
+};
+
+enum sampler_dir {
+ SAMPLER_NOOP, // 1:1 scaling
+ SAMPLER_UP, // upscaling
+ SAMPLER_DOWN, // downscaling
+};
+
+enum sampler_usage {
+ SAMPLER_MAIN,
+ SAMPLER_PLANE,
+ SAMPLER_CONTRAST,
+};
+
+struct sampler_info {
+ const struct pl_filter_config *config; // if applicable
+ enum sampler_usage usage;
+ enum sampler_type type;
+ enum sampler_dir dir;
+ enum sampler_dir dir_sep[2];
+};
+
+static struct sampler_info sample_src_info(struct pass_state *pass,
+ const struct pl_sample_src *src,
+ enum sampler_usage usage)
+{
+ const struct pl_render_params *params = pass->params;
+ struct sampler_info info = { .usage = usage };
+ pl_renderer rr = pass->rr;
+
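+ // Classify each axis independently as up- or downscaling, treating
+ // ratios within 1e-6 of 1.0 as a no-op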
+ float rx = src->new_w / fabsf(pl_rect_w(src->rect));
+ if (rx < 1.0 - 1e-6) {
+ info.dir_sep[0] = SAMPLER_DOWN;
+ } else if (rx > 1.0 + 1e-6) {
+ info.dir_sep[0] = SAMPLER_UP;
+ }
+
+ float ry = src->new_h / fabsf(pl_rect_h(src->rect));
+ if (ry < 1.0 - 1e-6) {
+ info.dir_sep[1] = SAMPLER_DOWN;
+ } else if (ry > 1.0 + 1e-6) {
+ info.dir_sep[1] = SAMPLER_UP;
+ }
+
+ if (params->correct_subpixel_offsets) {
+ if (!info.dir_sep[0] && fabsf(src->rect.x0) > 1e-6f)
+ info.dir_sep[0] = SAMPLER_UP;
+ if (!info.dir_sep[1] && fabsf(src->rect.y0) > 1e-6f)
+ info.dir_sep[1] = SAMPLER_UP;
+ }
+
+ // We use PL_MAX so downscaling overrides upscaling when choosing scalers
+ info.dir = PL_MAX(info.dir_sep[0], info.dir_sep[1]);
+ switch (info.dir) {
+ case SAMPLER_DOWN:
+ if (usage == SAMPLER_CONTRAST) {
+ info.config = &pl_filter_bicubic;
+ } else if (usage == SAMPLER_PLANE && params->plane_downscaler) {
+ info.config = params->plane_downscaler;
+ } else {
+ info.config = params->downscaler;
+ }
+ break;
+ case SAMPLER_UP:
+ if (usage == SAMPLER_PLANE && params->plane_upscaler) {
+ info.config = params->plane_upscaler;
+ } else {
+ pl_assert(usage != SAMPLER_CONTRAST);
+ info.config = params->upscaler;
+ }
+ break;
+ case SAMPLER_NOOP:
+ info.type = SAMPLER_NEAREST;
+ return info;
+ }
+
+ if ((rr->errors & PL_RENDER_ERR_SAMPLING) || !info.config) {
+ info.type = SAMPLER_DIRECT;
+ } else if (info.config->kernel == &pl_filter_function_oversample) {
+ info.type = SAMPLER_OVERSAMPLE;
+ } else {
+ info.type = SAMPLER_COMPLEX;
+
+ // Try using faster replacements for GPU built-in scalers
+ pl_fmt texfmt = src->tex ? src->tex->params.format : pass->fbofmt[4];
+ bool can_linear = texfmt->caps & PL_FMT_CAP_LINEAR;
+ bool can_fast = info.dir == SAMPLER_UP || params->skip_anti_aliasing;
+
+ if (can_fast && !params->disable_builtin_scalers) {
+ if (can_linear && info.config == &pl_filter_bicubic)
+ info.type = SAMPLER_BICUBIC;
+ if (can_linear && info.config == &pl_filter_hermite)
+ info.type = SAMPLER_HERMITE;
+ if (can_linear && info.config == &pl_filter_gaussian)
+ info.type = SAMPLER_GAUSSIAN;
+ if (can_linear && info.config == &pl_filter_bilinear)
+ info.type = SAMPLER_DIRECT;
+ if (info.config == &pl_filter_nearest)
+ info.type = can_linear ? SAMPLER_NEAREST : SAMPLER_DIRECT;
+ }
+ }
+
+ // Disable advanced scaling without FBOs
+ if (!pass->fbofmt[4] && info.type == SAMPLER_COMPLEX)
+ info.type = SAMPLER_DIRECT;
+
+ return info;
+}
+
+static void dispatch_sampler(struct pass_state *pass, pl_shader sh,
+ struct sampler *sampler, enum sampler_usage usage,
+ pl_tex target_tex, const struct pl_sample_src *src)
+{
+ const struct pl_render_params *params = pass->params;
+ if (!sampler)
+ goto fallback;
+
+ pl_renderer rr = pass->rr;
+ struct sampler_info info = sample_src_info(pass, src, usage);
+ pl_shader_obj *lut = NULL;
+ switch (info.dir) {
+ case SAMPLER_NOOP:
+ goto fallback;
+ case SAMPLER_DOWN:
+ lut = &sampler->downscaler_state;
+ break;
+ case SAMPLER_UP:
+ lut = &sampler->upscaler_state;
+ break;
+ }
+
+ switch (info.type) {
+ case SAMPLER_DIRECT:
+ goto fallback;
+ case SAMPLER_NEAREST:
+ pl_shader_sample_nearest(sh, src);
+ return;
+ case SAMPLER_OVERSAMPLE:
+ pl_shader_sample_oversample(sh, src, info.config->kernel->params[0]);
+ return;
+ case SAMPLER_BICUBIC:
+ pl_shader_sample_bicubic(sh, src);
+ return;
+ case SAMPLER_HERMITE:
+ pl_shader_sample_hermite(sh, src);
+ return;
+ case SAMPLER_GAUSSIAN:
+ pl_shader_sample_gaussian(sh, src);
+ return;
+ case SAMPLER_COMPLEX:
+ break; // continue below
+ }
+
+ pl_assert(lut);
+ struct pl_sample_filter_params fparams = {
+ .filter = *info.config,
+ .antiring = params->antiringing_strength,
+ .no_widening = params->skip_anti_aliasing && usage != SAMPLER_CONTRAST,
+ .lut = lut,
+ };
+
+ if (target_tex) {
+ fparams.no_compute = !target_tex->params.storable;
+ } else {
+ fparams.no_compute = !(pass->fbofmt[4]->caps & PL_FMT_CAP_STORABLE);
+ }
+
+ bool ok;
+ if (info.config->polar) {
+ // Polar samplers are always a single function call
+ ok = pl_shader_sample_polar(sh, src, &fparams);
+ } else if (info.dir_sep[0] && info.dir_sep[1]) {
+ // Scaling is needed in both directions
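+ // First pass (src1): scale vertically only, keeping the source width.
+ // Second pass (src2): scale horizontally from the intermediate FBO.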
+ struct pl_sample_src src1 = *src, src2 = *src;
+ src1.new_w = src->tex->params.w;
+ src1.rect.x0 = 0;
+ src1.rect.x1 = src1.new_w;
+ src2.rect.y0 = 0;
+ src2.rect.y1 = src1.new_h;
+
+ pl_shader tsh = pl_dispatch_begin(rr->dp);
+ ok = pl_shader_sample_ortho2(tsh, &src1, &fparams);
+ if (!ok) {
+ pl_dispatch_abort(rr->dp, &tsh);
+ goto done;
+ }
+
+ struct img img = {
+ .sh = tsh,
+ .w = src1.new_w,
+ .h = src1.new_h,
+ .comps = src->components,
+ };
+
+ src2.tex = img_tex(pass, &img);
+ src2.scale = 1.0;
+ ok = src2.tex && pl_shader_sample_ortho2(sh, &src2, &fparams);
+ } else {
+ // Scaling is needed only in one direction
+ ok = pl_shader_sample_ortho2(sh, src, &fparams);
+ }
+
+done:
+ if (!ok) {
+ PL_ERR(rr, "Failed dispatching scaler.. disabling");
+ rr->errors |= PL_RENDER_ERR_SAMPLING;
+ goto fallback;
+ }
+
+ return;
+
+fallback:
+ // If all else fails, fall back to auto sampling
+ pl_shader_sample_direct(sh, src);
+}
+
+static void swizzle_color(pl_shader sh, int comps, const int comp_map[4],
+ bool force_alpha)
+{
+ ident_t orig = sh_fresh(sh, "orig_color");
+ GLSL("vec4 "$" = color; \n"
+ "color = vec4(0.0, 0.0, 0.0, 1.0); \n", orig);
+
+ static const int def_map[4] = {0, 1, 2, 3};
+ comp_map = PL_DEF(comp_map, def_map);
+
+ for (int c = 0; c < comps; c++) {
+ if (comp_map[c] >= 0)
+ GLSL("color[%d] = "$"[%d]; \n", c, orig, comp_map[c]);
+ }
+
+ if (force_alpha)
+ GLSL("color.a = "$".a; \n", orig);
+}
+
+// `scale` adapts from `pass->dst_rect` to the plane being rendered to
+static void draw_overlays(struct pass_state *pass, pl_tex fbo,
+ int comps, const int comp_map[4],
+ const struct pl_overlay *overlays, int num,
+ struct pl_color_space color, struct pl_color_repr repr,
+ const pl_transform2x2 *output_shift)
+{
+ pl_renderer rr = pass->rr;
+ if (num <= 0 || (rr->errors & PL_RENDER_ERR_OVERLAY))
+ return;
+
+ enum pl_fmt_caps caps = fbo->params.format->caps;
+ if (!(rr->errors & PL_RENDER_ERR_BLENDING) &&
+ !(caps & PL_FMT_CAP_BLENDABLE))
+ {
+ PL_WARN(rr, "Trying to draw an overlay to a non-blendable target. "
+ "Alpha blending is disabled, results may be incorrect!");
+ rr->errors |= PL_RENDER_ERR_BLENDING;
+ }
+
+ const struct pl_frame *image = pass->src_ref >= 0 ? &pass->image : NULL;
+ pl_transform2x2 src_to_dst;
+ if (image) {
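+ // Construct the affine transform mapping image crop coordinates to
+ // output (dst_rect) coordinates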
+ float rx = pl_rect_w(pass->dst_rect) / pl_rect_w(image->crop);
+ float ry = pl_rect_h(pass->dst_rect) / pl_rect_h(image->crop);
+ src_to_dst = (pl_transform2x2) {
+ .mat.m = {{ rx, 0 }, { 0, ry }},
+ .c = {
+ pass->dst_rect.x0 - rx * image->crop.x0,
+ pass->dst_rect.y0 - ry * image->crop.y0,
+ },
+ };
+
+ if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) {
+ PL_SWAP(src_to_dst.c[0], src_to_dst.c[1]);
+ src_to_dst.mat = (pl_matrix2x2) {{{ 0, ry }, { rx, 0 }}};
+ }
+ }
+
+ const struct pl_frame *target = &pass->target;
+ pl_rect2df dst_crop = target->crop;
+ pl_rect2df_rotate(&dst_crop, -pass->rotation);
+ pl_rect2df_normalize(&dst_crop);
+
+ for (int n = 0; n < num; n++) {
+ struct pl_overlay ol = overlays[n];
+ if (!ol.num_parts)
+ continue;
+
+ if (!ol.coords) {
+ ol.coords = overlays == target->overlays
+ ? PL_OVERLAY_COORDS_DST_FRAME
+ : PL_OVERLAY_COORDS_SRC_FRAME;
+ }
+
+ pl_transform2x2 tf = pl_transform2x2_identity;
+ switch (ol.coords) {
+ case PL_OVERLAY_COORDS_SRC_CROP:
+ if (!image)
+ continue;
+ tf.c[0] = image->crop.x0;
+ tf.c[1] = image->crop.y0;
+ // fall through
+ case PL_OVERLAY_COORDS_SRC_FRAME:
+ if (!image)
+ continue;
+ pl_transform2x2_rmul(&src_to_dst, &tf);
+ break;
+ case PL_OVERLAY_COORDS_DST_CROP:
+ tf.c[0] = dst_crop.x0;
+ tf.c[1] = dst_crop.y0;
+ break;
+ case PL_OVERLAY_COORDS_DST_FRAME:
+ break;
+ case PL_OVERLAY_COORDS_AUTO:
+ case PL_OVERLAY_COORDS_COUNT:
+ pl_unreachable();
+ }
+
+ if (output_shift)
+ pl_transform2x2_rmul(output_shift, &tf);
+
+ // Construct vertex/index buffers
+ rr->osd_vertices.num = 0;
+ rr->osd_indices.num = 0;
+ for (int i = 0; i < ol.num_parts; i++) {
+ const struct pl_overlay_part *part = &ol.parts[i];
+
+#define EMIT_VERT(x, y) \
+ do { \
+ float pos[2] = { part->dst.x, part->dst.y }; \
+ pl_transform2x2_apply(&tf, pos); \
+ PL_ARRAY_APPEND(rr, rr->osd_vertices, (struct osd_vertex) { \
+ .pos = { \
+ 2.0 * (pos[0] / fbo->params.w) - 1.0, \
+ 2.0 * (pos[1] / fbo->params.h) - 1.0, \
+ }, \
+ .coord = { \
+ part->src.x / ol.tex->params.w, \
+ part->src.y / ol.tex->params.h, \
+ }, \
+ .color = { \
+ part->color[0], part->color[1], \
+ part->color[2], part->color[3], \
+ }, \
+ }); \
+ } while (0)
+
+ int idx_base = rr->osd_vertices.num;
+ EMIT_VERT(x0, y0); // idx 0: top left
+ EMIT_VERT(x1, y0); // idx 1: top right
+ EMIT_VERT(x0, y1); // idx 2: bottom left
+ EMIT_VERT(x1, y1); // idx 3: bottom right
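+ // Two triangles per part: (0,1,2) and (2,1,3)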
+ PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 0);
+ PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 1);
+ PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 2);
+ PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 2);
+ PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 1);
+ PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 3);
+ }
+
+ // Draw parts
+ pl_shader sh = pl_dispatch_begin(rr->dp);
+ ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+ .desc = {
+ .name = "osd_tex",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ .binding = {
+ .object = ol.tex,
+ .sample_mode = (ol.tex->params.format->caps & PL_FMT_CAP_LINEAR)
+ ? PL_TEX_SAMPLE_LINEAR
+ : PL_TEX_SAMPLE_NEAREST,
+ },
+ });
+
+ sh_describe(sh, "overlay");
+ GLSL("// overlay \n");
+
+ switch (ol.mode) {
+ case PL_OVERLAY_NORMAL:
+ GLSL("vec4 color = textureLod("$", coord, 0.0); \n", tex);
+ break;
+ case PL_OVERLAY_MONOCHROME:
+ GLSL("vec4 color = osd_color; \n");
+ break;
+ case PL_OVERLAY_MODE_COUNT:
+ pl_unreachable();
+ }
+
+ static const struct pl_color_map_params osd_params = {
+ PL_COLOR_MAP_DEFAULTS
+ .tone_mapping_function = &pl_tone_map_linear,
+ .gamut_mapping = &pl_gamut_map_saturation,
+ };
+
+ sh->output = PL_SHADER_SIG_COLOR;
+ pl_shader_decode_color(sh, &ol.repr, NULL);
+ if (target->icc)
+ color.transfer = PL_COLOR_TRC_LINEAR;
+ pl_shader_color_map_ex(sh, &osd_params, pl_color_map_args(ol.color, color));
+ if (target->icc)
+ pl_icc_encode(sh, target->icc, &rr->icc_state[ICC_TARGET]);
+
+ bool premul = repr.alpha == PL_ALPHA_PREMULTIPLIED;
+ pl_shader_encode_color(sh, &repr);
+ if (ol.mode == PL_OVERLAY_MONOCHROME) {
+ GLSL("color.%s *= textureLod("$", coord, 0.0).r; \n",
+ premul ? "rgba" : "a", tex);
+ }
+
+ swizzle_color(sh, comps, comp_map, true);
+
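+ // Standard source-over blending; the source factor accounts for whether
+ // the color is already premultiplied by alpha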
+ struct pl_blend_params blend_params = {
+ .src_rgb = premul ? PL_BLEND_ONE : PL_BLEND_SRC_ALPHA,
+ .src_alpha = PL_BLEND_ONE,
+ .dst_rgb = PL_BLEND_ONE_MINUS_SRC_ALPHA,
+ .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA,
+ };
+
+ bool ok = pl_dispatch_vertex(rr->dp, pl_dispatch_vertex_params(
+ .shader = &sh,
+ .target = fbo,
+ .blend_params = (rr->errors & PL_RENDER_ERR_BLENDING)
+ ? NULL : &blend_params,
+ .vertex_stride = sizeof(struct osd_vertex),
+ .num_vertex_attribs = ol.mode == PL_OVERLAY_NORMAL ? 2 : 3,
+ .vertex_attribs = rr->osd_attribs,
+ .vertex_position_idx = 0,
+ .vertex_coords = PL_COORDS_NORMALIZED,
+ .vertex_type = PL_PRIM_TRIANGLE_LIST,
+ .vertex_count = rr->osd_indices.num,
+ .vertex_data = rr->osd_vertices.elem,
+ .index_data = rr->osd_indices.elem,
+ ));
+
+ if (!ok) {
+ PL_ERR(rr, "Failed rendering overlays!");
+ rr->errors |= PL_RENDER_ERR_OVERLAY;
+ return;
+ }
+ }
+}
+
+static pl_tex get_hook_tex(void *priv, int width, int height)
+{
+ struct pass_state *pass = priv;
+
+ return get_fbo(pass, width, height, NULL, 4, PL_DEBUG_TAG);
+}
+
+// Returns whether any hook was applied (even if there were errors)
+static bool pass_hook(struct pass_state *pass, struct img *img,
+ enum pl_hook_stage stage)
+{
+ const struct pl_render_params *params = pass->params;
+ pl_renderer rr = pass->rr;
+ if (!pass->fbofmt[4] || !stage)
+ return false;
+
+ bool ret = false;
+
+ for (int n = 0; n < params->num_hooks; n++) {
+ const struct pl_hook *hook = params->hooks[n];
+ if (!(hook->stages & stage))
+ continue;
+
+ // Hopefully the list of disabled hooks is small, search linearly.
+ for (int i = 0; i < rr->disabled_hooks.num; i++) {
+ if (rr->disabled_hooks.elem[i] != hook->signature)
+ continue;
+ PL_TRACE(rr, "Skipping hook %d (0x%"PRIx64") stage 0x%x",
+ n, hook->signature, stage);
+ goto hook_skip;
+ }
+
+ PL_TRACE(rr, "Dispatching hook %d (0x%"PRIx64") stage 0x%x",
+ n, hook->signature, stage);
+ struct pl_hook_params hparams = {
+ .gpu = rr->gpu,
+ .dispatch = rr->dp,
+ .get_tex = get_hook_tex,
+ .priv = pass,
+ .stage = stage,
+ .rect = img->rect,
+ .repr = img->repr,
+ .color = img->color,
+ .orig_repr = &pass->image.repr,
+ .orig_color = &pass->image.color,
+ .components = img->comps,
+ .src_rect = pass->ref_rect,
+ .dst_rect = pass->dst_rect,
+ };
+
+ // TODO: Add some sort of `test` API function to the hooks that allows
+ // us to skip having to touch the `img` state at all for no-ops
+
+ switch (hook->input) {
+ case PL_HOOK_SIG_NONE:
+ break;
+
+ case PL_HOOK_SIG_TEX: {
+ hparams.tex = img_tex(pass, img);
+ if (!hparams.tex) {
+ PL_ERR(rr, "Failed dispatching shader prior to hook!");
+ goto hook_error;
+ }
+ break;
+ }
+
+ case PL_HOOK_SIG_COLOR:
+ hparams.sh = img_sh(pass, img);
+ break;
+
+ case PL_HOOK_SIG_COUNT:
+ pl_unreachable();
+ }
+
+ struct pl_hook_res res = hook->hook(hook->priv, &hparams);
+ if (res.failed) {
+ PL_ERR(rr, "Failed executing hook, disabling");
+ goto hook_error;
+ }
+
+ bool resizable = pl_hook_stage_resizable(stage);
+ switch (res.output) {
+ case PL_HOOK_SIG_NONE:
+ break;
+
+ case PL_HOOK_SIG_TEX:
+ if (!resizable) {
+ if (res.tex->params.w != img->w ||
+ res.tex->params.h != img->h ||
+ !pl_rect2d_eq(res.rect, img->rect))
+ {
+ PL_ERR(rr, "User hook tried resizing non-resizable stage!");
+ goto hook_error;
+ }
+ }
+
+ *img = (struct img) {
+ .tex = res.tex,
+ .repr = res.repr,
+ .color = res.color,
+ .comps = res.components,
+ .rect = res.rect,
+ .w = res.tex->params.w,
+ .h = res.tex->params.h,
+ .unique = img->unique,
+ };
+ break;
+
+ case PL_HOOK_SIG_COLOR:
+ if (!resizable) {
+ if (res.sh->output_w != img->w ||
+ res.sh->output_h != img->h ||
+ !pl_rect2d_eq(res.rect, img->rect))
+ {
+ PL_ERR(rr, "User hook tried resizing non-resizable stage!");
+ goto hook_error;
+ }
+ }
+
+ *img = (struct img) {
+ .sh = res.sh,
+ .repr = res.repr,
+ .color = res.color,
+ .comps = res.components,
+ .rect = res.rect,
+ .w = res.sh->output_w,
+ .h = res.sh->output_h,
+ .unique = img->unique,
+ .err_enum = PL_RENDER_ERR_HOOKS,
+ .err_msg = "Failed applying user hook",
+ .err_tex = hparams.tex, // if any
+ };
+ break;
+
+ case PL_HOOK_SIG_COUNT:
+ pl_unreachable();
+ }
+
+ // a hook was performed successfully
+ ret = true;
+
+hook_skip:
+ continue;
+hook_error:
+ PL_ARRAY_APPEND(rr, rr->disabled_hooks, hook->signature);
+ rr->errors |= PL_RENDER_ERR_HOOKS;
+ }
+
+ // Make sure the state remains as valid as possible, even if the resulting
+ // shaders might end up nonsensical, to prevent segfaults
+ if (!img->tex && !img->sh)
+ img->sh = pl_dispatch_begin(rr->dp);
+ return ret;
+}
+
+static void hdr_update_peak(struct pass_state *pass)
+{
+ const struct pl_render_params *params = pass->params;
+ pl_renderer rr = pass->rr;
+ if (!params->peak_detect_params || !pl_color_space_is_hdr(&pass->img.color))
+ goto cleanup;
+
+ if (rr->errors & PL_RENDER_ERR_PEAK_DETECT)
+ goto cleanup;
+
+ if (pass->fbofmt[4] && !(pass->fbofmt[4]->caps & PL_FMT_CAP_STORABLE))
+ goto cleanup;
+
+ if (!rr->gpu->limits.max_ssbo_size)
+ goto cleanup;
+
+ float max_peak = pl_color_transfer_nominal_peak(pass->img.color.transfer) *
+ PL_COLOR_SDR_WHITE;
+ if (pass->img.color.transfer == PL_COLOR_TRC_HLG)
+ max_peak = pass->img.color.hdr.max_luma;
+ if (max_peak <= pass->target.color.hdr.max_luma + 1e-6)
+ goto cleanup; // no adaptation needed
+
+ if (pass->img.color.hdr.avg_pq_y)
+ goto cleanup; // DV metadata already present
+
+ enum pl_hdr_metadata_type metadata = PL_HDR_METADATA_ANY;
+ if (params->color_map_params)
+ metadata = params->color_map_params->metadata;
+
+ if (metadata && metadata != PL_HDR_METADATA_CIE_Y)
+ goto cleanup; // metadata will be unused
+
+ const struct pl_color_map_params *cpars = params->color_map_params;
+ bool uses_ootf = cpars && cpars->tone_mapping_function == &pl_tone_map_st2094_40;
+ if (uses_ootf && pass->img.color.hdr.ootf.num_anchors)
+ goto cleanup; // HDR10+ OOTF is being used
+
+ if (params->lut && params->lut_type == PL_LUT_CONVERSION)
+ goto cleanup; // LUT handles tone mapping
+
+ if (!pass->fbofmt[4] && !params->peak_detect_params->allow_delayed) {
+ PL_WARN(rr, "Disabling peak detection because "
+ "`pl_peak_detect_params.allow_delayed` is false, but lack of "
+ "FBOs forces the result to be delayed.");
+ rr->errors |= PL_RENDER_ERR_PEAK_DETECT;
+ goto cleanup;
+ }
+
+ bool ok = pl_shader_detect_peak(img_sh(pass, &pass->img), pass->img.color,
+ &rr->tone_map_state, params->peak_detect_params);
+ if (!ok) {
+ PL_WARN(rr, "Failed creating HDR peak detection shader.. disabling");
+ rr->errors |= PL_RENDER_ERR_PEAK_DETECT;
+ goto cleanup;
+ }
+
+ pass->need_peak_fbo = !params->peak_detect_params->allow_delayed;
+ return;
+
+cleanup:
+ // No peak detection required or supported, so clean up the state to avoid
+ // confusing it with later frames where peak detection is enabled again
+ pl_reset_detected_peak(rr->tone_map_state);
+}
+
+bool pl_renderer_get_hdr_metadata(pl_renderer rr,
+ struct pl_hdr_metadata *metadata)
+{
+ return pl_get_detected_hdr_metadata(rr->tone_map_state, metadata);
+}
+
+struct plane_state {
+ enum plane_type type;
+ struct pl_plane plane;
+ struct img img; // for per-plane shaders
+ float plane_w, plane_h; // logical plane dimensions
+};
+
+static const char *plane_type_names[] = {
+ [PLANE_INVALID] = "invalid",
+ [PLANE_ALPHA] = "alpha",
+ [PLANE_CHROMA] = "chroma",
+ [PLANE_LUMA] = "luma",
+ [PLANE_RGB] = "rgb",
+ [PLANE_XYZ] = "xyz",
+};
+
+static void log_plane_info(pl_renderer rr, const struct plane_state *st)
+{
+ const struct pl_plane *plane = &st->plane;
+ PL_TRACE(rr, " Type: %s", plane_type_names[st->type]);
+
+ switch (plane->components) {
+ case 0:
+ PL_TRACE(rr, " Components: (none)");
+ break;
+ case 1:
+ PL_TRACE(rr, " Components: {%d}",
+ plane->component_mapping[0]);
+ break;
+ case 2:
+ PL_TRACE(rr, " Components: {%d %d}",
+ plane->component_mapping[0],
+ plane->component_mapping[1]);
+ break;
+ case 3:
+ PL_TRACE(rr, " Components: {%d %d %d}",
+ plane->component_mapping[0],
+ plane->component_mapping[1],
+ plane->component_mapping[2]);
+ break;
+ case 4:
+ PL_TRACE(rr, " Components: {%d %d %d %d}",
+ plane->component_mapping[0],
+ plane->component_mapping[1],
+ plane->component_mapping[2],
+ plane->component_mapping[3]);
+ break;
+ }
+
+ PL_TRACE(rr, " Rect: {%f %f} -> {%f %f}",
+ st->img.rect.x0, st->img.rect.y0, st->img.rect.x1, st->img.rect.y1);
+
+ PL_TRACE(rr, " Bits: %d (used) / %d (sampled), shift %d",
+ st->img.repr.bits.color_depth,
+ st->img.repr.bits.sample_depth,
+ st->img.repr.bits.bit_shift);
+}
+
+// Returns true if debanding was applied
+static bool plane_deband(struct pass_state *pass, struct img *img, float neutral[3])
+{
+ const struct pl_render_params *params = pass->params;
+ const struct pl_frame *image = &pass->image;
+ pl_renderer rr = pass->rr;
+ if ((rr->errors & PL_RENDER_ERR_DEBANDING) ||
+ !params->deband_params || !pass->fbofmt[4])
+ {
+ return false;
+ }
+
+ struct pl_color_repr repr = img->repr;
+ struct pl_sample_src src = {
+ .tex = img_tex(pass, img),
+ .components = img->comps,
+ .scale = pl_color_repr_normalize(&repr),
+ };
+
+ if (!(src.tex->params.format->caps & PL_FMT_CAP_LINEAR)) {
+ PL_WARN(rr, "Debanding requires uploaded textures to be linearly "
+ "sampleable (params.sample_mode = PL_TEX_SAMPLE_LINEAR)! "
+ "Disabling debanding..");
+ rr->errors |= PL_RENDER_ERR_DEBANDING;
+ return false;
+ }
+
+ // Divide the deband grain scale by the effective current colorspace nominal
+ // peak, to make sure the output intensity of the grain is as independent
+ // of the source as possible, even though it happens this early in the
+ // process (well before any linearization / output adaptation)
+ struct pl_deband_params dparams = *params->deband_params;
+ dparams.grain /= image->color.hdr.max_luma / PL_COLOR_SDR_WHITE;
+ memcpy(dparams.grain_neutral, neutral, sizeof(dparams.grain_neutral));
+
+ img->tex = NULL;
+ img->sh = pl_dispatch_begin_ex(rr->dp, true);
+ pl_shader_deband(img->sh, &src, &dparams);
+ img->err_msg = "Failed applying debanding... disabling!";
+ img->err_enum = PL_RENDER_ERR_DEBANDING;
+ img->err_tex = src.tex;
+ img->repr = repr;
+ return true;
+}
+
+// Returns true if grain was applied
+static bool plane_film_grain(struct pass_state *pass, int plane_idx,
+ struct plane_state *st,
+ const struct plane_state *ref)
+{
+ const struct pl_frame *image = &pass->image;
+ pl_renderer rr = pass->rr;
+ if (rr->errors & PL_RENDER_ERR_FILM_GRAIN)
+ return false;
+
+ struct img *img = &st->img;
+ struct pl_plane *plane = &st->plane;
+ struct pl_color_repr repr = image->repr;
+ bool is_orig_repr = pl_color_repr_equal(&st->img.repr, &image->repr);
+ if (!is_orig_repr) {
+ // Propagate the original color depth to the film grain algorithm, but
+ // update the sample depth and effective bit shift based on the state
+ // of the current texture, which is guaranteed to already be
+ // normalized.
+ pl_assert(st->img.repr.bits.bit_shift == 0);
+ repr.bits.sample_depth = st->img.repr.bits.sample_depth;
+ repr.bits.bit_shift = repr.bits.sample_depth - repr.bits.color_depth;
+ }
+
+ struct pl_film_grain_params grain_params = {
+ .data = image->film_grain,
+ .luma_tex = ref->plane.texture,
+ .repr = &repr,
+ .components = plane->components,
+ };
+
+ switch (image->film_grain.type) {
+ case PL_FILM_GRAIN_NONE: return false;
+ case PL_FILM_GRAIN_H274: break;
+ case PL_FILM_GRAIN_AV1:
+ grain_params.luma_tex = ref->plane.texture;
+ for (int c = 0; c < ref->plane.components; c++) {
+ if (ref->plane.component_mapping[c] == PL_CHANNEL_Y)
+ grain_params.luma_comp = c;
+ }
+ break;
+ default: pl_unreachable();
+ }
+
+ for (int c = 0; c < plane->components; c++)
+ grain_params.component_mapping[c] = plane->component_mapping[c];
+
+ if (!pl_needs_film_grain(&grain_params))
+ return false;
+
+ if (!pass->fbofmt[plane->components]) {
+ PL_ERR(rr, "Film grain required but no renderable format available.. "
+ "disabling!");
+ rr->errors |= PL_RENDER_ERR_FILM_GRAIN;
+ return false;
+ }
+
+ grain_params.tex = img_tex(pass, img);
+ if (!grain_params.tex)
+ return false;
+
+ img->sh = pl_dispatch_begin_ex(rr->dp, true);
+ if (!pl_shader_film_grain(img->sh, &rr->grain_state[plane_idx], &grain_params)) {
+ pl_dispatch_abort(rr->dp, &img->sh);
+ rr->errors |= PL_RENDER_ERR_FILM_GRAIN;
+ return false;
+ }
+
+ img->tex = NULL;
+ img->err_msg = "Failed applying film grain.. disabling!";
+ img->err_enum = PL_RENDER_ERR_FILM_GRAIN;
+ img->err_tex = grain_params.tex;
+ if (is_orig_repr)
+ img->repr = repr;
+ return true;
+}
+
+static const enum pl_hook_stage plane_hook_stages[] = {
+ [PLANE_ALPHA] = PL_HOOK_ALPHA_INPUT,
+ [PLANE_CHROMA] = PL_HOOK_CHROMA_INPUT,
+ [PLANE_LUMA] = PL_HOOK_LUMA_INPUT,
+ [PLANE_RGB] = PL_HOOK_RGB_INPUT,
+ [PLANE_XYZ] = PL_HOOK_XYZ_INPUT,
+};
+
+static const enum pl_hook_stage plane_scaled_hook_stages[] = {
+ [PLANE_ALPHA] = PL_HOOK_ALPHA_SCALED,
+ [PLANE_CHROMA] = PL_HOOK_CHROMA_SCALED,
+ [PLANE_LUMA] = 0, // never hooked
+ [PLANE_RGB] = 0,
+ [PLANE_XYZ] = 0,
+};
+
+static enum pl_lut_type guess_frame_lut_type(const struct pl_frame *frame,
+ bool reversed)
+{
+ if (!frame->lut)
+ return PL_LUT_UNKNOWN;
+ if (frame->lut_type)
+ return frame->lut_type;
+
+ enum pl_color_system sys_in = frame->lut->repr_in.sys;
+ enum pl_color_system sys_out = frame->lut->repr_out.sys;
+ if (reversed)
+ PL_SWAP(sys_in, sys_out);
+
+ if (sys_in == PL_COLOR_SYSTEM_RGB && sys_out == sys_in)
+ return PL_LUT_NORMALIZED;
+
+ if (sys_in == frame->repr.sys && sys_out == PL_COLOR_SYSTEM_RGB)
+ return PL_LUT_CONVERSION;
+
+ // Unknown, just fall back to the default
+ return PL_LUT_NATIVE;
+}
+
+static pl_fmt merge_fmt(struct pass_state *pass, const struct img *a,
+ const struct img *b)
+{
+ pl_renderer rr = pass->rr;
+ pl_fmt fmta = a->tex ? a->tex->params.format : PL_DEF(a->fmt, pass->fbofmt[a->comps]);
+ pl_fmt fmtb = b->tex ? b->tex->params.format : PL_DEF(b->fmt, pass->fbofmt[b->comps]);
+ pl_assert(fmta && fmtb);
+ if (fmta->type != fmtb->type)
+ return NULL;
+
+ int num_comps = PL_MIN(4, a->comps + b->comps);
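+ // The merged format must preserve at least the larger sample depth of
+ // the two inputs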
+ int min_depth = PL_MAX(a->repr.bits.sample_depth, b->repr.bits.sample_depth);
+
+ // Only return formats that support all relevant caps of both formats
+ const enum pl_fmt_caps mask = PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR;
+ enum pl_fmt_caps req_caps = (fmta->caps & mask) | (fmtb->caps & mask);
+
+ return pl_find_fmt(rr->gpu, fmta->type, num_comps, min_depth, 0, req_caps);
+}
+
+// Applies a series of rough heuristics to figure out whether we expect any
+// performance gains from plane merging. This is basically a series of checks
+// for operations that we *know* benefit from merged planes
+static bool want_merge(struct pass_state *pass,
+ const struct plane_state *st,
+ const struct plane_state *ref)
+{
+ const struct pl_render_params *params = pass->params;
+ const pl_renderer rr = pass->rr;
+ if (!pass->fbofmt[4])
+ return false;
+
+ // Debanding
+ if (!(rr->errors & PL_RENDER_ERR_DEBANDING) && params->deband_params)
+ return true;
+
+ // Other plane hooks, which are generally nontrivial
+ enum pl_hook_stage stage = plane_hook_stages[st->type];
+ for (int i = 0; i < params->num_hooks; i++) {
+ if (params->hooks[i]->stages & stage)
+ return true;
+ }
+
+ // Non-trivial scaling
+ struct pl_sample_src src = {
+ .new_w = ref->img.w,
+ .new_h = ref->img.h,
+ .rect = {
+ .x1 = st->img.w,
+ .y1 = st->img.h,
+ },
+ };
+
+ struct sampler_info info = sample_src_info(pass, &src, SAMPLER_PLANE);
+ if (info.type == SAMPLER_COMPLEX)
+ return true;
+
+ // Film grain synthesis, can be merged for compatible channels, saving on
+ // redundant sampling of the grain/offset textures
+ struct pl_film_grain_params grain_params = {
+ .data = pass->image.film_grain,
+ .repr = (struct pl_color_repr *) &st->img.repr,
+ .components = st->plane.components,
+ };
+
+ for (int c = 0; c < st->plane.components; c++)
+ grain_params.component_mapping[c] = st->plane.component_mapping[c];
+
+ if (!(rr->errors & PL_RENDER_ERR_FILM_GRAIN) &&
+ pl_needs_film_grain(&grain_params))
+ {
+ return true;
+ }
+
+ return false;
+}
+
+// This scales and merges all of the source images, and initializes pass->img.
+static bool pass_read_image(struct pass_state *pass)
+{
+ const struct pl_render_params *params = pass->params;
+ struct pl_frame *image = &pass->image;
+ pl_renderer rr = pass->rr;
+
+ struct plane_state planes[4];
+ struct plane_state *ref = &planes[pass->src_ref];
+ pl_assert(pass->src_ref >= 0 && pass->src_ref < image->num_planes);
+
+ for (int i = 0; i < image->num_planes; i++) {
+ planes[i] = (struct plane_state) {
+ .type = detect_plane_type(&image->planes[i], &image->repr),
+ .plane = image->planes[i],
+ .img = {
+ .w = image->planes[i].texture->params.w,
+ .h = image->planes[i].texture->params.h,
+ .tex = image->planes[i].texture,
+ .repr = image->repr,
+ .color = image->color,
+ .comps = image->planes[i].components,
+ },
+ };
+
+ // Deinterlace plane if needed
+ if (image->field != PL_FIELD_NONE && params->deinterlace_params &&
+ pass->fbofmt[4] && !(rr->errors & PL_RENDER_ERR_DEINTERLACING))
+ {
+ struct img *img = &planes[i].img;
+ struct pl_deinterlace_source src = {
+ .cur.top = img->tex,
+ .prev.top = image->prev ? image->prev->planes[i].texture : NULL,
+ .next.top = image->next ? image->next->planes[i].texture : NULL,
+ .field = image->field,
+ .first_field = image->first_field,
+ .component_mask = (1 << img->comps) - 1,
+ };
+
+ img->tex = NULL;
+ img->sh = pl_dispatch_begin_ex(pass->rr->dp, true);
+ pl_shader_deinterlace(img->sh, &src, params->deinterlace_params);
+ img->err_msg = "Failed deinterlacing plane.. disabling!";
+ img->err_enum = PL_RENDER_ERR_DEINTERLACING;
+ img->err_tex = planes[i].plane.texture;
+ }
+ }
+
+ // Original ref texture, even after preprocessing
+ pl_tex ref_tex = ref->plane.texture;
+
+ // Merge all compatible planes into 'combined' shaders
+ for (int i = 0; i < image->num_planes; i++) {
+ struct plane_state *sti = &planes[i];
+ if (!sti->type)
+ continue;
+ if (!want_merge(pass, sti, ref))
+ continue;
+
+ bool did_merge = false;
+ for (int j = i+1; j < image->num_planes; j++) {
+ struct plane_state *stj = &planes[j];
+ bool merge = sti->type == stj->type &&
+ sti->img.w == stj->img.w &&
+ sti->img.h == stj->img.h &&
+ sti->plane.shift_x == stj->plane.shift_x &&
+ sti->plane.shift_y == stj->plane.shift_y;
+ if (!merge)
+ continue;
+
+ pl_fmt fmt = merge_fmt(pass, &sti->img, &stj->img);
+ if (!fmt)
+ continue;
+
+ PL_TRACE(rr, "Merging plane %d into plane %d", j, i);
+ pl_shader sh = sti->img.sh;
+ if (!sh) {
+ sh = sti->img.sh = pl_dispatch_begin_ex(pass->rr->dp, true);
+ pl_shader_sample_direct(sh, pl_sample_src( .tex = sti->img.tex ));
+ sti->img.tex = NULL;
+ }
+
+ pl_shader psh = NULL;
+ if (!stj->img.sh) {
+ psh = pl_dispatch_begin_ex(pass->rr->dp, true);
+ pl_shader_sample_direct(psh, pl_sample_src( .tex = stj->img.tex ));
+ }
+
+ ident_t sub = sh_subpass(sh, psh ? psh : stj->img.sh);
+ pl_dispatch_abort(rr->dp, &psh);
+ if (!sub)
+ break; // skip merging
+
+ sh_describe(sh, "merging planes");
+ GLSL("{ \n"
+ "vec4 tmp = "$"(); \n", sub);
+ for (int jc = 0; jc < stj->img.comps; jc++) {
+ int map = stj->plane.component_mapping[jc];
+ if (map == PL_CHANNEL_NONE)
+ continue;
+ int ic = sti->img.comps++;
+ pl_assert(ic < 4);
+ GLSL("color[%d] = tmp[%d]; \n", ic, jc);
+ sti->plane.components = sti->img.comps;
+ sti->plane.component_mapping[ic] = map;
+ }
+ GLSL("} \n");
+
+ sti->img.fmt = fmt;
+ pl_dispatch_abort(rr->dp, &stj->img.sh);
+ *stj = (struct plane_state) {0};
+ did_merge = true;
+ }
+
+ if (!did_merge)
+ continue;
+
+ if (!img_tex(pass, &sti->img)) {
+ PL_ERR(rr, "Failed dispatching plane merging shader, disabling FBOs!");
+ memset(pass->fbofmt, 0, sizeof(pass->fbofmt));
+ rr->errors |= PL_RENDER_ERR_FBO;
+ return false;
+ }
+ }
+
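+ // Compute the neutral (black / achromatic) luma and chroma values in
+ // normalized texture units, where the code value 2^bits - 1 maps to 1.0;
+ // these are later passed to debanding as the grain-neutral values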
+ int bits = image->repr.bits.sample_depth;
+ float out_scale = bits ? (1llu << bits) / ((1llu << bits) - 1.0f) : 1.0f;
+ float neutral_luma = 0.0, neutral_chroma = 0.5f * out_scale;
+ if (pl_color_levels_guess(&image->repr) == PL_COLOR_LEVELS_LIMITED)
+ neutral_luma = 16 / 256.0f * out_scale;
+ if (!pl_color_system_is_ycbcr_like(image->repr.sys))
+ neutral_chroma = neutral_luma;
+
+ // Compute the sampling rc of each plane
+ for (int i = 0; i < image->num_planes; i++) {
+ struct plane_state *st = &planes[i];
+ if (!st->type)
+ continue;
+
+ float rx = (float) st->plane.texture->params.w / ref_tex->params.w,
+ ry = (float) st->plane.texture->params.h / ref_tex->params.h;
+
+ // Only accept integer scaling ratios. This accounts for the fact that
+ // fractionally subsampled planes get rounded up to the nearest integer
+ // size, which we want to discard.
+ float rrx = rx >= 1 ? roundf(rx) : 1.0 / roundf(1.0 / rx),
+ rry = ry >= 1 ? roundf(ry) : 1.0 / roundf(1.0 / ry);
+
+ float sx = st->plane.shift_x,
+ sy = st->plane.shift_y;
+
+ st->img.rect = (pl_rect2df) {
+ .x0 = (image->crop.x0 - sx) * rrx,
+ .y0 = (image->crop.y0 - sy) * rry,
+ .x1 = (image->crop.x1 - sx) * rrx,
+ .y1 = (image->crop.y1 - sy) * rry,
+ };
+
+ st->plane_w = ref_tex->params.w * rrx;
+ st->plane_h = ref_tex->params.h * rry;
+
+ PL_TRACE(rr, "Plane %d:", i);
+ log_plane_info(rr, st);
+
+ float neutral[3] = {0.0};
+ for (int c = 0, idx = 0; c < st->plane.components; c++) {
+ switch (st->plane.component_mapping[c]) {
+ case PL_CHANNEL_Y: neutral[idx++] = neutral_luma; break;
+ case PL_CHANNEL_U: // fall through
+ case PL_CHANNEL_V: neutral[idx++] = neutral_chroma; break;
+ }
+ }
+
+ // The order of operations (deband -> film grain -> user hooks) is
+ // chosen to maximize quality. Note that film grain requires unmodified
+ // plane sizes, so it has to come before user hooks. Debanding, in turn,
+ // is less effective after plane scalers, and is also made less effective
+ // by performing film grain synthesis first.
+
+ if (plane_deband(pass, &st->img, neutral)) {
+ PL_TRACE(rr, "After debanding:");
+ log_plane_info(rr, st);
+ }
+
+ if (plane_film_grain(pass, i, st, ref)) {
+ PL_TRACE(rr, "After film grain:");
+ log_plane_info(rr, st);
+ }
+
+ if (pass_hook(pass, &st->img, plane_hook_stages[st->type])) {
+ PL_TRACE(rr, "After user hooks:");
+ log_plane_info(rr, st);
+ }
+ }
+
+ pl_shader sh = pl_dispatch_begin_ex(rr->dp, true);
+ sh_require(sh, PL_SHADER_SIG_NONE, 0, 0);
+
+ // Initialize the color to black
+ GLSL("vec4 color = vec4("$", vec2("$"), 1.0); \n"
+ "// pass_read_image \n"
+ "{ \n"
+ "vec4 tmp; \n",
+ SH_FLOAT(neutral_luma), SH_FLOAT(neutral_chroma));
+
+ // For quality reasons, explicitly drop subpixel offsets from the ref rect
+ // and re-add them as part of `pass->img.rect`, always rounding towards 0.
+ // Additionally, drop anamorphic subpixel mismatches.
+ pl_rect2d ref_rounded;
+ ref_rounded.x0 = truncf(ref->img.rect.x0);
+ ref_rounded.y0 = truncf(ref->img.rect.y0);
+ ref_rounded.x1 = ref_rounded.x0 + roundf(pl_rect_w(ref->img.rect));
+ ref_rounded.y1 = ref_rounded.y0 + roundf(pl_rect_h(ref->img.rect));
+
+ PL_TRACE(rr, "Rounded reference rect: {%d %d %d %d}",
+ ref_rounded.x0, ref_rounded.y0,
+ ref_rounded.x1, ref_rounded.y1);
+
+ float off_x = ref->img.rect.x0 - ref_rounded.x0,
+ off_y = ref->img.rect.y0 - ref_rounded.y0,
+ stretch_x = pl_rect_w(ref_rounded) / pl_rect_w(ref->img.rect),
+ stretch_y = pl_rect_h(ref_rounded) / pl_rect_h(ref->img.rect);
+
+ for (int i = 0; i < image->num_planes; i++) {
+ struct plane_state *st = &planes[i];
+ const struct pl_plane *plane = &st->plane;
+ if (!st->type)
+ continue;
+
+ float scale_x = pl_rect_w(st->img.rect) / pl_rect_w(ref->img.rect),
+ scale_y = pl_rect_h(st->img.rect) / pl_rect_h(ref->img.rect),
+ base_x = st->img.rect.x0 - scale_x * off_x,
+ base_y = st->img.rect.y0 - scale_y * off_y;
+
+ struct pl_sample_src src = {
+ .components = plane->components,
+ .address_mode = plane->address_mode,
+ .scale = pl_color_repr_normalize(&st->img.repr),
+ .new_w = pl_rect_w(ref_rounded),
+ .new_h = pl_rect_h(ref_rounded),
+ .rect = {
+ base_x,
+ base_y,
+ base_x + stretch_x * pl_rect_w(st->img.rect),
+ base_y + stretch_y * pl_rect_h(st->img.rect),
+ },
+ };
+
+ if (plane->flipped) {
+ src.rect.y0 = st->plane_h - src.rect.y0;
+ src.rect.y1 = st->plane_h - src.rect.y1;
+ }
+
+ PL_TRACE(rr, "Aligning plane %d: {%f %f %f %f} -> {%f %f %f %f}%s",
+ i, st->img.rect.x0, st->img.rect.y0,
+ st->img.rect.x1, st->img.rect.y1,
+ src.rect.x0, src.rect.y0,
+ src.rect.x1, src.rect.y1,
+ plane->flipped ? " (flipped) " : "");
+
+ st->img.unique = true;
+ pl_rect2d unscaled = { .x1 = src.new_w, .y1 = src.new_h };
+ if (st->img.sh && st->img.w == src.new_w && st->img.h == src.new_h &&
+ pl_rect2d_eq(src.rect, unscaled))
+ {
+ // Image rects are already equal, no indirect scaling needed
+ } else {
+ src.tex = img_tex(pass, &st->img);
+ st->img.tex = NULL;
+ st->img.sh = pl_dispatch_begin_ex(rr->dp, true);
+ dispatch_sampler(pass, st->img.sh, &rr->samplers_src[i],
+ SAMPLER_PLANE, NULL, &src);
+ st->img.err_enum |= PL_RENDER_ERR_SAMPLING;
+ st->img.rect.x0 = st->img.rect.y0 = 0.0f;
+ st->img.w = st->img.rect.x1 = src.new_w;
+ st->img.h = st->img.rect.y1 = src.new_h;
+ }
+
+ pass_hook(pass, &st->img, plane_scaled_hook_stages[st->type]);
+ ident_t sub = sh_subpass(sh, img_sh(pass, &st->img));
+ if (!sub) {
+ if (!img_tex(pass, &st->img)) {
+ pl_dispatch_abort(rr->dp, &sh);
+ return false;
+ }
+
+ sub = sh_subpass(sh, img_sh(pass, &st->img));
+ pl_assert(sub);
+ }
+
+ GLSL("tmp = "$"(); \n", sub);
+ for (int c = 0; c < src.components; c++) {
+ if (plane->component_mapping[c] < 0)
+ continue;
+ GLSL("color[%d] = tmp[%d];\n", plane->component_mapping[c], c);
+ }
+
+ // The plane's shader has been merged into `sh`, so we no longer need it
+ pl_dispatch_abort(rr->dp, &st->img.sh);
+ }
+
+ GLSL("}\n");
+
+ pass->img = (struct img) {
+ .sh = sh,
+ .w = pl_rect_w(ref_rounded),
+ .h = pl_rect_h(ref_rounded),
+ .repr = ref->img.repr,
+ .color = image->color,
+ .comps = ref->img.repr.alpha ? 4 : 3,
+ .rect = {
+ off_x,
+ off_y,
+ off_x + pl_rect_w(ref->img.rect),
+ off_y + pl_rect_h(ref->img.rect),
+ },
+ };
+
+ // Update the reference rect to our adjusted image coordinates
+ pass->ref_rect = pass->img.rect;
+
+ pass_hook(pass, &pass->img, PL_HOOK_NATIVE);
+
+ // Apply LUT logic and colorspace conversion
+ enum pl_lut_type lut_type = guess_frame_lut_type(image, false);
+ sh = img_sh(pass, &pass->img);
+ bool needs_conversion = true;
+
+ if (lut_type == PL_LUT_NATIVE || lut_type == PL_LUT_CONVERSION) {
+ // Fix bit depth normalization before applying LUT
+ float scale = pl_color_repr_normalize(&pass->img.repr);
+ GLSL("color *= vec4("$"); \n", SH_FLOAT(scale));
+ pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_INDEPENDENT);
+ pl_shader_custom_lut(sh, image->lut, &rr->lut_state[LUT_IMAGE]);
+
+ if (lut_type == PL_LUT_CONVERSION) {
+ pass->img.repr.sys = PL_COLOR_SYSTEM_RGB;
+ pass->img.repr.levels = PL_COLOR_LEVELS_FULL;
+ needs_conversion = false;
+ }
+ }
+
+ if (needs_conversion) {
+ if (pass->img.repr.sys == PL_COLOR_SYSTEM_XYZ)
+ pass->img.color.transfer = PL_COLOR_TRC_LINEAR;
+ pl_shader_decode_color(sh, &pass->img.repr, params->color_adjustment);
+ }
+
+ if (lut_type == PL_LUT_NORMALIZED)
+ pl_shader_custom_lut(sh, image->lut, &rr->lut_state[LUT_IMAGE]);
+
+ // A main PL_LUT_CONVERSION LUT overrides ICC profiles
+ bool main_lut_override = params->lut && params->lut_type == PL_LUT_CONVERSION;
+ if (image->icc && !main_lut_override) {
+ pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_INDEPENDENT);
+ pl_icc_decode(sh, image->icc, &rr->icc_state[ICC_IMAGE], &pass->img.color);
+ }
+
+ // Pre-multiply alpha channel before the rest of the pipeline, to avoid
+ // bleeding colors from transparent regions into non-transparent regions
+ pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_PREMULTIPLIED);
+
+ pass_hook(pass, &pass->img, PL_HOOK_RGB);
+ sh = NULL;
+ return true;
+}
+
+static bool pass_scale_main(struct pass_state *pass)
+{
+ const struct pl_render_params *params = pass->params;
+ pl_renderer rr = pass->rr;
+
+ pl_fmt fbofmt = pass->fbofmt[pass->img.comps];
+ if (!fbofmt) {
+ PL_TRACE(rr, "Skipping main scaler (no FBOs)");
+ return true;
+ }
+
+ const pl_rect2df new_rect = {
+ .x1 = abs(pl_rect_w(pass->dst_rect)),
+ .y1 = abs(pl_rect_h(pass->dst_rect)),
+ };
+
+ struct img *img = &pass->img;
+ struct pl_sample_src src = {
+ .components = img->comps,
+ .new_w = pl_rect_w(new_rect),
+ .new_h = pl_rect_h(new_rect),
+ .rect = img->rect,
+ };
+
+ const struct pl_frame *image = &pass->image;
+ bool need_fbo = false;
+
+ // Force FBO indirection if this shader is non-resizable
+ int out_w, out_h;
+ if (img->sh && pl_shader_output_size(img->sh, &out_w, &out_h))
+ need_fbo |= out_w != src.new_w || out_h != src.new_h;
+
+ struct sampler_info info = sample_src_info(pass, &src, SAMPLER_MAIN);
+ bool use_sigmoid = info.dir == SAMPLER_UP && params->sigmoid_params;
+ bool use_linear = info.dir == SAMPLER_DOWN;
+
+ // Opportunistically update peak here if it would save performance
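+ // (running it before the upscaler means it operates on fewer pixels)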
+ if (info.dir == SAMPLER_UP)
+ hdr_update_peak(pass);
+
+ // We need to enable the full rendering pipeline if there are any user
+ // shaders / hooks that might depend on it.
+ uint64_t scaling_hooks = PL_HOOK_PRE_KERNEL | PL_HOOK_POST_KERNEL;
+ uint64_t linear_hooks = PL_HOOK_LINEAR | PL_HOOK_SIGMOID;
+
+ for (int i = 0; i < params->num_hooks; i++) {
+ if (params->hooks[i]->stages & (scaling_hooks | linear_hooks)) {
+ need_fbo = true;
+ if (params->hooks[i]->stages & linear_hooks)
+ use_linear = true;
+ if (params->hooks[i]->stages & PL_HOOK_SIGMOID)
+ use_sigmoid = true;
+ }
+ }
+
+ if (info.dir == SAMPLER_NOOP && !need_fbo) {
+ pl_assert(src.new_w == img->w && src.new_h == img->h);
+ PL_TRACE(rr, "Skipping main scaler (would be no-op)");
+ goto done;
+ }
+
+ if (info.type == SAMPLER_DIRECT && !need_fbo) {
+ img->w = src.new_w;
+ img->h = src.new_h;
+ img->rect = new_rect;
+ PL_TRACE(rr, "Skipping main scaler (free sampling)");
+ goto done;
+ }
+
+ // Hard-disable both sigmoidization and linearization when required
+ if (params->disable_linear_scaling || fbofmt->component_depth[0] < 16)
+ use_sigmoid = use_linear = false;
+
+ // Avoid sigmoidization for HDR content because it clips to [0,1], and
+ // linearization because it causes very nasty ringing artefacts.
+ if (pl_color_space_is_hdr(&img->color))
+ use_sigmoid = use_linear = false;
+
+ if (!(use_linear || use_sigmoid) && img->color.transfer == PL_COLOR_TRC_LINEAR) {
+ img->color.transfer = image->color.transfer;
+ if (image->color.transfer == PL_COLOR_TRC_LINEAR)
+ img->color.transfer = PL_COLOR_TRC_GAMMA22; // arbitrary fallback
+ pl_shader_delinearize(img_sh(pass, img), &img->color);
+ }
+
+ if (use_linear || use_sigmoid) {
+ pl_shader_linearize(img_sh(pass, img), &img->color);
+ img->color.transfer = PL_COLOR_TRC_LINEAR;
+ pass_hook(pass, img, PL_HOOK_LINEAR);
+ }
+
+ if (use_sigmoid) {
+ pl_shader_sigmoidize(img_sh(pass, img), params->sigmoid_params);
+ pass_hook(pass, img, PL_HOOK_SIGMOID);
+ }
+
+ pass_hook(pass, img, PL_HOOK_PRE_KERNEL);
+
+ src.tex = img_tex(pass, img);
+ if (!src.tex)
+ return false;
+ pass->need_peak_fbo = false;
+
+ pl_shader sh = pl_dispatch_begin_ex(rr->dp, true);
+ dispatch_sampler(pass, sh, &rr->sampler_main, SAMPLER_MAIN, NULL, &src);
+ img->tex = NULL;
+ img->sh = sh;
+ img->w = src.new_w;
+ img->h = src.new_h;
+ img->rect = new_rect;
+
+ pass_hook(pass, img, PL_HOOK_POST_KERNEL);
+
+ if (use_sigmoid)
+ pl_shader_unsigmoidize(img_sh(pass, img), params->sigmoid_params);
+
+done:
+ if (info.dir != SAMPLER_UP)
+ hdr_update_peak(pass);
+ pass_hook(pass, img, PL_HOOK_SCALED);
+ return true;
+}
+
+static pl_tex get_feature_map(struct pass_state *pass)
+{
+ const struct pl_render_params *params = pass->params;
+ pl_renderer rr = pass->rr;
+ const struct pl_color_map_params *cparams = params->color_map_params;
+ cparams = PL_DEF(cparams, &pl_color_map_default_params);
+ if (!cparams->contrast_recovery || cparams->contrast_smoothness <= 1)
+ return NULL;
+ if (!pass->fbofmt[4])
+ return NULL;
+ if (!pl_color_space_is_hdr(&pass->img.color))
+ return NULL;
+ if (rr->errors & (PL_RENDER_ERR_SAMPLING | PL_RENDER_ERR_CONTRAST_RECOVERY))
+ return NULL;
+ if (pass->img.color.hdr.max_luma <= pass->target.color.hdr.max_luma + 1e-6)
+ return NULL; // no adaptation needed
+ if (params->lut && params->lut_type == PL_LUT_CONVERSION)
+ return NULL; // LUT handles tone mapping
+
+ struct img *img = &pass->img;
+ if (!img_tex(pass, img))
+ return NULL;
+
+ const float ratio = cparams->contrast_smoothness;
+ const int cr_w = ceilf(abs(pl_rect_w(pass->dst_rect)) / ratio);
+ const int cr_h = ceilf(abs(pl_rect_h(pass->dst_rect)) / ratio);
+ pl_tex inter_tex = get_fbo(pass, img->w, img->h, NULL, 1, PL_DEBUG_TAG);
+ pl_tex out_tex = get_fbo(pass, cr_w, cr_h, NULL, 1, PL_DEBUG_TAG);
+ if (!inter_tex || !out_tex)
+ goto error;
+
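+ // First pass: extract the luma feature map at the image's resolution.
+ // Second pass: downscale it to the output size divided by
+ // `contrast_smoothness`.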
+ pl_shader sh = pl_dispatch_begin(rr->dp);
+ pl_shader_sample_direct(sh, pl_sample_src( .tex = img->tex ));
+ pl_shader_extract_features(sh, img->color);
+ bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params(
+ .shader = &sh,
+ .target = inter_tex,
+ ));
+ if (!ok)
+ goto error;
+
+ const struct pl_sample_src src = {
+ .tex = inter_tex,
+ .rect = img->rect,
+ .address_mode = PL_TEX_ADDRESS_MIRROR,
+ .components = 1,
+ .new_w = cr_w,
+ .new_h = cr_h,
+ };
+
+ sh = pl_dispatch_begin(rr->dp);
+ dispatch_sampler(pass, sh, &rr->sampler_contrast, SAMPLER_CONTRAST, out_tex, &src);
+ ok = pl_dispatch_finish(rr->dp, pl_dispatch_params(
+ .shader = &sh,
+ .target = out_tex,
+ ));
+ if (!ok)
+ goto error;
+
+ return out_tex;
+
+error:
+ PL_ERR(rr, "Failed extracting luma for contrast recovery, disabling");
+ rr->errors |= PL_RENDER_ERR_CONTRAST_RECOVERY;
+ return NULL;
+}
+
+// Transforms image into the output color space (tone-mapping, ICC 3DLUT, etc)
+static void pass_convert_colors(struct pass_state *pass)
+{
+ const struct pl_render_params *params = pass->params;
+ const struct pl_frame *image = &pass->image;
+ const struct pl_frame *target = &pass->target;
+ pl_renderer rr = pass->rr;
+
+ struct img *img = &pass->img;
+ pl_shader sh = img_sh(pass, img);
+
+ bool prelinearized = false;
+ bool need_conversion = true;
+ assert(image->color.primaries == img->color.primaries);
+ if (img->color.transfer == PL_COLOR_TRC_LINEAR) {
+ if (img->repr.alpha == PL_ALPHA_PREMULTIPLIED) {
+ // Very annoying edge case: since prelinearization happens with
+ // premultiplied alpha, but color mapping happens with independent
+ // alpha, we need to go back to non-linear representation *before*
+ // alpha mode conversion, to avoid distortion
+ img->color.transfer = image->color.transfer;
+ pl_shader_delinearize(sh, &img->color);
+ } else {
+ prelinearized = true;
+ }
+ } else if (img->color.transfer != image->color.transfer) {
+ if (image->color.transfer == PL_COLOR_TRC_LINEAR) {
+ // Another annoying edge case: if the input is linear light, but we
+ // decide to un-linearize it for scaling purposes, we need to
+ // re-linearize before passing it into `pl_shader_color_map`
+ pl_shader_linearize(sh, &img->color);
+ img->color.transfer = PL_COLOR_TRC_LINEAR;
+ }
+ }
+
+ // Do all processing in independent alpha, to avoid nonlinear distortions
+ pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_INDEPENDENT);
+
+ // Apply color blindness simulation if requested
+ if (params->cone_params)
+ pl_shader_cone_distort(sh, img->color, params->cone_params);
+
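+ // Apply a custom LUT, if any. Depending on the LUT type, it either
+ // operates in the image's native space, replaces the conversion to the
+ // output space entirely, or expects linear input normalized to the
+ // nominal peak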
+ if (params->lut) {
+ struct pl_color_space lut_in = params->lut->color_in;
+ struct pl_color_space lut_out = params->lut->color_out;
+ switch (params->lut_type) {
+ case PL_LUT_UNKNOWN:
+ case PL_LUT_NATIVE:
+ pl_color_space_merge(&lut_in, &image->color);
+ pl_color_space_merge(&lut_out, &image->color);
+ break;
+ case PL_LUT_CONVERSION:
+ pl_color_space_merge(&lut_in, &image->color);
+ need_conversion = false; // conversion LUT takes the highest priority
+ break;
+ case PL_LUT_NORMALIZED:
+ if (!prelinearized) {
+ // PL_LUT_NORMALIZED wants linear input data
+ pl_shader_linearize(sh, &img->color);
+ img->color.transfer = PL_COLOR_TRC_LINEAR;
+ prelinearized = true;
+ }
+ pl_color_space_merge(&lut_in, &img->color);
+ pl_color_space_merge(&lut_out, &img->color);
+ break;
+ }
+
+ pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args(
+ .src = image->color,
+ .dst = lut_in,
+ .prelinearized = prelinearized,
+ ));
+
+ if (params->lut_type == PL_LUT_NORMALIZED) {
+ GLSLF("color.rgb *= vec3(1.0/"$"); \n",
+ SH_FLOAT(pl_color_transfer_nominal_peak(lut_in.transfer)));
+ }
+
+ pl_shader_custom_lut(sh, params->lut, &rr->lut_state[LUT_PARAMS]);
+
+ if (params->lut_type == PL_LUT_NORMALIZED) {
+ GLSLF("color.rgb *= vec3("$"); \n",
+ SH_FLOAT(pl_color_transfer_nominal_peak(lut_out.transfer)));
+ }
+
+ if (params->lut_type != PL_LUT_CONVERSION) {
+ pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args(
+ .src = lut_out,
+ .dst = img->color,
+ ));
+ }
+ }
+
+ if (need_conversion) {
+ struct pl_color_space target_csp = target->color;
+ if (target->icc)
+ target_csp.transfer = PL_COLOR_TRC_LINEAR;
+
+ if (pass->need_peak_fbo && !img_tex(pass, img))
+ return;
+
+ // generate HDR feature map if required
+ pl_tex feature_map = get_feature_map(pass);
+ sh = img_sh(pass, img); // `get_feature_map` dispatches previous shader
+
+ // current -> target
+ pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args(
+ .src = image->color,
+ .dst = target_csp,
+ .prelinearized = prelinearized,
+ .state = &rr->tone_map_state,
+ .feature_map = feature_map,
+ ));
+
+ if (target->icc)
+ pl_icc_encode(sh, target->icc, &rr->icc_state[ICC_TARGET]);
+ }
+
+ enum pl_lut_type lut_type = guess_frame_lut_type(target, true);
+ if (lut_type == PL_LUT_NORMALIZED || lut_type == PL_LUT_CONVERSION)
+ pl_shader_custom_lut(sh, target->lut, &rr->lut_state[LUT_TARGET]);
+
+ img->color = target->color;
+}
+
+// Returns true if error diffusion was successfully performed
+static bool pass_error_diffusion(struct pass_state *pass, pl_shader *sh,
+ int new_depth, int comps, int out_w, int out_h)
+{
+ const struct pl_render_params *params = pass->params;
+ pl_renderer rr = pass->rr;
+ if (!params->error_diffusion || (rr->errors & PL_RENDER_ERR_ERROR_DIFFUSION))
+ return false;
+
+ size_t shmem_req = pl_error_diffusion_shmem_req(params->error_diffusion, out_h);
+ if (shmem_req > rr->gpu->glsl.max_shmem_size) {
+ PL_TRACE(rr, "Disabling error diffusion due to shmem requirements (%zu) "
+ "exceeding capabilities (%zu)", shmem_req, rr->gpu->glsl.max_shmem_size);
+ return false;
+ }
+
+ pl_fmt fmt = pass->fbofmt[comps];
+ if (!fmt || !(fmt->caps & PL_FMT_CAP_STORABLE)) {
+ PL_ERR(rr, "Error diffusion requires storable FBOs but GPU does not "
+ "provide them... disabling!");
+ goto error;
+ }
+
+ struct pl_error_diffusion_params edpars = {
+ .new_depth = new_depth,
+ .kernel = params->error_diffusion,
+ };
+
+ // Create temporary framebuffers
+ edpars.input_tex = get_fbo(pass, out_w, out_h, fmt, comps, PL_DEBUG_TAG);
+ edpars.output_tex = get_fbo(pass, out_w, out_h, fmt, comps, PL_DEBUG_TAG);
+ if (!edpars.input_tex || !edpars.output_tex)
+ goto error;
+
+ pl_shader dsh = pl_dispatch_begin(rr->dp);
+ if (!pl_shader_error_diffusion(dsh, &edpars)) {
+ pl_dispatch_abort(rr->dp, &dsh);
+ goto error;
+ }
+
+ // Everything was okay, run the shaders
+ bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params(
+ .shader = sh,
+ .target = edpars.input_tex,
+ ));
+
+ if (ok) {
+ ok = pl_dispatch_compute(rr->dp, pl_dispatch_compute_params(
+ .shader = &dsh,
+ .dispatch_size = {1, 1, 1},
+ ));
+ }
+
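+ // Resume with a fresh shader that samples the error-diffused result,
+ // or the undiffused input if the compute pass failed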
+ *sh = pl_dispatch_begin(rr->dp);
+ pl_shader_sample_direct(*sh, pl_sample_src(
+ .tex = ok ? edpars.output_tex : edpars.input_tex,
+ ));
+ return ok;
+
+error:
+ rr->errors |= PL_RENDER_ERR_ERROR_DIFFUSION;
+ return false;
+}
+
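+// Expands the render params' background color and transparency into an RGBA
+// clear color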
+#define CLEAR_COL(params) \
+ (float[4]) { \
+ (params)->background_color[0], \
+ (params)->background_color[1], \
+ (params)->background_color[2], \
+ 1.0 - (params)->background_transparency, \
+ }
+
+static bool pass_output_target(struct pass_state *pass)
+{
+ const struct pl_render_params *params = pass->params;
+ const struct pl_frame *image = &pass->image;
+ const struct pl_frame *target = &pass->target;
+ pl_renderer rr = pass->rr;
+
+ struct img *img = &pass->img;
+ pl_shader sh = img_sh(pass, img);
+
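+ // Corner rounding: evaluate a signed distance to a rounded rectangle in
+ // output space and fade the alpha across a small border region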
+ if (params->corner_rounding > 0.0f) {
+ const float out_w2 = fabsf(pl_rect_w(target->crop)) / 2.0f;
+ const float out_h2 = fabsf(pl_rect_h(target->crop)) / 2.0f;
+ const float radius = fminf(params->corner_rounding, 1.0f) *
+ fminf(out_w2, out_h2);
+ const struct pl_rect2df relpos = {
+ .x0 = -out_w2, .y0 = -out_h2,
+ .x1 = out_w2, .y1 = out_h2,
+ };
+ GLSL("float radius = "$"; \n"
+ "vec2 size2 = vec2("$", "$"); \n"
+ "vec2 relpos = "$"; \n"
+ "vec2 rd = abs(relpos) - size2 + vec2(radius); \n"
+ "float rdist = length(max(rd, 0.0)) - radius; \n"
+ "float border = smoothstep(2.0f, 0.0f, rdist); \n",
+ SH_FLOAT_DYN(radius),
+ SH_FLOAT_DYN(out_w2), SH_FLOAT_DYN(out_h2),
+ sh_attr_vec2(sh, "relpos", &relpos));
+
+ switch (img->repr.alpha) {
+ case PL_ALPHA_UNKNOWN:
+ GLSL("color.a = border; \n");
+ img->repr.alpha = PL_ALPHA_INDEPENDENT;
+ img->comps = 4;
+ break;
+ case PL_ALPHA_INDEPENDENT:
+ GLSL("color.a *= border; \n");
+ break;
+ case PL_ALPHA_PREMULTIPLIED:
+ GLSL("color *= border; \n");
+ break;
+ case PL_ALPHA_MODE_COUNT:
+ pl_unreachable();
+ }
+ }
+
+ const struct pl_plane *ref = &target->planes[pass->dst_ref];
+ pl_rect2d dst_rect = pass->dst_rect;
+ if (params->distort_params) {
+ struct pl_distort_params dpars = *params->distort_params;
+ if (dpars.alpha_mode) {
+ pl_shader_set_alpha(sh, &img->repr, dpars.alpha_mode);
+ img->repr.alpha = dpars.alpha_mode;
+ img->comps = 4;
+ }
+ pl_tex tex = img_tex(pass, img);
+ if (!tex)
+ return false;
+ // Expand canvas to fit result of distortion
+ const float ar = pl_rect2df_aspect(&target->crop);
+ const float sx = fminf(ar, 1.0f);
+ const float sy = fminf(1.0f / ar, 1.0f);
+ pl_rect2df bb = pl_transform2x2_bounds(&dpars.transform, &(pl_rect2df) {
+ .x0 = -sx, .x1 = sx,
+ .y0 = -sy, .y1 = sy,
+ });
+
+ // Clamp to output size and adjust as needed when constraining output
+ pl_rect2df tmp = target->crop;
+ pl_rect2df_stretch(&tmp, pl_rect_w(bb) / (2*sx), pl_rect_h(bb) / (2*sy));
+ const float tmp_w = pl_rect_w(tmp), tmp_h = pl_rect_h(tmp);
+ int canvas_w = ref->texture->params.w,
+ canvas_h = ref->texture->params.h;
+ if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90)
+ PL_SWAP(canvas_w, canvas_h);
+ tmp.x0 = PL_CLAMP(tmp.x0, 0.0f, canvas_w);
+ tmp.x1 = PL_CLAMP(tmp.x1, 0.0f, canvas_w);
+ tmp.y0 = PL_CLAMP(tmp.y0, 0.0f, canvas_h);
+ tmp.y1 = PL_CLAMP(tmp.y1, 0.0f, canvas_h);
+ if (dpars.constrain) {
+ const float rx = pl_rect_w(tmp) / tmp_w;
+ const float ry = pl_rect_h(tmp) / tmp_h;
+ pl_rect2df_stretch(&tmp, fminf(ry / rx, 1.0f), fminf(rx / ry, 1.0f));
+ }
+ dst_rect.x0 = roundf(tmp.x0);
+ dst_rect.x1 = roundf(tmp.x1);
+ dst_rect.y0 = roundf(tmp.y0);
+ dst_rect.y1 = roundf(tmp.y1);
+ dpars.unscaled = true;
+ img->w = abs(pl_rect_w(dst_rect));
+ img->h = abs(pl_rect_h(dst_rect));
+ img->tex = NULL;
+ img->sh = sh = pl_dispatch_begin(rr->dp);
+ pl_shader_distort(sh, tex, img->w, img->h, &dpars);
+ }
+
+ pass_hook(pass, img, PL_HOOK_PRE_OUTPUT);
+
+ bool need_blend = params->blend_against_tiles ||
+ (!target->repr.alpha && !params->blend_params);
+ if (img->comps == 4 && need_blend) {
+ if (params->blend_against_tiles) {
+ static const float zero[2][3] = {0};
+ const float (*color)[3] = params->tile_colors;
+ if (memcmp(color, zero, sizeof(zero)) == 0)
+ color = pl_render_default_params.tile_colors;
+ int size = PL_DEF(params->tile_size, pl_render_default_params.tile_size);
+ GLSLH("#define bg_tile_a vec3("$", "$", "$") \n",
+ SH_FLOAT(color[0][0]), SH_FLOAT(color[0][1]), SH_FLOAT(color[0][2]));
+ GLSLH("#define bg_tile_b vec3("$", "$", "$") \n",
+ SH_FLOAT(color[1][0]), SH_FLOAT(color[1][1]), SH_FLOAT(color[1][2]));
+ GLSL("vec2 outcoord = gl_FragCoord.xy * "$"; \n"
+ "bvec2 tile = lessThan(fract(outcoord), vec2(0.5)); \n"
+ "vec3 bg_color = tile.x == tile.y ? bg_tile_a : bg_tile_b; \n",
+ SH_FLOAT(1.0 / size));
+ } else {
+ GLSLH("#define bg_color vec3("$", "$", "$") \n",
+ SH_FLOAT(params->background_color[0]),
+ SH_FLOAT(params->background_color[1]),
+ SH_FLOAT(params->background_color[2]));
+ }
+
+ pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_PREMULTIPLIED);
+ GLSL("color = vec4(color.rgb + bg_color * (1.0 - color.a), 1.0); \n");
+ img->repr.alpha = PL_ALPHA_UNKNOWN;
+ img->comps = 3;
+ }
+
+ // Apply the color scale separately, after encoding is done, to make sure
+ // that the intermediate FBO (if any) has the correct precision.
+ struct pl_color_repr repr = target->repr;
+ float scale = pl_color_repr_normalize(&repr);
+ enum pl_lut_type lut_type = guess_frame_lut_type(target, true);
+ if (lut_type != PL_LUT_CONVERSION)
+ pl_shader_encode_color(sh, &repr);
+ if (lut_type == PL_LUT_NATIVE) {
+ pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_INDEPENDENT);
+ pl_shader_custom_lut(sh, target->lut, &rr->lut_state[LUT_TARGET]);
+ pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_PREMULTIPLIED);
+ }
+
+ // Rotation handling
+ if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) {
+ PL_SWAP(dst_rect.x0, dst_rect.y0);
+ PL_SWAP(dst_rect.x1, dst_rect.y1);
+ PL_SWAP(img->w, img->h);
+ sh->transpose = true;
+ }
+
+ pass_hook(pass, img, PL_HOOK_OUTPUT);
+ sh = NULL;
+
+ bool flipped_x = dst_rect.x1 < dst_rect.x0,
+ flipped_y = dst_rect.y1 < dst_rect.y0;
+
+ if (!params->skip_target_clearing && pl_frame_is_cropped(target))
+ pl_frame_clear_rgba(rr->gpu, target, CLEAR_COL(params));
+
+ for (int p = 0; p < target->num_planes; p++) {
+ const struct pl_plane *plane = &target->planes[p];
+ float rx = (float) plane->texture->params.w / ref->texture->params.w,
+ ry = (float) plane->texture->params.h / ref->texture->params.h;
+
+ // Only accept integer scaling ratios. This accounts for the fact
+ // that fractionally subsampled planes get rounded up to the
+ // nearest integer size, which we want to over-render.
+ float rrx = rx >= 1 ? roundf(rx) : 1.0 / roundf(1.0 / rx),
+ rry = ry >= 1 ? roundf(ry) : 1.0 / roundf(1.0 / ry);
+ float sx = plane->shift_x, sy = plane->shift_y;
+
+ pl_rect2df plane_rectf = {
+ .x0 = (dst_rect.x0 - sx) * rrx,
+ .y0 = (dst_rect.y0 - sy) * rry,
+ .x1 = (dst_rect.x1 - sx) * rrx,
+ .y1 = (dst_rect.y1 - sy) * rry,
+ };
+
+ // Normalize to make the math easier
+ pl_rect2df_normalize(&plane_rectf);
+
+ // Round the output rect
+ int rx0 = floorf(plane_rectf.x0), ry0 = floorf(plane_rectf.y0),
+ rx1 = ceilf(plane_rectf.x1), ry1 = ceilf(plane_rectf.y1);
+
+ PL_TRACE(rr, "Subsampled target %d: {%f %f %f %f} -> {%d %d %d %d}",
+ p, plane_rectf.x0, plane_rectf.y0,
+ plane_rectf.x1, plane_rectf.y1,
+ rx0, ry0, rx1, ry1);
+
+ if (target->num_planes > 1) {
+
+ // Planar output, so we need to sample from an intermediate FBO
+ struct pl_sample_src src = {
+ .tex = img_tex(pass, img),
+ .new_w = rx1 - rx0,
+ .new_h = ry1 - ry0,
+ .rect = {
+ .x0 = (rx0 - plane_rectf.x0) / rrx,
+ .x1 = (rx1 - plane_rectf.x0) / rrx,
+ .y0 = (ry0 - plane_rectf.y0) / rry,
+ .y1 = (ry1 - plane_rectf.y0) / rry,
+ },
+ };
+
+ if (!src.tex) {
+ PL_ERR(rr, "Output requires multiple planes, but FBOs are "
+ "unavailable. This combination is unsupported.");
+ return false;
+ }
+
+ PL_TRACE(rr, "Sampling %dx%d img aligned from {%f %f %f %f}",
+ pass->img.w, pass->img.h,
+ src.rect.x0, src.rect.y0,
+ src.rect.x1, src.rect.y1);
+
+ for (int c = 0; c < plane->components; c++) {
+ if (plane->component_mapping[c] < 0)
+ continue;
+ src.component_mask |= 1 << plane->component_mapping[c];
+ }
+
+ sh = pl_dispatch_begin(rr->dp);
+ dispatch_sampler(pass, sh, &rr->samplers_dst[p], SAMPLER_PLANE,
+ plane->texture, &src);
+
+ } else {
+
+ // Single plane, so we can directly re-use the img shader unless
+ // it's incompatible with the FBO capabilities
+ bool is_comp = pl_shader_is_compute(img_sh(pass, img));
+ if (is_comp && !plane->texture->params.storable) {
+ if (!img_tex(pass, img)) {
+ PL_ERR(rr, "Rendering requires compute shaders, but output "
+ "is not storable, and FBOs are unavailable. This "
+ "combination is unsupported.");
+ return false;
+ }
+ }
+
+ sh = img_sh(pass, img);
+ img->sh = NULL;
+
+ }
+
+ // Ignore dithering for outputs of 16 bits or more by default, since it
+ // makes little sense to do so (and probably just adds errors)
+ int depth = target->repr.bits.color_depth, applied_dither = 0;
+ if (depth && (depth < 16 || params->force_dither)) {
+ if (pass_error_diffusion(pass, &sh, depth, plane->components,
+ rx1 - rx0, ry1 - ry0))
+ {
+ applied_dither = depth;
+ } else if (params->dither_params) {
+ struct pl_dither_params dparams = *params->dither_params;
+ if (!params->disable_dither_gamma_correction)
+ dparams.transfer = target->color.transfer;
+ pl_shader_dither(sh, depth, &rr->dither_state, &dparams);
+ applied_dither = depth;
+ }
+ }
+
+ if (applied_dither != rr->prev_dither) {
+ if (applied_dither) {
+ PL_INFO(rr, "Dithering to %d bit depth", applied_dither);
+ } else {
+ PL_INFO(rr, "Dithering disabled");
+ }
+ rr->prev_dither = applied_dither;
+ }
+
+ GLSL("color *= vec4(1.0 / "$"); \n", SH_FLOAT(scale));
+ swizzle_color(sh, plane->components, plane->component_mapping,
+ params->blend_params);
+
+ pl_rect2d plane_rect = {
+ .x0 = flipped_x ? rx1 : rx0,
+ .x1 = flipped_x ? rx0 : rx1,
+ .y0 = flipped_y ? ry1 : ry0,
+ .y1 = flipped_y ? ry0 : ry1,
+ };
+
+ pl_transform2x2 tscale = {
+ .mat = {{{ rrx, 0.0 }, { 0.0, rry }}},
+ .c = { -sx, -sy },
+ };
+
+ if (plane->flipped) {
+ int plane_h = rry * ref->texture->params.h;
+ plane_rect.y0 = plane_h - plane_rect.y0;
+ plane_rect.y1 = plane_h - plane_rect.y1;
+ tscale.mat.m[1][1] = -tscale.mat.m[1][1];
+ tscale.c[1] += plane->texture->params.h;
+ }
+
+ bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params(
+ .shader = &sh,
+ .target = plane->texture,
+ .blend_params = params->blend_params,
+ .rect = plane_rect,
+ ));
+
+ if (!ok)
+ return false;
+
+ if (pass->info.stage != PL_RENDER_STAGE_BLEND) {
+ draw_overlays(pass, plane->texture, plane->components,
+ plane->component_mapping, image->overlays,
+ image->num_overlays, target->color, target->repr,
+ &tscale);
+ }
+
+ draw_overlays(pass, plane->texture, plane->components,
+ plane->component_mapping, target->overlays,
+ target->num_overlays, target->color, target->repr,
+ &tscale);
+ }
+
+ *img = (struct img) {0};
+ return true;
+}
+
+#define require(expr) pl_require(rr, expr)
+#define validate_plane(plane, param) \
+ do { \
+ require((plane).texture); \
+ require((plane).texture->params.param); \
+ require((plane).components > 0 && (plane).components <= 4); \
+ for (int c = 0; c < (plane).components; c++) { \
+ require((plane).component_mapping[c] >= PL_CHANNEL_NONE && \
+ (plane).component_mapping[c] <= PL_CHANNEL_A); \
+ } \
+ } while (0)
+
+#define validate_overlay(overlay) \
+ do { \
+ require((overlay).tex); \
+ require((overlay).tex->params.sampleable); \
+ require((overlay).num_parts >= 0); \
+ for (int n = 0; n < (overlay).num_parts; n++) { \
+ const struct pl_overlay_part *p = &(overlay).parts[n]; \
+ require(pl_rect_w(p->dst) && pl_rect_h(p->dst)); \
+ } \
+ } while (0)
+
+#define validate_deinterlace_ref(image, ref) \
+ do { \
+ require((image)->num_planes == (ref)->num_planes); \
+ const struct pl_tex_params *imgp, *refp; \
+ for (int p = 0; p < (image)->num_planes; p++) { \
+ validate_plane((ref)->planes[p], sampleable); \
+ imgp = &(image)->planes[p].texture->params; \
+ refp = &(ref)->planes[p].texture->params; \
+ require(imgp->w == refp->w); \
+ require(imgp->h == refp->h); \
+ require(imgp->format->num_components == refp->format->num_components);\
+ } \
+ } while (0)
+
+// Perform some basic validity checks on incoming structs to help catch invalid
+// API usage. This is not an exhaustive check. In particular, enums are not
+// bounds checked. This is because most functions accepting enums already
+// abort() in the default case, and because it's not the intent of this check
+// to catch all instances of memory corruption - just common logic bugs.
+static bool validate_structs(pl_renderer rr,
+ const struct pl_frame *image,
+ const struct pl_frame *target)
+{
+ // Rendering to/from a frame with no planes is technically allowed, but so
+ // pointless that it's more likely to be a user error worth catching.
+ require(target->num_planes > 0 && target->num_planes <= PL_MAX_PLANES);
+ for (int i = 0; i < target->num_planes; i++)
+ validate_plane(target->planes[i], renderable);
+ require(!pl_rect_w(target->crop) == !pl_rect_h(target->crop));
+ require(target->num_overlays >= 0);
+ for (int i = 0; i < target->num_overlays; i++)
+ validate_overlay(target->overlays[i]);
+
+ if (!image)
+ return true;
+
+ require(image->num_planes > 0 && image->num_planes <= PL_MAX_PLANES);
+ for (int i = 0; i < image->num_planes; i++)
+ validate_plane(image->planes[i], sampleable);
+ require(!pl_rect_w(image->crop) == !pl_rect_h(image->crop));
+ require(image->num_overlays >= 0);
+ for (int i = 0; i < image->num_overlays; i++)
+ validate_overlay(image->overlays[i]);
+
+ if (image->field != PL_FIELD_NONE) {
+ require(image->first_field != PL_FIELD_NONE);
+ if (image->prev)
+ validate_deinterlace_ref(image, image->prev);
+ if (image->next)
+ validate_deinterlace_ref(image, image->next);
+ }
+
+ return true;
+
+error:
+ return false;
+}
+
+// Returns the index of the frame's reference plane (the first RGB/luma/XYZ
+// plane, falling back to 0)
+static int frame_ref(const struct pl_frame *frame)
+{
+ pl_assert(frame->num_planes);
+ for (int i = 0; i < frame->num_planes; i++) {
+ switch (detect_plane_type(&frame->planes[i], &frame->repr)) {
+ case PLANE_RGB:
+ case PLANE_LUMA:
+ case PLANE_XYZ:
+ return i;
+ case PLANE_CHROMA:
+ case PLANE_ALPHA:
+ continue;
+ case PLANE_INVALID:
+ pl_unreachable();
+ }
+ }
+
+ return 0;
+}
+
+static void fix_refs_and_rects(struct pass_state *pass)
+{
+ struct pl_frame *target = &pass->target;
+ pl_rect2df *dst = &target->crop;
+ pass->dst_ref = frame_ref(target);
+ pl_tex dst_ref = target->planes[pass->dst_ref].texture;
+ int dst_w = dst_ref->params.w, dst_h = dst_ref->params.h;
+
+ if ((!dst->x0 && !dst->x1) || (!dst->y0 && !dst->y1)) {
+ dst->x1 = dst_w;
+ dst->y1 = dst_h;
+ }
+
+ if (pass->src_ref < 0) {
+ // Simplified version of the below code which only rounds the target
+ // rect but doesn't retroactively apply the crop to the image
+ pass->rotation = pl_rotation_normalize(-target->rotation);
+ pl_rect2df_rotate(dst, -pass->rotation);
+ if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90)
+ PL_SWAP(dst_w, dst_h);
+
+ *dst = (pl_rect2df) {
+ .x0 = roundf(PL_CLAMP(dst->x0, 0.0, dst_w)),
+ .y0 = roundf(PL_CLAMP(dst->y0, 0.0, dst_h)),
+ .x1 = roundf(PL_CLAMP(dst->x1, 0.0, dst_w)),
+ .y1 = roundf(PL_CLAMP(dst->y1, 0.0, dst_h)),
+ };
+
+ pass->dst_rect = (pl_rect2d) {
+ dst->x0, dst->y0, dst->x1, dst->y1,
+ };
+
+ return;
+ }
+
+ struct pl_frame *image = &pass->image;
+ pl_rect2df *src = &image->crop;
+ pass->src_ref = frame_ref(image);
+ pl_tex src_ref = image->planes[pass->src_ref].texture;
+
+ if ((!src->x0 && !src->x1) || (!src->y0 && !src->y1)) {
+ src->x1 = src_ref->params.w;
+ src->y1 = src_ref->params.h;
+ }
+
+ // Compute end-to-end rotation
+ pass->rotation = pl_rotation_normalize(image->rotation - target->rotation);
+ pl_rect2df_rotate(dst, -pass->rotation); // normalize by counter-rotating
+ if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90)
+ PL_SWAP(dst_w, dst_h);
+
+ // Keep track of whether the end-to-end rendering is flipped
+ bool flipped_x = (src->x0 > src->x1) != (dst->x0 > dst->x1),
+ flipped_y = (src->y0 > src->y1) != (dst->y0 > dst->y1);
+
+ // Normalize both rects to make the math easier
+ pl_rect2df_normalize(src);
+ pl_rect2df_normalize(dst);
+
+ // Round the output rect and clip it to the framebuffer dimensions
+ float rx0 = roundf(PL_CLAMP(dst->x0, 0.0, dst_w)),
+ ry0 = roundf(PL_CLAMP(dst->y0, 0.0, dst_h)),
+ rx1 = roundf(PL_CLAMP(dst->x1, 0.0, dst_w)),
+ ry1 = roundf(PL_CLAMP(dst->y1, 0.0, dst_h));
+
+ // Adjust the src rect corresponding to the rounded crop
+ float scale_x = pl_rect_w(*src) / pl_rect_w(*dst),
+ scale_y = pl_rect_h(*src) / pl_rect_h(*dst),
+ base_x = src->x0,
+ base_y = src->y0;
+
+ src->x0 = base_x + (rx0 - dst->x0) * scale_x;
+ src->x1 = base_x + (rx1 - dst->x0) * scale_x;
+ src->y0 = base_y + (ry0 - dst->y0) * scale_y;
+ src->y1 = base_y + (ry1 - dst->y0) * scale_y;
+
+ // Update dst_rect to the rounded values and re-apply flip if needed. We
+ // always do this in the `dst` rather than the `src` because this allows
+ // e.g. polar sampling compute shaders to work.
+ *dst = (pl_rect2df) {
+ .x0 = flipped_x ? rx1 : rx0,
+ .y0 = flipped_y ? ry1 : ry0,
+ .x1 = flipped_x ? rx0 : rx1,
+ .y1 = flipped_y ? ry0 : ry1,
+ };
+
+ // Copies of the above, for convenience
+ pass->ref_rect = *src;
+ pass->dst_rect = (pl_rect2d) {
+ dst->x0, dst->y0, dst->x1, dst->y1,
+ };
+}
+
+static void fix_frame(struct pl_frame *frame)
+{
+ pl_tex tex = frame->planes[frame_ref(frame)].texture;
+
+ if (frame->repr.sys == PL_COLOR_SYSTEM_XYZ) {
+ // XYZ is implicitly converted to linear DCI-P3 in pl_color_repr_decode
+ frame->color.primaries = PL_COLOR_PRIM_DCI_P3;
+ frame->color.transfer = PL_COLOR_TRC_ST428;
+ }
+
+ // If the primaries are not known, guess them based on the resolution
+ if (tex && !frame->color.primaries)
+ frame->color.primaries = pl_color_primaries_guess(tex->params.w, tex->params.h);
+
+ // For UNORM formats, we can infer the sampled bit depth from the texture
+ // itself. This is ignored for other format types, because the logic
+ // doesn't really work out for them anyway, and it's best not to do
+ // anything too crazy unless the user provides explicit details.
+ struct pl_bit_encoding *bits = &frame->repr.bits;
+ if (!bits->sample_depth && tex && tex->params.format->type == PL_FMT_UNORM) {
+ // Just assume the first component's depth is canonical. This works in
+ // practice, since for cases like rgb565 we want to use the lower depth
+ // anyway. Plus, every format has at least one component.
+ bits->sample_depth = tex->params.format->component_depth[0];
+
+ // If we don't know the color depth, assume it spans the full range of
+ // the texture. Otherwise, clamp it to the texture depth.
+ bits->color_depth = PL_DEF(bits->color_depth, bits->sample_depth);
+ bits->color_depth = PL_MIN(bits->color_depth, bits->sample_depth);
+
+ // If the texture depth is higher than the known color depth, assume
+ // the colors were left-shifted.
+ bits->bit_shift += bits->sample_depth - bits->color_depth;
+ }
+}
+
+static bool acquire_frame(struct pass_state *pass, struct pl_frame *frame,
+ bool *acquired)
+{
+ if (!frame || !frame->acquire || *acquired)
+ return true;
+
+ *acquired = true;
+ return frame->acquire(pass->rr->gpu, frame);
+}
+
+static void release_frame(struct pass_state *pass, struct pl_frame *frame,
+ bool *acquired)
+{
+ if (frame && frame->release && *acquired)
+ frame->release(pass->rr->gpu, frame);
+ *acquired = false;
+}
+
+static void pass_uninit(struct pass_state *pass)
+{
+ pl_renderer rr = pass->rr;
+ pl_dispatch_abort(rr->dp, &pass->img.sh);
+ release_frame(pass, &pass->next, &pass->acquired.next);
+ release_frame(pass, &pass->prev, &pass->acquired.prev);
+ release_frame(pass, &pass->image, &pass->acquired.image);
+ release_frame(pass, &pass->target, &pass->acquired.target);
+ pl_free_ptr(&pass->tmp);
+}
+
+static void icc_fallback(struct pass_state *pass, struct pl_frame *frame,
+ struct icc_state *fallback)
+{
+ if (!frame || frame->icc || !frame->profile.data)
+ return;
+
+ // Don't re-attempt opening already failed profiles
+ if (fallback->error && fallback->error == frame->profile.signature)
+ return;
+
+#ifdef PL_HAVE_LCMS
+ pl_renderer rr = pass->rr;
+ if (pl_icc_update(rr->log, &fallback->icc, &frame->profile, NULL)) {
+ frame->icc = fallback->icc;
+ } else {
+ PL_WARN(rr, "Failed opening ICC profile... ignoring");
+ fallback->error = frame->profile.signature;
+ }
+#endif
+}
+
+static void pass_fix_frames(struct pass_state *pass)
+{
+ pl_renderer rr = pass->rr;
+ struct pl_frame *image = pass->src_ref < 0 ? NULL : &pass->image;
+ struct pl_frame *target = &pass->target;
+
+ fix_refs_and_rects(pass);
+
+ // Fallback for older ICC profile API
+ icc_fallback(pass, image, &rr->icc_fallback[ICC_IMAGE]);
+ icc_fallback(pass, target, &rr->icc_fallback[ICC_TARGET]);
+
+ // Force colorspace metadata to ICC profile values, if present
+ if (image && image->icc) {
+ image->color.primaries = image->icc->containing_primaries;
+ image->color.hdr = image->icc->csp.hdr;
+ }
+
+ if (target->icc) {
+ target->color.primaries = target->icc->containing_primaries;
+ target->color.hdr = target->icc->csp.hdr;
+ }
+
+ // Infer the target color space info based on the image's
+ if (image) {
+ fix_frame(image);
+ pl_color_space_infer_map(&image->color, &target->color);
+ fix_frame(target); // do this only after infer_map
+ } else {
+ fix_frame(target);
+ pl_color_space_infer(&target->color);
+ }
+
+ // Detect the presence of an alpha channel in the frames and explicitly
+ // default the alpha mode in this case, so we can use it to detect whether
+ // or not to strip the alpha channel during rendering.
+ //
+ // Note the different defaults for the image and target, because files
+ // are usually independent but windowing systems usually expect
+ // premultiplied. (We also premultiply for internal rendering, so this
+ // way of doing it avoids a possible division-by-zero path!)
+ if (image && !image->repr.alpha) {
+ for (int i = 0; i < image->num_planes; i++) {
+ const struct pl_plane *plane = &image->planes[i];
+ for (int c = 0; c < plane->components; c++) {
+ if (plane->component_mapping[c] == PL_CHANNEL_A)
+ image->repr.alpha = PL_ALPHA_INDEPENDENT;
+ }
+ }
+ }
+
+ if (!target->repr.alpha) {
+ for (int i = 0; i < target->num_planes; i++) {
+ const struct pl_plane *plane = &target->planes[i];
+ for (int c = 0; c < plane->components; c++) {
+ if (plane->component_mapping[c] == PL_CHANNEL_A)
+ target->repr.alpha = PL_ALPHA_PREMULTIPLIED;
+ }
+ }
+ }
+}
+
+void pl_frames_infer(pl_renderer rr, struct pl_frame *image,
+ struct pl_frame *target)
+{
+ struct pass_state pass = {
+ .rr = rr,
+ .image = *image,
+ .target = *target,
+ };
+
+ pass_fix_frames(&pass);
+ *image = pass.image;
+ *target = pass.target;
+}
+
+static bool pass_init(struct pass_state *pass, bool acquire_image)
+{
+ struct pl_frame *image = pass->src_ref < 0 ? NULL : &pass->image;
+ struct pl_frame *target = &pass->target;
+
+ if (!acquire_frame(pass, target, &pass->acquired.target))
+ goto error;
+ if (acquire_image && image) {
+ if (!acquire_frame(pass, image, &pass->acquired.image))
+ goto error;
+
+ const struct pl_render_params *params = pass->params;
+ const struct pl_deinterlace_params *deint = params->deinterlace_params;
+ bool needs_refs = image->field != PL_FIELD_NONE && deint &&
+ pl_deinterlace_needs_refs(deint->algo);
+
+ if (image->prev && needs_refs) {
+ // Move into local copy so we can acquire/release it
+ pass->prev = *image->prev;
+ image->prev = &pass->prev;
+ if (!acquire_frame(pass, &pass->prev, &pass->acquired.prev))
+ goto error;
+ }
+ if (image->next && needs_refs) {
+ pass->next = *image->next;
+ image->next = &pass->next;
+ if (!acquire_frame(pass, &pass->next, &pass->acquired.next))
+ goto error;
+ }
+ }
+
+ if (!validate_structs(pass->rr, acquire_image ? image : NULL, target))
+ goto error;
+
+ find_fbo_format(pass);
+ pass_fix_frames(pass);
+
+ pass->tmp = pl_tmp(NULL);
+ return true;
+
+error:
+ pass_uninit(pass);
+ return false;
+}
+
+static void pass_begin_frame(struct pass_state *pass)
+{
+ pl_renderer rr = pass->rr;
+ const struct pl_render_params *params = pass->params;
+
+ pl_dispatch_callback(rr->dp, pass, info_callback);
+ pl_dispatch_reset_frame(rr->dp);
+
+ for (int i = 0; i < params->num_hooks; i++) {
+ if (params->hooks[i]->reset)
+ params->hooks[i]->reset(params->hooks[i]->priv);
+ }
+
+ size_t size = rr->fbos.num * sizeof(bool);
+ pass->fbos_used = pl_realloc(pass->tmp, pass->fbos_used, size);
+ memset(pass->fbos_used, 0, size);
+}
+
+static bool draw_empty_overlays(pl_renderer rr,
+ const struct pl_frame *ptarget,
+ const struct pl_render_params *params)
+{
+ if (!params->skip_target_clearing)
+ pl_frame_clear_rgba(rr->gpu, ptarget, CLEAR_COL(params));
+
+ if (!ptarget->num_overlays)
+ return true;
+
+ struct pass_state pass = {
+ .rr = rr,
+ .params = params,
+ .src_ref = -1,
+ .target = *ptarget,
+ .info.stage = PL_RENDER_STAGE_BLEND,
+ .info.count = 0,
+ };
+
+ if (!pass_init(&pass, false))
+ return false;
+
+ pass_begin_frame(&pass);
+ struct pl_frame *target = &pass.target;
+ pl_tex ref = target->planes[pass.dst_ref].texture;
+ for (int p = 0; p < target->num_planes; p++) {
+ const struct pl_plane *plane = &target->planes[p];
+ // Math replicated from `pass_output_target`
+ float rx = (float) plane->texture->params.w / ref->params.w,
+ ry = (float) plane->texture->params.h / ref->params.h;
+ float rrx = rx >= 1 ? roundf(rx) : 1.0 / roundf(1.0 / rx),
+ rry = ry >= 1 ? roundf(ry) : 1.0 / roundf(1.0 / ry);
+ float sx = plane->shift_x, sy = plane->shift_y;
+
+ pl_transform2x2 tscale = {
+ .mat = {{{ rrx, 0.0 }, { 0.0, rry }}},
+ .c = { -sx, -sy },
+ };
+
+ if (plane->flipped) {
+ tscale.mat.m[1][1] = -tscale.mat.m[1][1];
+ tscale.c[1] += plane->texture->params.h;
+ }
+
+ draw_overlays(&pass, plane->texture, plane->components,
+ plane->component_mapping, target->overlays,
+ target->num_overlays, target->color, target->repr,
+ &tscale);
+ }
+
+ pass_uninit(&pass);
+ return true;
+}
+
+bool pl_render_image(pl_renderer rr, const struct pl_frame *pimage,
+ const struct pl_frame *ptarget,
+ const struct pl_render_params *params)
+{
+ params = PL_DEF(params, &pl_render_default_params);
+ pl_dispatch_mark_dynamic(rr->dp, params->dynamic_constants);
+ if (!pimage)
+ return draw_empty_overlays(rr, ptarget, params);
+
+ struct pass_state pass = {
+ .rr = rr,
+ .params = params,
+ .image = *pimage,
+ .target = *ptarget,
+ .info.stage = PL_RENDER_STAGE_FRAME,
+ };
+
+ if (!pass_init(&pass, true))
+ return false;
+
+ // No-op (empty crop)
+ if (!pl_rect_w(pass.dst_rect) || !pl_rect_h(pass.dst_rect)) {
+ pass_uninit(&pass);
+ return draw_empty_overlays(rr, ptarget, params);
+ }
+
+ pass_begin_frame(&pass);
+ if (!pass_read_image(&pass))
+ goto error;
+ if (!pass_scale_main(&pass))
+ goto error;
+ pass_convert_colors(&pass);
+ if (!pass_output_target(&pass))
+ goto error;
+
+ pass_uninit(&pass);
+ return true;
+
+error:
+ PL_ERR(rr, "Failed rendering image!");
+ pass_uninit(&pass);
+ return false;
+}
+
+const struct pl_frame *pl_frame_mix_current(const struct pl_frame_mix *mix)
+{
+ const struct pl_frame *cur = NULL;
+ for (int i = 0; i < mix->num_frames; i++) {
+ if (mix->timestamps[i] > 0.0f)
+ break;
+ cur = mix->frames[i];
+ }
+
+ return cur;
+}
+
+const struct pl_frame *pl_frame_mix_nearest(const struct pl_frame_mix *mix)
+{
+ if (!mix->num_frames)
+ return NULL;
+
+ const struct pl_frame *best = mix->frames[0];
+ float best_dist = fabsf(mix->timestamps[0]);
+ for (int i = 1; i < mix->num_frames; i++) {
+ float dist = fabsf(mix->timestamps[i]);
+ if (dist < best_dist) {
+ best = mix->frames[i];
+ best_dist = dist;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ return best;
+}
+
+struct params_info {
+ uint64_t hash;
+ bool trivial;
+};
+
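+// Hashes the subset of render params relevant to cached intermediate frames,
+// and flags whether they are trivial enough that caching a single frame is
+// not worthwhile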
+static struct params_info render_params_info(const struct pl_render_params *params_orig)
+{
+ struct pl_render_params params = *params_orig;
+ struct params_info info = {
+ .trivial = true,
+ .hash = 0,
+ };
+
+#define HASH_PTR(ptr, def, ptr_trivial) \
+ do { \
+ if (ptr) { \
+ pl_hash_merge(&info.hash, pl_mem_hash(ptr, sizeof(*ptr))); \
+ info.trivial &= (ptr_trivial); \
+ ptr = NULL; \
+ } else if ((def) != NULL) { \
+ pl_hash_merge(&info.hash, pl_mem_hash(def, sizeof(*ptr))); \
+ } \
+ } while (0)
+
+#define HASH_FILTER(scaler) \
+ do { \
+ if ((scaler == &pl_filter_bilinear || scaler == &pl_filter_nearest) && \
+ params.skip_anti_aliasing) \
+ { \
+ /* treat as NULL */ \
+ } else if (scaler) { \
+ struct pl_filter_config filter = *scaler; \
+ HASH_PTR(filter.kernel, NULL, false); \
+ HASH_PTR(filter.window, NULL, false); \
+ pl_hash_merge(&info.hash, pl_var_hash(filter)); \
+ scaler = NULL; \
+ } \
+ } while (0)
+
+ HASH_FILTER(params.upscaler);
+ HASH_FILTER(params.downscaler);
+
+ HASH_PTR(params.deband_params, NULL, false);
+ HASH_PTR(params.sigmoid_params, NULL, false);
+ HASH_PTR(params.deinterlace_params, NULL, false);
+ HASH_PTR(params.cone_params, NULL, true);
+ HASH_PTR(params.icc_params, &pl_icc_default_params, true);
+ HASH_PTR(params.color_adjustment, &pl_color_adjustment_neutral, true);
+ HASH_PTR(params.color_map_params, &pl_color_map_default_params, true);
+ HASH_PTR(params.peak_detect_params, NULL, false);
+
+ // Hash all hooks
+ for (int i = 0; i < params.num_hooks; i++) {
+ const struct pl_hook *hook = params.hooks[i];
+ if (hook->stages == PL_HOOK_OUTPUT)
+ continue; // ignore hooks only relevant to pass_output_target
+ pl_hash_merge(&info.hash, pl_var_hash(*hook));
+ info.trivial = false;
+ }
+ params.hooks = NULL;
+
+ // Hash the LUT by only looking at the signature
+ if (params.lut) {
+ pl_hash_merge(&info.hash, params.lut->signature);
+ info.trivial = false;
+ params.lut = NULL;
+ }
+
+#define CLEAR(field) field = (__typeof__(field)) {0}
+
+ // Clear out fields only relevant to pl_render_image_mix
+ CLEAR(params.frame_mixer);
+ CLEAR(params.preserve_mixing_cache);
+ CLEAR(params.skip_caching_single_frame);
+ memset(params.background_color, 0, sizeof(params.background_color));
+ CLEAR(params.background_transparency);
+ CLEAR(params.skip_target_clearing);
+ CLEAR(params.blend_against_tiles);
+ memset(params.tile_colors, 0, sizeof(params.tile_colors));
+ CLEAR(params.tile_size);
+
+ // Clear out fields only relevant to pass_output_target
+ CLEAR(params.blend_params);
+ CLEAR(params.distort_params);
+ CLEAR(params.dither_params);
+ CLEAR(params.error_diffusion);
+ CLEAR(params.force_dither);
+ CLEAR(params.corner_rounding);
+
+ // Clear out other irrelevant fields
+ CLEAR(params.dynamic_constants);
+ CLEAR(params.info_callback);
+ CLEAR(params.info_priv);
+
+ pl_hash_merge(&info.hash, pl_var_hash(params));
+ return info;
+}
+
+#define MAX_MIX_FRAMES 16
+
+bool pl_render_image_mix(pl_renderer rr, const struct pl_frame_mix *images,
+ const struct pl_frame *ptarget,
+ const struct pl_render_params *params)
+{
+ if (!images->num_frames)
+ return pl_render_image(rr, NULL, ptarget, params);
+
+ params = PL_DEF(params, &pl_render_default_params);
+ struct params_info par_info = render_params_info(params);
+ pl_dispatch_mark_dynamic(rr->dp, params->dynamic_constants);
+
+ require(images->num_frames >= 1);
+ require(images->vsync_duration > 0.0);
+ for (int i = 0; i < images->num_frames - 1; i++)
+ require(images->timestamps[i] <= images->timestamps[i+1]);
+
+ const struct pl_frame *refimg = pl_frame_mix_nearest(images);
+ struct pass_state pass = {
+ .rr = rr,
+ .params = params,
+ .image = *refimg,
+ .target = *ptarget,
+ .info.stage = PL_RENDER_STAGE_BLEND,
+ };
+
+ if (rr->errors & PL_RENDER_ERR_FRAME_MIXING)
+ goto fallback;
+ if (!pass_init(&pass, false))
+ return false;
+ if (!pass.fbofmt[4])
+ goto fallback;
+
+ const struct pl_frame *target = &pass.target;
+ int out_w = abs(pl_rect_w(pass.dst_rect)),
+ out_h = abs(pl_rect_h(pass.dst_rect));
+ if (!out_w || !out_h)
+ goto fallback;
+
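+ // Each contributing source frame is rendered once into a cached texture
+ // at the output size; the cached textures are then mixed together using
+ // the weights computed below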
+ int fidx = 0;
+ struct cached_frame frames[MAX_MIX_FRAMES];
+ float weights[MAX_MIX_FRAMES];
+ float wsum = 0.0;
+
+ // Garbage collect the cache: mark all cached frames for eviction, and
+ // unmark the entries that turn out to still be required below
+ for (int i = 0; i < rr->frames.num; i++)
+ rr->frames.elem[i].evict = true;
+
+ // Widen (blur) the frame mixer kernel according to the ratio of vsync
+ // duration to source frame duration
+ struct pl_filter_config mixer;
+ if (params->frame_mixer) {
+ mixer = *params->frame_mixer;
+ mixer.blur = PL_DEF(mixer.blur, 1.0);
+ for (int i = 1; i < images->num_frames; i++) {
+ if (images->timestamps[i] >= 0.0 && images->timestamps[i - 1] < 0) {
+ float frame_dur = images->timestamps[i] - images->timestamps[i - 1];
+ if (images->vsync_duration > frame_dur && !params->skip_anti_aliasing)
+ mixer.blur *= images->vsync_duration / frame_dur;
+ break;
+ }
+ }
+ }
+
+ // Traverse the input frames and determine/prepare the ones we need
+ bool single_frame = !params->frame_mixer || images->num_frames == 1;
+retry:
+ for (int i = 0; i < images->num_frames; i++) {
+ uint64_t sig = images->signatures[i];
+ float rts = images->timestamps[i];
+ const struct pl_frame *img = images->frames[i];
+ PL_TRACE(rr, "Considering image with signature 0x%llx, rts %f",
+ (unsigned long long) sig, rts);
+
+ // Combining images with different rotations is basically unfeasible
+ if (pl_rotation_normalize(img->rotation - refimg->rotation)) {
+ PL_TRACE(rr, " -> Skipping: incompatible rotation");
+ continue;
+ }
+
+ float weight;
+ if (single_frame) {
+
+ // Only render the refimg, ignore others
+ if (img == refimg) {
+ weight = 1.0;
+ } else {
+ PL_TRACE(rr, " -> Skipping: no frame mixer");
+ continue;
+ }
+
+ // For backwards compatibility, treat !kernel as oversample
+ } else if (!mixer.kernel || mixer.kernel == &pl_filter_function_oversample) {
+
+ // Compute the visible interval [rts, end] of this frame
+ float end = i+1 < images->num_frames ? images->timestamps[i+1] : INFINITY;
+ if (rts > images->vsync_duration || end < 0.0) {
+ PL_TRACE(rr, " -> Skipping: no intersection with vsync");
+ continue;
+ } else {
+ rts = PL_MAX(rts, 0.0);
+ end = PL_MIN(end, images->vsync_duration);
+ pl_assert(end >= rts);
+ }
+
+ // Weight is the fraction of vsync interval that frame is visible
+ weight = (end - rts) / images->vsync_duration;
+ PL_TRACE(rr, " -> Frame [%f, %f] intersects [%f, %f] = weight %f",
+ rts, end, 0.0, images->vsync_duration, weight);
+
+ if (weight < mixer.kernel->params[0]) {
+ PL_TRACE(rr, " (culling due to threshold)");
+ weight = 0.0;
+ }
+
+ } else {
+
+ const float radius = pl_filter_radius_bound(&mixer);
+ if (fabsf(rts) >= radius) {
+ PL_TRACE(rr, " -> Skipping: outside filter radius (%f)", radius);
+ continue;
+ }
+
+ // Weight is directly sampled from the filter
+ weight = pl_filter_sample(&mixer, rts);
+ PL_TRACE(rr, " -> Filter offset %f = weight %f", rts, weight);
+
+ }
+
+ struct cached_frame *f = NULL;
+ for (int j = 0; j < rr->frames.num; j++) {
+ if (rr->frames.elem[j].signature == sig) {
+ f = &rr->frames.elem[j];
+ f->evict = false;
+ break;
+ }
+ }
+
+ // Skip frames with negligible contributions. Do this after the loop
+ // above to make sure these frames don't get evicted just yet, and
+ // also exclude the reference image from this optimization to ensure
+ // that we always have at least one frame.
+ const float cutoff = 1e-3;
+ if (fabsf(weight) <= cutoff && img != refimg) {
+ PL_TRACE(rr, " -> Skipping: weight (%f) below threshold (%f)",
+ weight, cutoff);
+ continue;
+ }
+
+ bool skip_cache = single_frame && (params->skip_caching_single_frame || par_info.trivial);
+ if (!f && skip_cache) {
+ PL_TRACE(rr, "Single frame not found in cache, bypassing");
+ goto fallback;
+ }
+
+ if (!f) {
+ // Signature does not exist in the cache at all yet,
+ // so grow the cache by this entry.
+ PL_ARRAY_GROW(rr, rr->frames);
+ f = &rr->frames.elem[rr->frames.num++];
+ *f = (struct cached_frame) {
+ .signature = sig,
+ };
+ }
+
+ // Check to see if we can blindly reuse this cache entry. This is the
+ // case if either the params are compatible, or the user doesn't care
+ bool can_reuse = f->tex;
+ bool strict_reuse = skip_cache || single_frame ||
+ !params->preserve_mixing_cache;
+ if (can_reuse && strict_reuse) {
+ can_reuse = f->tex->params.w == out_w &&
+ f->tex->params.h == out_h &&
+ pl_rect2d_eq(f->crop, img->crop) &&
+ f->params_hash == par_info.hash &&
+ pl_color_space_equal(&f->color, &target->color) &&
+ pl_icc_profile_equal(&f->profile, &target->profile);
+ }
+
+ if (!can_reuse && skip_cache) {
+ PL_TRACE(rr, "Single frame cache entry invalid, bypassing");
+ goto fallback;
+ }
+
+ if (!can_reuse) {
+ // If we can't reuse the entry, we need to re-render this frame
+ PL_TRACE(rr, " -> Cached texture missing or invalid.. (re)creating");
+ if (!f->tex) {
+ if (PL_ARRAY_POP(rr->frame_fbos, &f->tex))
+ pl_tex_invalidate(rr->gpu, f->tex);
+ }
+
+ bool ok = pl_tex_recreate(rr->gpu, &f->tex, pl_tex_params(
+ .w = out_w,
+ .h = out_h,
+ .format = pass.fbofmt[4],
+ .sampleable = true,
+ .renderable = true,
+ .blit_dst = pass.fbofmt[4]->caps & PL_FMT_CAP_BLITTABLE,
+ .storable = pass.fbofmt[4]->caps & PL_FMT_CAP_STORABLE,
+ ));
+
+ if (!ok) {
+ PL_ERR(rr, "Could not create intermediate texture for "
+ "frame mixing.. disabling!");
+ rr->errors |= PL_RENDER_ERR_FRAME_MIXING;
+ goto fallback;
+ }
+
+ struct pass_state inter_pass = {
+ .rr = rr,
+ .params = pass.params,
+ .image = *img,
+ .target = *ptarget,
+ .info.stage = PL_RENDER_STAGE_FRAME,
+ .acquired = pass.acquired,
+ };
+
+ // Render a single frame up to `pass_output_target`
+ memcpy(inter_pass.fbofmt, pass.fbofmt, sizeof(pass.fbofmt));
+ if (!pass_init(&inter_pass, true))
+ goto fail;
+
+ pass_begin_frame(&inter_pass);
+ if (!(ok = pass_read_image(&inter_pass)))
+ goto inter_pass_error;
+ if (!(ok = pass_scale_main(&inter_pass)))
+ goto inter_pass_error;
+ pass_convert_colors(&inter_pass);
+
+ pl_assert(inter_pass.img.sh); // guaranteed by `pass_convert_colors`
+ pl_shader_set_alpha(inter_pass.img.sh, &inter_pass.img.repr,
+ PL_ALPHA_PREMULTIPLIED); // for frame mixing
+
+ pl_assert(inter_pass.img.w == out_w &&
+ inter_pass.img.h == out_h);
+
+ ok = pl_dispatch_finish(rr->dp, pl_dispatch_params(
+ .shader = &inter_pass.img.sh,
+ .target = f->tex,
+ ));
+ if (!ok)
+ goto inter_pass_error;
+
+ float sx = out_w / pl_rect_w(inter_pass.dst_rect),
+ sy = out_h / pl_rect_h(inter_pass.dst_rect);
+
+ pl_transform2x2 shift = {
+ .mat.m = {{ sx, 0, }, { 0, sy, }},
+ .c = {
+ -sx * inter_pass.dst_rect.x0,
+ -sy * inter_pass.dst_rect.y0
+ },
+ };
+
+ if (inter_pass.rotation % PL_ROTATION_180 == PL_ROTATION_90) {
+ PL_SWAP(shift.mat.m[0][0], shift.mat.m[0][1]);
+ PL_SWAP(shift.mat.m[1][0], shift.mat.m[1][1]);
+ }
+
+ draw_overlays(&inter_pass, f->tex, inter_pass.img.comps, NULL,
+ inter_pass.image.overlays,
+ inter_pass.image.num_overlays,
+ inter_pass.img.color,
+ inter_pass.img.repr,
+ &shift);
+
+ f->params_hash = par_info.hash;
+ f->crop = img->crop;
+ f->color = inter_pass.img.color;
+ f->comps = inter_pass.img.comps;
+ f->profile = target->profile;
+ // fall through
+
+inter_pass_error:
+ inter_pass.acquired.target = false; // don't release target
+ pass_uninit(&inter_pass);
+ if (!ok)
+ goto fail;
+ }
+
+ pl_assert(fidx < MAX_MIX_FRAMES);
+ frames[fidx] = *f;
+ weights[fidx] = weight;
+ wsum += weight;
+ fidx++;
+ }
+
+ // Evict the frames we *don't* need
+ for (int i = 0; i < rr->frames.num; ) {
+ if (rr->frames.elem[i].evict) {
+ PL_TRACE(rr, "Evicting frame with signature %llx from cache",
+ (unsigned long long) rr->frames.elem[i].signature);
+ PL_ARRAY_APPEND(rr, rr->frame_fbos, rr->frames.elem[i].tex);
+ PL_ARRAY_REMOVE_AT(rr->frames, i);
+ continue;
+ } else {
+ i++;
+ }
+ }
+
+ // If we got back no frames, retry with ZOH semantics
+ if (!fidx) {
+ pl_assert(!single_frame);
+ single_frame = true;
+ goto retry;
+ }
+
+ // Sample and mix the output color
+ pass_begin_frame(&pass);
+ pass.info.count = fidx;
+ pl_assert(fidx > 0);
+
+ pl_shader sh = pl_dispatch_begin(rr->dp);
+ sh_describef(sh, "frame mixing (%d frame%s)", fidx, fidx > 1 ? "s" : "");
+ sh->output = PL_SHADER_SIG_COLOR;
+ sh->output_w = out_w;
+ sh->output_h = out_h;
+
+ GLSL("vec4 color; \n"
+ "// pl_render_image_mix \n"
+ "{ \n"
+ "vec4 mix_color = vec4(0.0); \n");
+
+ int comps = 0;
+ for (int i = 0; i < fidx; i++) {
+ const struct pl_tex_params *tpars = &frames[i].tex->params;
+
+ // Use linear sampling if desired and possible
+ enum pl_tex_sample_mode sample_mode = PL_TEX_SAMPLE_NEAREST;
+ if ((tpars->w != out_w || tpars->h != out_h) &&
+ (tpars->format->caps & PL_FMT_CAP_LINEAR))
+ {
+ sample_mode = PL_TEX_SAMPLE_LINEAR;
+ }
+
+ ident_t pos, tex = sh_bind(sh, frames[i].tex, PL_TEX_ADDRESS_CLAMP,
+ sample_mode, "frame", NULL, &pos, NULL);
+
+ GLSL("color = textureLod("$", "$", 0.0); \n", tex, pos);
+
+ // Note: This ignores differences in ICC profile, which we decide to
+ // just simply not care about. Doing that properly would require
+ // converting between different image profiles, and the headache of
+ // finagling that state is just not worth it because this is an
+ // exceptionally unlikely hypothetical.
+ //
+ // This also ignores differences in HDR metadata, which we deliberately
+ // ignore because it causes aggressive shader recompilation.
+ struct pl_color_space frame_csp = frames[i].color;
+ struct pl_color_space mix_csp = target->color;
+ frame_csp.hdr = mix_csp.hdr = (struct pl_hdr_metadata) {0};
+ pl_shader_color_map_ex(sh, NULL, pl_color_map_args(frame_csp, mix_csp));
+
+ float weight = weights[i] / wsum;
+ GLSL("mix_color += vec4("$") * color; \n", SH_FLOAT_DYN(weight));
+ comps = PL_MAX(comps, frames[i].comps);
+ }
+
+ GLSL("color = mix_color; \n"
+ "} \n");
+
+ // Dispatch this to the destination
+ pass.img = (struct img) {
+ .sh = sh,
+ .w = out_w,
+ .h = out_h,
+ .comps = comps,
+ .color = target->color,
+ .repr = {
+ .sys = PL_COLOR_SYSTEM_RGB,
+ .levels = PL_COLOR_LEVELS_PC,
+ .alpha = comps >= 4 ? PL_ALPHA_PREMULTIPLIED : PL_ALPHA_UNKNOWN,
+ },
+ };
+
+ if (!pass_output_target(&pass))
+ goto fallback;
+
+ pass_uninit(&pass);
+ return true;
+
+fail:
+ PL_ERR(rr, "Could not render image for frame mixing.. disabling!");
+ rr->errors |= PL_RENDER_ERR_FRAME_MIXING;
+ // fall through
+
+fallback:
+ pass_uninit(&pass);
+ return pl_render_image(rr, refimg, ptarget, params);
+
+error: // for parameter validation failures
+ return false;
+}
+
+void pl_frames_infer_mix(pl_renderer rr, const struct pl_frame_mix *mix,
+ struct pl_frame *target, struct pl_frame *out_ref)
+{
+ struct pass_state pass = {
+ .rr = rr,
+ .target = *target,
+ };
+
+ const struct pl_frame *refimg = pl_frame_mix_nearest(mix);
+ if (refimg) {
+ pass.image = *refimg;
+ } else {
+ pass.src_ref = -1;
+ }
+
+ pass_fix_frames(&pass);
+ *target = pass.target;
+ if (out_ref)
+ *out_ref = pass.image;
+}
+
+void pl_frame_set_chroma_location(struct pl_frame *frame,
+ enum pl_chroma_location chroma_loc)
+{
+ pl_tex ref = frame->planes[frame_ref(frame)].texture;
+
+ if (ref) {
+ // Texture dimensions are already known, so apply the chroma location
+ // only to subsampled planes
+ int ref_w = ref->params.w, ref_h = ref->params.h;
+
+ for (int i = 0; i < frame->num_planes; i++) {
+ struct pl_plane *plane = &frame->planes[i];
+ pl_tex tex = plane->texture;
+ bool subsampled = tex->params.w < ref_w || tex->params.h < ref_h;
+ if (subsampled)
+ pl_chroma_location_offset(chroma_loc, &plane->shift_x, &plane->shift_y);
+ }
+ } else {
+ // Texture dimensions are not yet known, so apply the chroma location
+ // to all chroma planes, regardless of subsampling
+ for (int i = 0; i < frame->num_planes; i++) {
+ struct pl_plane *plane = &frame->planes[i];
+ if (detect_plane_type(plane, &frame->repr) == PLANE_CHROMA)
+ pl_chroma_location_offset(chroma_loc, &plane->shift_x, &plane->shift_y);
+ }
+ }
+}
+
+void pl_frame_from_swapchain(struct pl_frame *out_frame,
+ const struct pl_swapchain_frame *frame)
+{
+ pl_tex fbo = frame->fbo;
+ int num_comps = fbo->params.format->num_components;
+ if (!frame->color_repr.alpha)
+ num_comps = PL_MIN(num_comps, 3);
+
+ *out_frame = (struct pl_frame) {
+ .num_planes = 1,
+ .planes = {{
+ .texture = fbo,
+ .flipped = frame->flipped,
+ .components = num_comps,
+ .component_mapping = {0, 1, 2, 3},
+ }},
+ .crop = { 0, 0, fbo->params.w, fbo->params.h },
+ .repr = frame->color_repr,
+ .color = frame->color_space,
+ };
+}
+
+bool pl_frame_is_cropped(const struct pl_frame *frame)
+{
+ int x0 = roundf(PL_MIN(frame->crop.x0, frame->crop.x1)),
+ y0 = roundf(PL_MIN(frame->crop.y0, frame->crop.y1)),
+ x1 = roundf(PL_MAX(frame->crop.x0, frame->crop.x1)),
+ y1 = roundf(PL_MAX(frame->crop.y0, frame->crop.y1));
+
+ pl_tex ref = frame->planes[frame_ref(frame)].texture;
+ pl_assert(ref);
+
+ if (!x0 && !x1)
+ x1 = ref->params.w;
+ if (!y0 && !y1)
+ y1 = ref->params.h;
+
+ return x0 > 0 || y0 > 0 || x1 < ref->params.w || y1 < ref->params.h;
+}
+
+void pl_frame_clear_rgba(pl_gpu gpu, const struct pl_frame *frame,
+ const float rgba[4])
+{
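+ // Convert the RGB clear color into the frame's native representation by
+ // inverting the decode transform, then scatter the resulting components
+ // across the individual planes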
+ struct pl_color_repr repr = frame->repr;
+ pl_transform3x3 tr = pl_color_repr_decode(&repr, NULL);
+ pl_transform3x3_invert(&tr);
+
+ float encoded[3] = { rgba[0], rgba[1], rgba[2] };
+ pl_transform3x3_apply(&tr, encoded);
+
+ float mult = frame->repr.alpha == PL_ALPHA_PREMULTIPLIED ? rgba[3] : 1.0;
+ for (int p = 0; p < frame->num_planes; p++) {
+ const struct pl_plane *plane = &frame->planes[p];
+ float clear[4] = { 0.0, 0.0, 0.0, rgba[3] };
+ for (int c = 0; c < plane->components; c++) {
+ int ch = plane->component_mapping[c];
+ if (ch >= 0 && ch < 3)
+ clear[c] = mult * encoded[plane->component_mapping[c]];
+ }
+
+ pl_tex_clear(gpu, plane->texture, clear);
+ }
+}
+
+struct pl_render_errors pl_renderer_get_errors(pl_renderer rr)
+{
+ return (struct pl_render_errors) {
+ .errors = rr->errors,
+ .disabled_hooks = rr->disabled_hooks.elem,
+ .num_disabled_hooks = rr->disabled_hooks.num,
+ };
+}
+
+void pl_renderer_reset_errors(pl_renderer rr,
+ const struct pl_render_errors *errors)
+{
+ if (!errors) {
+ // Reset everything
+ rr->errors = PL_RENDER_ERR_NONE;
+ rr->disabled_hooks.num = 0;
+ return;
+ }
+
+ // Reset only requested errors
+ rr->errors &= ~errors->errors;
+
+ // Hook errors were not requested to be reset, so leave disabled hooks alone
+ if (!(errors->errors & PL_RENDER_ERR_HOOKS))
+ goto done;
+
+ // Remove all hook signatures
+ if (!errors->num_disabled_hooks) {
+ rr->disabled_hooks.num = 0;
+ goto done;
+ }
+
+ // At this point we require a valid array of hooks
+ if (!errors->disabled_hooks) {
+ assert(errors->disabled_hooks);
+ goto done;
+ }
+
+ for (int i = 0; i < errors->num_disabled_hooks; i++) {
+ for (int j = 0; j < rr->disabled_hooks.num; j++) {
+ // Remove only requested hook signatures
+ if (rr->disabled_hooks.elem[j] == errors->disabled_hooks[i]) {
+ PL_ARRAY_REMOVE_AT(rr->disabled_hooks, j);
+ break;
+ }
+ }
+ }
+
+ done:
+ if (rr->disabled_hooks.num)
+ rr->errors |= PL_RENDER_ERR_HOOKS;
+ return;
+}
diff --git a/src/shaders.c b/src/shaders.c
new file mode 100644
index 0000000..503ea78
--- /dev/null
+++ b/src/shaders.c
@@ -0,0 +1,992 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <math.h>
+
+#include "common.h"
+#include "log.h"
+#include "shaders.h"
+
+pl_shader_info pl_shader_info_ref(pl_shader_info pinfo)
+{
+ struct sh_info *info = (struct sh_info *) pinfo;
+ if (!info)
+ return NULL;
+
+ pl_rc_ref(&info->rc);
+ return &info->info;
+}
+
+void pl_shader_info_deref(pl_shader_info *pinfo)
+{
+ struct sh_info *info = (struct sh_info *) *pinfo;
+ if (!info)
+ return;
+
+ if (pl_rc_deref(&info->rc))
+ pl_free(info);
+ *pinfo = NULL;
+}
+
+static struct sh_info *sh_info_alloc(void *alloc)
+{
+ struct sh_info *info = pl_zalloc_ptr(alloc, info);
+ info->tmp = pl_tmp(info);
+ pl_rc_init(&info->rc);
+ return info;
+}
+
+// Re-use the `sh_info` allocation if possible, otherwise allocate a new one
+static struct sh_info *sh_info_recycle(struct sh_info *info)
+{
+ if (!pl_rc_deref(&info->rc))
+ return sh_info_alloc(NULL);
+
+ memset(&info->info, 0, sizeof(info->info)); // reset public fields
+ pl_free_children(info->tmp);
+ pl_rc_ref(&info->rc);
+ info->desc.len = 0;
+ info->steps.num = 0;
+ return info;
+}
+
+static uint8_t reverse_bits(uint8_t x)
+{
+ static const uint8_t reverse_nibble[16] = {
+ 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+ 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
+ };
+
+ return reverse_nibble[x & 0xF] << 4 | reverse_nibble[x >> 4];
+}
+
+static void init_shader(pl_shader sh, const struct pl_shader_params *params)
+{
+ if (params) {
+ sh->info->info.params = *params;
+
+ // To avoid collisions for shaders with a very high number of
+ // identifiers, pack the shader ID into the highest bits (MSB -> LSB)
+ pl_static_assert(sizeof(sh->prefix) > sizeof(params->id));
+ const int shift = 8 * (sizeof(sh->prefix) - sizeof(params->id));
+ sh->prefix = reverse_bits(params->id) << shift;
+ }
+
+ sh->name = sh_fresh(sh, "main");
+}
+
+pl_shader pl_shader_alloc(pl_log log, const struct pl_shader_params *params)
+{
+ static const int glsl_ver_req = 130;
+ if (params && params->glsl.version && params->glsl.version < glsl_ver_req) {
+ pl_err(log, "Requested GLSL version %d too low (required: %d)",
+ params->glsl.version, glsl_ver_req);
+ return NULL;
+ }
+
+ pl_shader sh = pl_alloc_ptr(NULL, sh);
+ *sh = (struct pl_shader_t) {
+ .log = log,
+ .tmp = pl_tmp(sh),
+ .info = sh_info_alloc(NULL),
+ .mutable = true,
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(sh->buffers); i++)
+ sh->buffers[i] = pl_str_builder_alloc(sh);
+
+ init_shader(sh, params);
+ return sh;
+}
+
+static void sh_obj_deref(pl_shader_obj obj);
+
+void sh_deref(pl_shader sh)
+{
+ pl_free_children(sh->tmp);
+
+ for (int i = 0; i < sh->obj.num; i++)
+ sh_obj_deref(sh->obj.elem[i]);
+ sh->obj.num = 0;
+}
+
+void pl_shader_free(pl_shader *psh)
+{
+ pl_shader sh = *psh;
+ if (!sh)
+ return;
+
+ sh_deref(sh);
+ pl_shader_info_deref((pl_shader_info *) &sh->info);
+ pl_free_ptr(psh);
+}
+
+void pl_shader_reset(pl_shader sh, const struct pl_shader_params *params)
+{
+ sh_deref(sh);
+
+ struct pl_shader_t new = {
+ .log = sh->log,
+ .tmp = sh->tmp,
+ .info = sh_info_recycle(sh->info),
+ .data.buf = sh->data.buf,
+ .mutable = true,
+
+ // Preserve array allocations
+ .obj.elem = sh->obj.elem,
+ .vas.elem = sh->vas.elem,
+ .vars.elem = sh->vars.elem,
+ .descs.elem = sh->descs.elem,
+ .consts.elem = sh->consts.elem,
+ };
+
+ // Preserve buffer allocations
+ memcpy(new.buffers, sh->buffers, sizeof(new.buffers));
+ for (int i = 0; i < PL_ARRAY_SIZE(new.buffers); i++)
+ pl_str_builder_reset(new.buffers[i]);
+
+ *sh = new;
+ init_shader(sh, params);
+}
+
+static void *sh_alloc(pl_shader sh, size_t size, size_t align)
+{
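+ // Bump allocator over `sh->data`: align the current offset and carve out
+ // `size` bytes, growing the backing buffer when it runs out of space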
+ const size_t offset = PL_ALIGN2(sh->data.len, align);
+ const size_t req_size = offset + size;
+ if (req_size <= pl_get_size(sh->data.buf)) {
+ sh->data.len = offset + size;
+ return sh->data.buf + offset;
+ }
+
+ // We can't realloc this buffer because various pointers will be left
+ // dangling, so just reparent it onto `sh->tmp` (so it will be cleaned
+ // up when the shader is next reset) and allocate a new, larger buffer
+ // in its place
+ const size_t new_size = PL_MAX(req_size << 1, 256);
+ pl_steal(sh->tmp, sh->data.buf);
+ sh->data.buf = pl_alloc(sh, new_size);
+ sh->data.len = size;
+ return sh->data.buf;
+}
+
+static void *sh_memdup(pl_shader sh, const void *data, size_t size, size_t align)
+{
+ if (!size)
+ return NULL;
+
+ void *dst = sh_alloc(sh, size, align);
+ assert(data);
+ memcpy(dst, data, size);
+ return dst;
+}
+
+bool pl_shader_is_failed(const pl_shader sh)
+{
+ return sh->failed;
+}
+
+struct pl_glsl_version sh_glsl(const pl_shader sh)
+{
+ if (SH_PARAMS(sh).glsl.version)
+ return SH_PARAMS(sh).glsl;
+
+ if (SH_GPU(sh))
+ return SH_GPU(sh)->glsl;
+
+ return (struct pl_glsl_version) { .version = 130 };
+}
+
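+// Attempts to turn this shader into (or merge it with) a compute shader with
+// the given block size and shared memory requirement; returns false if this
+// is not possible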
+bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem)
+{
+ pl_assert(bw && bh);
+ int *sh_bw = &sh->group_size[0];
+ int *sh_bh = &sh->group_size[1];
+
+ struct pl_glsl_version glsl = sh_glsl(sh);
+ if (!glsl.compute) {
+ PL_TRACE(sh, "Disabling compute shader due to missing `compute` support");
+ return false;
+ }
+
+ if (sh->shmem + mem > glsl.max_shmem_size) {
+ PL_TRACE(sh, "Disabling compute shader due to insufficient shmem");
+ return false;
+ }
+
+ if (sh->type == SH_FRAGMENT) {
+ PL_TRACE(sh, "Disabling compute shader because shader is already marked "
+ "as fragment shader");
+ return false;
+ }
+
+ if (bw > glsl.max_group_size[0] ||
+ bh > glsl.max_group_size[1] ||
+ (bw * bh) > glsl.max_group_threads)
+ {
+ if (!flex) {
+ PL_TRACE(sh, "Disabling compute shader due to exceeded group "
+ "thread count.");
+ return false;
+ } else {
+ // Pick better group sizes
+ bw = PL_MIN(bw, glsl.max_group_size[0]);
+ bh = glsl.max_group_threads / bw;
+ }
+ }
+
+ sh->shmem += mem;
+
+ // If the current shader is either not a compute shader, or we have no
+ // choice but to override the metadata, always do so
+ if (sh->type != SH_COMPUTE || (sh->flexible_work_groups && !flex)) {
+ *sh_bw = bw;
+ *sh_bh = bh;
+ sh->type = SH_COMPUTE;
+ sh->flexible_work_groups = flex;
+ return true;
+ }
+
+ // If both shaders are flexible, pick the larger of the two
+ if (sh->flexible_work_groups && flex) {
+ *sh_bw = PL_MAX(*sh_bw, bw);
+ *sh_bh = PL_MAX(*sh_bh, bh);
+ pl_assert(*sh_bw * *sh_bh <= glsl.max_group_threads);
+ return true;
+ }
+
+ // At this point we're looking only at a non-flexible compute shader
+ pl_assert(sh->type == SH_COMPUTE && !sh->flexible_work_groups);
+ if (!flex) {
+ // Ensure parameters match
+ if (bw != *sh_bw || bh != *sh_bh) {
+ PL_TRACE(sh, "Disabling compute shader due to incompatible group "
+ "sizes %dx%d and %dx%d", *sh_bw, *sh_bh, bw, bh);
+ sh->shmem -= mem;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool pl_shader_is_compute(const pl_shader sh)
+{
+ return sh->type == SH_COMPUTE;
+}
+
+bool pl_shader_output_size(const pl_shader sh, int *w, int *h)
+{
+ if (!sh->output_w || !sh->output_h)
+ return false;
+
+ *w = sh->transpose ? sh->output_h : sh->output_w;
+ *h = sh->transpose ? sh->output_w : sh->output_h;
+ return true;
+}
+
+ident_t sh_fresh(pl_shader sh, const char *name)
+{
+ unsigned short id = ++sh->fresh;
+ assert(!(sh->prefix & id));
+ id |= sh->prefix;
+
+ assert(name);
+ return sh_mkident(id, name);
+}
+
+static inline ident_t sh_fresh_name(pl_shader sh, const char **pname)
+{
+ ident_t id = sh_fresh(sh, *pname);
+ *pname = sh_ident_pack(id);
+ return id;
+}
+
+ident_t sh_var(pl_shader sh, struct pl_shader_var sv)
+{
+ ident_t id = sh_fresh_name(sh, &sv.var.name);
+ struct pl_var_layout layout = pl_var_host_layout(0, &sv.var);
+ sv.data = sh_memdup(sh, sv.data, layout.size, layout.stride);
+ PL_ARRAY_APPEND(sh, sh->vars, sv);
+ return id;
+}
+
+ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic)
+{
+ return sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_int(name),
+ .data = &val,
+ .dynamic = dynamic,
+ });
+}
+
+ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic)
+{
+ return sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_uint(name),
+ .data = &val,
+ .dynamic = dynamic,
+ });
+}
+
+ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic)
+{
+ return sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float(name),
+ .data = &val,
+ .dynamic = dynamic,
+ });
+}
+
+ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val)
+{
+ return sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3(name),
+ .data = PL_TRANSPOSE_3X3(val.m),
+ });
+}
+
+ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd)
+{
+ switch (sd.desc.type) {
+ case PL_DESC_BUF_UNIFORM:
+ case PL_DESC_BUF_STORAGE:
+ for (int i = 0; i < sh->descs.num; i++) // ensure uniqueness
+ pl_assert(sh->descs.elem[i].binding.object != sd.binding.object);
+ size_t bsize = sizeof(sd.buffer_vars[0]) * sd.num_buffer_vars;
+ sd.buffer_vars = sh_memdup(sh, sd.buffer_vars, bsize,
+ alignof(struct pl_buffer_var));
+ for (int i = 0; i < sd.num_buffer_vars; i++) {
+ struct pl_var *bv = &sd.buffer_vars[i].var;
+ const char *name = bv->name;
+ GLSLP("#define %s "$"\n", name, sh_fresh_name(sh, &bv->name));
+ }
+ break;
+
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG:
+ pl_assert(!sd.num_buffer_vars);
+ break;
+
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ ident_t id = sh_fresh_name(sh, &sd.desc.name);
+ PL_ARRAY_APPEND(sh, sh->descs, sd);
+ return id;
+}
+
+ident_t sh_const(pl_shader sh, struct pl_shader_const sc)
+{
+ if (SH_PARAMS(sh).dynamic_constants && !sc.compile_time) {
+ return sh_var(sh, (struct pl_shader_var) {
+ .var = {
+ .name = sc.name,
+ .type = sc.type,
+ .dim_v = 1,
+ .dim_m = 1,
+ .dim_a = 1,
+ },
+ .data = sc.data,
+ });
+ }
+
+ ident_t id = sh_fresh_name(sh, &sc.name);
+
+ pl_gpu gpu = SH_GPU(sh);
+ if (gpu && gpu->limits.max_constants) {
+ if (!sc.compile_time || gpu->limits.array_size_constants) {
+ size_t size = pl_var_type_size(sc.type);
+ sc.data = sh_memdup(sh, sc.data, size, size);
+ PL_ARRAY_APPEND(sh, sh->consts, sc);
+ return id;
+ }
+ }
+
+ // Fallback for GPUs without specialization constants
+ switch (sc.type) {
+ case PL_VAR_SINT:
+ GLSLH("const int "$" = %d; \n", id, *(int *) sc.data);
+ return id;
+ case PL_VAR_UINT:
+ GLSLH("const uint "$" = uint(%u); \n", id, *(unsigned int *) sc.data);
+ return id;
+ case PL_VAR_FLOAT:
+ GLSLH("const float "$" = float(%f); \n", id, *(float *) sc.data);
+ return id;
+ case PL_VAR_INVALID:
+ case PL_VAR_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+ident_t sh_const_int(pl_shader sh, const char *name, int val)
+{
+ return sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_SINT,
+ .name = name,
+ .data = &val,
+ });
+}
+
+ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val)
+{
+ return sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_UINT,
+ .name = name,
+ .data = &val,
+ });
+}
+
+ident_t sh_const_float(pl_shader sh, const char *name, float val)
+{
+ return sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_FLOAT,
+ .name = name,
+ .data = &val,
+ });
+}
+
+ident_t sh_attr(pl_shader sh, struct pl_shader_va sva)
+{
+ const size_t vsize = sva.attr.fmt->texel_size;
+ uint8_t *data = sh_alloc(sh, vsize * 4, vsize);
+ for (int i = 0; i < 4; i++) {
+ memcpy(data, sva.data[i], vsize);
+ sva.data[i] = data;
+ data += vsize;
+ }
+
+ ident_t id = sh_fresh_name(sh, &sva.attr.name);
+ PL_ARRAY_APPEND(sh, sh->vas, sva);
+ return id;
+}
+
+ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc)
+{
+ pl_gpu gpu = SH_GPU(sh);
+ if (!gpu) {
+ SH_FAIL(sh, "Failed adding vertex attr '%s': No GPU available!", name);
+ return NULL_IDENT;
+ }
+
+ pl_fmt fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2);
+ if (!fmt) {
+ SH_FAIL(sh, "Failed adding vertex attr '%s': no vertex fmt!", name);
+ return NULL_IDENT;
+ }
+
+ float verts[4][2] = {
+ { rc->x0, rc->y0 },
+ { rc->x1, rc->y0 },
+ { rc->x0, rc->y1 },
+ { rc->x1, rc->y1 },
+ };
+
+ return sh_attr(sh, (struct pl_shader_va) {
+ .attr = {
+ .name = name,
+            .fmt = fmt,
+ },
+ .data = { verts[0], verts[1], verts[2], verts[3] },
+ });
+}
+
+ident_t sh_bind(pl_shader sh, pl_tex tex,
+ enum pl_tex_address_mode address_mode,
+ enum pl_tex_sample_mode sample_mode,
+ const char *name, const pl_rect2df *rect,
+ ident_t *out_pos, ident_t *out_pt)
+{
+ if (pl_tex_params_dimension(tex->params) != 2) {
+ SH_FAIL(sh, "Failed binding texture '%s': not a 2D texture!", name);
+ return NULL_IDENT;
+ }
+
+ if (!tex->params.sampleable) {
+ SH_FAIL(sh, "Failed binding texture '%s': texture not sampleable!", name);
+ return NULL_IDENT;
+ }
+
+ ident_t itex = sh_desc(sh, (struct pl_shader_desc) {
+ .desc = {
+ .name = name,
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ .binding = {
+ .object = tex,
+ .address_mode = address_mode,
+ .sample_mode = sample_mode,
+ },
+ });
+
+ float sx, sy;
+ if (tex->sampler_type == PL_SAMPLER_RECT) {
+ sx = 1.0;
+ sy = 1.0;
+ } else {
+ sx = 1.0 / tex->params.w;
+ sy = 1.0 / tex->params.h;
+ }
+
+ if (out_pos) {
+ pl_rect2df full = {
+ .x1 = tex->params.w,
+ .y1 = tex->params.h,
+ };
+
+ rect = PL_DEF(rect, &full);
+ *out_pos = sh_attr_vec2(sh, "tex_coord", &(pl_rect2df) {
+ .x0 = sx * rect->x0, .y0 = sy * rect->y0,
+ .x1 = sx * rect->x1, .y1 = sy * rect->y1,
+ });
+ }
+
+ if (out_pt) {
+ *out_pt = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tex_pt"),
+ .data = &(float[2]) {sx, sy},
+ });
+ }
+
+ return itex;
+}
+
+bool sh_buf_desc_append(void *alloc, pl_gpu gpu,
+ struct pl_shader_desc *buf_desc,
+ struct pl_var_layout *out_layout,
+ const struct pl_var new_var)
+{
+ struct pl_buffer_var bv = { .var = new_var };
+ size_t cur_size = sh_buf_desc_size(buf_desc);
+
+ switch (buf_desc->desc.type) {
+ case PL_DESC_BUF_UNIFORM:
+ bv.layout = pl_std140_layout(cur_size, &new_var);
+ if (bv.layout.offset + bv.layout.size > gpu->limits.max_ubo_size)
+ return false;
+ break;
+ case PL_DESC_BUF_STORAGE:
+ bv.layout = pl_std430_layout(cur_size, &new_var);
+ if (bv.layout.offset + bv.layout.size > gpu->limits.max_ssbo_size)
+ return false;
+ break;
+ case PL_DESC_INVALID:
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG:
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ if (out_layout)
+ *out_layout = bv.layout;
+ PL_ARRAY_APPEND_RAW(alloc, buf_desc->buffer_vars, buf_desc->num_buffer_vars, bv);
+ return true;
+}
+
+size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc)
+{
+ if (!buf_desc->num_buffer_vars)
+ return 0;
+
+ const struct pl_buffer_var *last;
+ last = &buf_desc->buffer_vars[buf_desc->num_buffer_vars - 1];
+ return last->layout.offset + last->layout.size;
+}
+
+void sh_describef(pl_shader sh, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ sh_describe(sh, pl_vasprintf(sh->info->tmp, fmt, ap));
+ va_end(ap);
+}
+
+static const char *insigs[] = {
+ [PL_SHADER_SIG_NONE] = "",
+ [PL_SHADER_SIG_COLOR] = "vec4 color",
+};
+
+static const char *outsigs[] = {
+ [PL_SHADER_SIG_NONE] = "void",
+ [PL_SHADER_SIG_COLOR] = "vec4",
+};
+
+static const char *retvals[] = {
+ [PL_SHADER_SIG_NONE] = "",
+ [PL_SHADER_SIG_COLOR] = "return color;",
+};
+
+// libplacebo currently only allows 2D samplers for shader signatures
+static const char *samplers2D[] = {
+ [PL_SAMPLER_NORMAL] = "sampler2D",
+ [PL_SAMPLER_RECT] = "sampler2DRect",
+ [PL_SAMPLER_EXTERNAL] = "samplerExternalOES",
+};
+
+ident_t sh_subpass(pl_shader sh, pl_shader sub)
+{
+ pl_assert(sh->mutable);
+
+ if (sh->prefix == sub->prefix) {
+ PL_TRACE(sh, "Can't merge shaders: conflicting identifiers!");
+ return NULL_IDENT;
+ }
+
+ // Check for shader compatibility
+ int res_w = PL_DEF(sh->output_w, sub->output_w),
+ res_h = PL_DEF(sh->output_h, sub->output_h);
+
+ if ((sub->output_w && res_w != sub->output_w) ||
+ (sub->output_h && res_h != sub->output_h))
+ {
+ PL_TRACE(sh, "Can't merge shaders: incompatible sizes: %dx%d and %dx%d",
+ sh->output_w, sh->output_h, sub->output_w, sub->output_h);
+ return NULL_IDENT;
+ }
+
+ if (sub->type == SH_COMPUTE) {
+ int subw = sub->group_size[0],
+ subh = sub->group_size[1];
+ bool flex = sub->flexible_work_groups;
+
+ if (!sh_try_compute(sh, subw, subh, flex, sub->shmem)) {
+ PL_TRACE(sh, "Can't merge shaders: incompatible block sizes or "
+ "exceeded shared memory resource capabilities");
+ return NULL_IDENT;
+ }
+ }
+
+ sh->output_w = res_w;
+ sh->output_h = res_h;
+
+ // Append the prelude and header
+ pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sub->buffers[SH_BUF_PRELUDE]);
+ pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_HEADER]);
+
+ // Append the body as a new header function
+ if (sub->input == PL_SHADER_SIG_SAMPLER) {
+ pl_assert(sub->sampler_prefix);
+ GLSLH("%s "$"(%c%s src_tex, vec2 tex_coord) {\n",
+ outsigs[sub->output], sub->name,
+ sub->sampler_prefix, samplers2D[sub->sampler_type]);
+ } else {
+ GLSLH("%s "$"(%s) {\n",
+ outsigs[sub->output], sub->name, insigs[sub->input]);
+ }
+ pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_BODY]);
+ GLSLH("%s\n}\n\n", retvals[sub->output]);
+
+ // Steal all inputs and objects from the subpass
+#define ARRAY_STEAL(arr) do \
+{ \
+ PL_ARRAY_CONCAT(sh, sh->arr, sub->arr); \
+ sub->arr.num = 0; \
+} while (0)
+
+ ARRAY_STEAL(obj);
+ ARRAY_STEAL(vas);
+ ARRAY_STEAL(vars);
+ ARRAY_STEAL(descs);
+ ARRAY_STEAL(consts);
+#undef ARRAY_STEAL
+
+ // Steal the scratch buffer (if it holds data)
+ if (sub->data.len) {
+ pl_steal(sh->tmp, sub->data.buf);
+ sub->data = (pl_str) {0};
+ }
+
+ // Steal all temporary allocations and mark the child as unusable
+ pl_steal(sh->tmp, sub->tmp);
+ sub->tmp = pl_tmp(sub);
+ sub->failed = true;
+
+ // Steal the shader steps array (and allocations)
+ pl_assert(pl_rc_count(&sub->info->rc) == 1);
+ PL_ARRAY_CONCAT(sh->info, sh->info->steps, sub->info->steps);
+ pl_steal(sh->info->tmp, sub->info->tmp);
+ sub->info->tmp = pl_tmp(sub->info);
+ sub->info->steps.num = 0; // sanity
+
+ return sub->name;
+}
+
+pl_str_builder sh_finalize_internal(pl_shader sh)
+{
+ pl_assert(sh->mutable); // this function should only ever be called once
+ if (sh->failed)
+ return NULL;
+
+ // Padding for readability
+ GLSLP("\n");
+
+ // Concatenate everything onto the prelude to form the final output
+ pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_HEADER]);
+
+ if (sh->input == PL_SHADER_SIG_SAMPLER) {
+ pl_assert(sh->sampler_prefix);
+ GLSLP("%s "$"(%c%s src_tex, vec2 tex_coord) {\n",
+ outsigs[sh->output], sh->name,
+ sh->sampler_prefix,
+ samplers2D[sh->sampler_type]);
+ } else {
+ GLSLP("%s "$"(%s) {\n", outsigs[sh->output], sh->name, insigs[sh->input]);
+ }
+
+ pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_BODY]);
+ pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_FOOTER]);
+ GLSLP("%s\n}\n\n", retvals[sh->output]);
+
+ // Generate the shader info
+ struct sh_info *info = sh->info;
+ info->info.steps = info->steps.elem;
+ info->info.num_steps = info->steps.num;
+ info->info.description = "(unknown shader)";
+
+ // Generate pretty description
+ for (int i = 0; i < info->steps.num; i++) {
+ const char *step = info->steps.elem[i];
+
+ // Prevent duplicates. We're okay using a weak equality check here
+ // because most pass descriptions are static strings.
+ for (int j = 0; j < i; j++) {
+ if (info->steps.elem[j] == step)
+ goto next_step;
+ }
+
+ int count = 1;
+ for (int j = i+1; j < info->steps.num; j++) {
+ if (info->steps.elem[j] == step)
+ count++;
+ }
+
+ const char *prefix = i > 0 ? ", " : "";
+ if (count > 1) {
+ pl_str_append_asprintf(info, &info->desc, "%s%s x%d",
+ prefix, step, count);
+ } else {
+ pl_str_append_asprintf(info, &info->desc, "%s%s", prefix, step);
+ }
+
+next_step: ;
+ }
+
+ if (info->desc.len)
+ info->info.description = (char *) info->desc.buf;
+
+ sh->mutable = false;
+ return sh->buffers[SH_BUF_PRELUDE];
+}
+
+const struct pl_shader_res *pl_shader_finalize(pl_shader sh)
+{
+ if (sh->failed) {
+ return NULL;
+ } else if (!sh->mutable) {
+ return &sh->result;
+ }
+
+ pl_shader_info info = &sh->info->info;
+ pl_str_builder glsl = sh_finalize_internal(sh);
+
+ // Turn ident_t into friendly strings before passing it to users
+#define FIX_IDENT(name) \
+ name = sh_ident_tostr(sh_ident_unpack(name))
+ for (int i = 0; i < sh->vas.num; i++)
+ FIX_IDENT(sh->vas.elem[i].attr.name);
+ for (int i = 0; i < sh->vars.num; i++)
+ FIX_IDENT(sh->vars.elem[i].var.name);
+ for (int i = 0; i < sh->consts.num; i++)
+ FIX_IDENT(sh->consts.elem[i].name);
+ for (int i = 0; i < sh->descs.num; i++) {
+ struct pl_shader_desc *sd = &sh->descs.elem[i];
+ FIX_IDENT(sd->desc.name);
+        for (int j = 0; j < sd->num_buffer_vars; j++)
+ FIX_IDENT(sd->buffer_vars[j].var.name);
+ }
+#undef FIX_IDENT
+
+ sh->result = (struct pl_shader_res) {
+ .info = info,
+ .glsl = (char *) pl_str_builder_exec(glsl).buf,
+ .name = sh_ident_tostr(sh->name),
+ .input = sh->input,
+ .output = sh->output,
+ .compute_group_size = { sh->group_size[0], sh->group_size[1] },
+ .compute_shmem = sh->shmem,
+ .vertex_attribs = sh->vas.elem,
+ .num_vertex_attribs = sh->vas.num,
+ .variables = sh->vars.elem,
+ .num_variables = sh->vars.num,
+ .descriptors = sh->descs.elem,
+ .num_descriptors = sh->descs.num,
+ .constants = sh->consts.elem,
+ .num_constants = sh->consts.num,
+ // deprecated fields
+ .params = info->params,
+ .steps = info->steps,
+ .num_steps = info->num_steps,
+ .description = info->description,
+ };
+
+ return &sh->result;
+}
+
+bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h)
+{
+ if (sh->failed) {
+ SH_FAIL(sh, "Attempting to modify a failed shader!");
+ return false;
+ }
+
+ if (!sh->mutable) {
+ SH_FAIL(sh, "Attempted to modify an immutable shader!");
+ return false;
+ }
+
+ if ((w && sh->output_w && sh->output_w != w) ||
+ (h && sh->output_h && sh->output_h != h))
+ {
+ SH_FAIL(sh, "Illegal sequence of shader operations: Incompatible "
+ "output size requirements %dx%d and %dx%d",
+ sh->output_w, sh->output_h, w, h);
+ return false;
+ }
+
+ static const char *names[] = {
+ [PL_SHADER_SIG_NONE] = "PL_SHADER_SIG_NONE",
+ [PL_SHADER_SIG_COLOR] = "PL_SHADER_SIG_COLOR",
+ };
+
+ // If we require an input, but there is none available - just get it from
+ // the user by turning it into an explicit input signature.
+ if (!sh->output && insig) {
+ pl_assert(!sh->input);
+ sh->input = insig;
+ } else if (sh->output != insig) {
+ SH_FAIL(sh, "Illegal sequence of shader operations! Current output "
+ "signature is '%s', but called operation expects '%s'!",
+ names[sh->output], names[insig]);
+ return false;
+ }
+
+ // All of our shaders end up returning a vec4 color
+ sh->output = PL_SHADER_SIG_COLOR;
+ sh->output_w = PL_DEF(sh->output_w, w);
+ sh->output_h = PL_DEF(sh->output_h, h);
+ return true;
+}
+
+static void sh_obj_deref(pl_shader_obj obj)
+{
+ if (!pl_rc_deref(&obj->rc))
+ return;
+
+ if (obj->uninit)
+ obj->uninit(obj->gpu, obj->priv);
+
+ pl_free(obj);
+}
+
+void pl_shader_obj_destroy(pl_shader_obj *ptr)
+{
+ pl_shader_obj obj = *ptr;
+ if (!obj)
+ return;
+
+ sh_obj_deref(obj);
+ *ptr = NULL;
+}
+
+void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr,
+ enum pl_shader_obj_type type, size_t priv_size,
+ void (*uninit)(pl_gpu gpu, void *priv))
+{
+ if (!ptr)
+ return NULL;
+
+ pl_shader_obj obj = *ptr;
+ if (obj && obj->gpu != SH_GPU(sh)) {
+ SH_FAIL(sh, "Passed pl_shader_obj belongs to different GPU!");
+ return NULL;
+ }
+
+ if (obj && obj->type != type) {
+ SH_FAIL(sh, "Passed pl_shader_obj of wrong type! Shader objects must "
+ "always be used with the same type of shader.");
+ return NULL;
+ }
+
+ if (!obj) {
+ obj = pl_zalloc_ptr(NULL, obj);
+ pl_rc_init(&obj->rc);
+ obj->gpu = SH_GPU(sh);
+ obj->type = type;
+ obj->priv = pl_zalloc(obj, priv_size);
+ obj->uninit = uninit;
+ }
+
+ PL_ARRAY_APPEND(sh, sh->obj, obj);
+ pl_rc_ref(&obj->rc);
+
+ *ptr = obj;
+ return obj->priv;
+}
+
+ident_t sh_prng(pl_shader sh, bool temporal, ident_t *p_state)
+{
+ ident_t randfun = sh_fresh(sh, "rand"),
+ state = sh_fresh(sh, "state");
+
+ // Based on pcg3d (http://jcgt.org/published/0009/03/02/)
+ GLSLP("#define prng_t uvec3\n");
+ GLSLH("vec3 "$"(inout uvec3 s) { \n"
+ " s = 1664525u * s + uvec3(1013904223u); \n"
+ " s.x += s.y * s.z; \n"
+ " s.y += s.z * s.x; \n"
+ " s.z += s.x * s.y; \n"
+ " s ^= s >> 16u; \n"
+ " s.x += s.y * s.z; \n"
+ " s.y += s.z * s.x; \n"
+ " s.z += s.x * s.y; \n"
+ " return vec3(s) * 1.0/float(0xFFFFFFFFu); \n"
+ "} \n",
+ randfun);
+
+ if (temporal) {
+ GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, "$"); \n",
+ state, SH_UINT_DYN(SH_PARAMS(sh).index));
+ } else {
+ GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, 0.0); \n", state);
+ }
+
+ if (p_state)
+ *p_state = state;
+
+ ident_t res = sh_fresh(sh, "RAND");
+ GLSLH("#define "$" ("$"("$"))\n", res, randfun, state);
+ return res;
+}
diff --git a/src/shaders.h b/src/shaders.h
new file mode 100644
index 0000000..7656a35
--- /dev/null
+++ b/src/shaders.h
@@ -0,0 +1,387 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <limits.h>
+
+#include "common.h"
+#include "cache.h"
+#include "log.h"
+#include "gpu.h"
+
+#include <libplacebo/shaders.h>
+
+// This represents an identifier (e.g. name of function, uniform etc.) for
+// a shader resource. Not human-readable.
+
+typedef unsigned short ident_t;
+#define $ "_%hx"
+#define NULL_IDENT 0u
+
+#define sh_mkident(id, name) ((ident_t) id)
+#define sh_ident_tostr(id) pl_asprintf(sh->tmp, $, id)
+
+enum {
+ IDENT_BITS = 8 * sizeof(ident_t),
+ IDENT_MASK = (uintptr_t) USHRT_MAX,
+ IDENT_SENTINEL = (uintptr_t) 0x20230319 << IDENT_BITS,
+};
+
+// Functions to pack/unpack an identifier into a `const char *` name field.
+// Used to defer string templating of friendly names until actually necessary
+static inline const char *sh_ident_pack(ident_t id)
+{
+ return (const char *)(uintptr_t) (IDENT_SENTINEL | id);
+}
+
+static inline ident_t sh_ident_unpack(const char *name)
+{
+ uintptr_t uname = (uintptr_t) name;
+ assert((uname & ~IDENT_MASK) == IDENT_SENTINEL);
+ return uname & IDENT_MASK;
+}
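+
+// Illustrative round trip (not a public API, names hypothetical): a name
+// stored via `sv.var.name = sh_ident_pack(id)` can later be recovered with
+// `sh_ident_unpack(sv.var.name)`; the sentinel in the upper bits lets the
+// assert catch plain C strings accidentally passed through this path.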
+
+enum pl_shader_buf {
+ SH_BUF_PRELUDE, // extra #defines etc.
+ SH_BUF_HEADER, // previous passes, helper function definitions, etc.
+ SH_BUF_BODY, // partial contents of the "current" function
+ SH_BUF_FOOTER, // will be appended to the end of the current function
+ SH_BUF_COUNT,
+};
+
+enum pl_shader_type {
+ SH_AUTO,
+ SH_COMPUTE,
+ SH_FRAGMENT
+};
+
+struct sh_info {
+ // public-facing struct
+ struct pl_shader_info_t info;
+
+ // internal fields
+ void *tmp;
+ pl_rc_t rc;
+ pl_str desc;
+ PL_ARRAY(const char *) steps;
+};
+
+struct pl_shader_t {
+ pl_log log;
+ void *tmp; // temporary allocations (freed on pl_shader_reset)
+ struct sh_info *info;
+ pl_str data; // pooled/recycled scratch buffer for small allocations
+ PL_ARRAY(pl_shader_obj) obj;
+ bool failed;
+ bool mutable;
+ ident_t name;
+ enum pl_shader_sig input, output;
+ int output_w;
+ int output_h;
+ bool transpose;
+ pl_str_builder buffers[SH_BUF_COUNT];
+ enum pl_shader_type type;
+ bool flexible_work_groups;
+ int group_size[2];
+ size_t shmem;
+ enum pl_sampler_type sampler_type;
+ char sampler_prefix;
+ unsigned short prefix; // pre-processed version of res.params.id
+ unsigned short fresh;
+
+ // Note: internally, these `pl_shader_va` etc. use raw ident_t fields
+ // instead of `const char *` wherever a name is required! These are
+ // translated to legal strings either in `pl_shader_finalize`, or inside
+ // the `pl_dispatch` shader compilation step.
+ PL_ARRAY(struct pl_shader_va) vas;
+ PL_ARRAY(struct pl_shader_var) vars;
+ PL_ARRAY(struct pl_shader_desc) descs;
+ PL_ARRAY(struct pl_shader_const) consts;
+
+ // cached result of `pl_shader_finalize`
+ struct pl_shader_res result;
+};
+
+// Free temporary resources associated with a shader. Normally called by
+// pl_shader_reset(), but used internally to reduce memory waste.
+void sh_deref(pl_shader sh);
+
+// Same as `pl_shader_finalize` but doesn't generate `sh->result`; instead it
+// returns the string builder to be used to finalize the shader. Assumes the
+// caller will access the shader's internal fields directly.
+pl_str_builder sh_finalize_internal(pl_shader sh);
+
+// Helper functions for convenience
+#define SH_PARAMS(sh) ((sh)->info->info.params)
+#define SH_GPU(sh) (SH_PARAMS(sh).gpu)
+#define SH_CACHE(sh) pl_gpu_cache(SH_GPU(sh))
+
+// Returns the GLSL version, defaulting to desktop 130.
+struct pl_glsl_version sh_glsl(const pl_shader sh);
+
+#define SH_FAIL(sh, ...) do { \
+ sh->failed = true; \
+ PL_ERR(sh, __VA_ARGS__); \
+ } while (0)
+
+// Attempt enabling compute shaders for this pass, if possible
+bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem);
+
+// Attempt merging a secondary shader into the current shader. Returns
+// NULL_IDENT if merging fails (e.g. incompatible signatures); otherwise
+// returns an identifier corresponding to the generated subpass function.
+//
+// If successful, the subpass shader is set to an undefined failure state and
+// must be explicitly reset/aborted before being re-used.
+ident_t sh_subpass(pl_shader sh, pl_shader sub);
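+
+// Illustrative usage sketch (assuming `sub` both consumes and produces a
+// color, i.e. PL_SHADER_SIG_COLOR in and out):
+//
+//   ident_t fn = sh_subpass(sh, sub);
+//   if (fn != NULL_IDENT)
+//       GLSL("color = "$"(color); \n", fn);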
+
+// Helpers for adding new variables/descriptors/etc. with fresh, unique
+// identifier names. These will never conflict with other identifiers, even
+// if the shaders are merged together.
+ident_t sh_fresh(pl_shader sh, const char *name);
+
+// Add a new shader var and return its identifier
+ident_t sh_var(pl_shader sh, struct pl_shader_var sv);
+
+// Helper functions for `sh_var`
+ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic);
+ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic);
+ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic);
+ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val);
+#define SH_INT_DYN(val) sh_var_int(sh, "const", val, true)
+#define SH_UINT_DYN(val) sh_var_uint(sh, "const", val, true)
+#define SH_FLOAT_DYN(val) sh_var_float(sh, "const", val, true)
+#define SH_MAT3(val) sh_var_mat3(sh, "mat", val)
+
+// Add a new shader desc and return its identifier.
+ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd);
+
+// Add a new shader constant and return its identifier.
+ident_t sh_const(pl_shader sh, struct pl_shader_const sc);
+
+// Helper functions for `sh_const`
+ident_t sh_const_int(pl_shader sh, const char *name, int val);
+ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val);
+ident_t sh_const_float(pl_shader sh, const char *name, float val);
+#define SH_INT(val) sh_const_int(sh, "const", val)
+#define SH_UINT(val) sh_const_uint(sh, "const", val)
+#define SH_FLOAT(val) sh_const_float(sh, "const", val)
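+
+// Typical usage sketch (illustrative; `scale` being any host-side float):
+// splice a constant into the generated GLSL via the "$" format specifier,
+//
+//   GLSL("color.rgb *= vec3("$"); \n", SH_FLOAT(scale));
+//
+// which becomes a specialization constant where the GPU supports them, a
+// dynamic uniform if `dynamic_constants` is set, or a literal `const float`
+// in the shader header otherwise.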
+
+// Add a new shader va and return its identifier
+ident_t sh_attr(pl_shader sh, struct pl_shader_va sva);
+
+// Helper to add a vec2 VA from a pl_rect2df. Returns NULL_IDENT on failure.
+ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc);
+
+// Bind a texture under a given transformation and make its attributes
+// available as well. If an output pointer for one of the attributes is left
+// as NULL, that attribute will not be added. Returns NULL_IDENT on failure.
+// `rect`
+// is optional, and defaults to the full texture if left as NULL.
+//
+// Note that for e.g. compute shaders, the vec2 out_pos might be a macro that
+// expands to an expensive computation, and should be cached by the user.
+ident_t sh_bind(pl_shader sh, pl_tex tex,
+ enum pl_tex_address_mode address_mode,
+ enum pl_tex_sample_mode sample_mode,
+ const char *name, const pl_rect2df *rect,
+ ident_t *out_pos, ident_t *out_pt);
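+
+// Illustrative usage sketch (`my_tex` hypothetical): bind a texture and
+// sample it at the interpolated position:
+//
+//   ident_t pos, pt;
+//   ident_t tex = sh_bind(sh, my_tex, PL_TEX_ADDRESS_CLAMP,
+//                         PL_TEX_SAMPLE_LINEAR, "src", NULL, &pos, &pt);
+//   if (tex != NULL_IDENT)
+//       GLSL("vec4 c = texture("$", "$"); \n", tex, pos);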
+
+// Incrementally build up a buffer by adding new variable elements to the
+// buffer, resizing buf.buffer_vars if necessary. Returns whether or not the
+// variable could be successfully added (which may fail if you try exceeding
+// the size limits of the buffer type). If successful, the layout is stored
+// in *out_layout (may be NULL).
+bool sh_buf_desc_append(void *alloc, pl_gpu gpu,
+ struct pl_shader_desc *buf_desc,
+ struct pl_var_layout *out_layout,
+ const struct pl_var new_var);
+
+size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc);
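+
+// Illustrative usage sketch (names hypothetical): append a vec4 to a UBO
+// descriptor and use the returned layout to copy the host data into place:
+//
+//   struct pl_var_layout layout;
+//   if (sh_buf_desc_append(tmp, gpu, &ubo_desc, &layout, pl_var_vec4("data")))
+//       memcpy(ubo_data + layout.offset, host_vec4, layout.size);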
+
+// Underlying function for appending text to a shader
+#define sh_append(sh, buf, ...) \
+ pl_str_builder_addf((sh)->buffers[buf], __VA_ARGS__)
+
+#define sh_append_str(sh, buf, str) \
+ pl_str_builder_str((sh)->buffers[buf], str)
+
+#define GLSLP(...) sh_append(sh, SH_BUF_PRELUDE, __VA_ARGS__)
+#define GLSLH(...) sh_append(sh, SH_BUF_HEADER, __VA_ARGS__)
+#define GLSL(...) sh_append(sh, SH_BUF_BODY, __VA_ARGS__)
+#define GLSLF(...) sh_append(sh, SH_BUF_FOOTER, __VA_ARGS__)
+
+// Attach a description to a shader
+void sh_describef(pl_shader sh, const char *fmt, ...)
+ PL_PRINTF(2, 3);
+
+static inline void sh_describe(pl_shader sh, const char *desc)
+{
+ PL_ARRAY_APPEND(sh->info, sh->info->steps, desc);
+}
+
+// Requires that the shader is mutable, has an output signature compatible
+// with the given input signature, as well as an output size compatible with
+// the given size requirements. Errors and returns false otherwise.
+bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h);
+
+// Shader resources
+
+enum pl_shader_obj_type {
+ PL_SHADER_OBJ_INVALID = 0,
+ PL_SHADER_OBJ_COLOR_MAP,
+ PL_SHADER_OBJ_SAMPLER,
+ PL_SHADER_OBJ_DITHER,
+ PL_SHADER_OBJ_LUT,
+ PL_SHADER_OBJ_AV1_GRAIN,
+ PL_SHADER_OBJ_FILM_GRAIN,
+ PL_SHADER_OBJ_RESHAPE,
+};
+
+struct pl_shader_obj_t {
+ enum pl_shader_obj_type type;
+ pl_rc_t rc;
+ pl_gpu gpu;
+ void (*uninit)(pl_gpu gpu, void *priv);
+ void *priv;
+};
+
+// Returns (*ptr)->priv, or NULL on failure
+void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr,
+ enum pl_shader_obj_type type, size_t priv_size,
+ void (*uninit)(pl_gpu gpu, void *priv));
+
+#define SH_OBJ(sh, ptr, type, t, uninit) \
+ ((t*) sh_require_obj(sh, ptr, type, sizeof(t), uninit))
+
+// Initializes a PRNG. The resulting string will directly evaluate to a
+// pseudorandom, uniformly distributed vec3 from [0.0,1.0]. Since this
+// algorithm works by mutating a state variable, if the user wants to use the
+// resulting PRNG inside a subfunction, they must add an extra `inout prng_t %s`
+// with the contents of `state` to the signature. (Optional)
+//
+// If `temporal` is set, the PRNG will vary across frames.
+ident_t sh_prng(pl_shader sh, bool temporal, ident_t *state);
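+
+// Illustrative usage sketch: add a small amount of temporally varying noise
+//
+//   ident_t prng = sh_prng(sh, true, NULL);
+//   GLSL("color.rgb += vec3(1.0/255.0) * ("$".x - 0.5); \n", prng);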
+
+// Backing memory type
+enum sh_lut_type {
+ SH_LUT_AUTO = 0, // pick whatever makes the most sense
+ SH_LUT_TEXTURE, // upload as texture
+ SH_LUT_UNIFORM, // uniform array
+ SH_LUT_LITERAL, // constant / literal array in shader source (fallback)
+};
+
+// Interpolation method
+enum sh_lut_method {
+ SH_LUT_NONE = 0, // no interpolation, integer indices
+ SH_LUT_LINEAR, // linear interpolation, vecN indices in range [0,1]
+ SH_LUT_CUBIC, // (bi/tri)cubic interpolation
+ SH_LUT_TETRAHEDRAL, // tetrahedral interpolation for vec3, equivalent to
+ // SH_LUT_LINEAR for lower dimensions
+};
+
+struct sh_lut_params {
+ pl_shader_obj *object;
+
+ // Type of the LUT we intend to generate.
+ //
+ // Note: If `var_type` is PL_VAR_*INT, `method` must be SH_LUT_NONE.
+ enum pl_var_type var_type;
+ enum sh_lut_type lut_type;
+ enum sh_lut_method method;
+
+ // For SH_LUT_TEXTURE, this can be used to override the texture's internal
+    // format, in which case it takes precedence over the default for `var_type`.
+ pl_fmt fmt;
+
+ // LUT dimensions. Unused dimensions may be left as 0.
+ int width;
+ int height;
+ int depth;
+ int comps;
+
+ // If true, the LUT will always be regenerated, even if the dimensions have
+ // not changed.
+ bool update;
+
+ // Alternate way of triggering shader invalidations. If the signature
+ // does not match the LUT's signature, it will be regenerated.
+ uint64_t signature;
+
+ // If set to true, shader objects will be preserved and updated in-place
+ // rather than being treated as read-only.
+ bool dynamic;
+
+    // If set, generated shader objects are automatically cached in this
+ // cache. Requires `signature` to be set (and uniquely identify the LUT).
+ pl_cache cache;
+
+ // Will be called with a zero-initialized buffer whenever the data needs to
+ // be computed, which happens whenever the size is changed, the shader
+ // object is invalidated, or `update` is set to true.
+ //
+    // Note: Interpretation of `data` is according to `var_type` and `fmt`.
+ void (*fill)(void *data, const struct sh_lut_params *params);
+ void *priv;
+
+ // Debug tag to track LUT source
+ pl_debug_tag debug_tag;
+};
+
+#define sh_lut_params(...) (&(struct sh_lut_params) { \
+ .debug_tag = PL_DEBUG_TAG, \
+ __VA_ARGS__ \
+ })
+
+// Makes a table of values available as a shader variable, using a given
+// method (falling back if needed). The resulting identifier can be sampled
+// directly as %s(pos), where pos is a vector with the right number of
+// dimensions. `pos` must be an integer vector within the bounds of the array,
+// unless the method is `SH_LUT_LINEAR`, in which case it's a float vector that
+// gets interpolated and clamped as needed. Returns NULL_IDENT on error.
+ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params);
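+
+// Illustrative usage sketch (object, sizes and fill callback hypothetical):
+//
+//   ident_t lut = sh_lut(sh, sh_lut_params(
+//       .object   = &lut_obj,
+//       .var_type = PL_VAR_FLOAT,
+//       .method   = SH_LUT_LINEAR,
+//       .width    = 64,
+//       .comps    = 1,
+//       .fill     = fill_curve,
+//   ));
+//   if (lut != NULL_IDENT)
+//       GLSL("color.r = "$"(color.r); \n", lut);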
+
+static inline uint8_t sh_num_comps(uint8_t mask)
+{
+ pl_assert((mask & 0xF) == mask);
+ return __builtin_popcount(mask);
+}
+
+static inline const char *sh_float_type(uint8_t mask)
+{
+ switch (sh_num_comps(mask)) {
+ case 1: return "float";
+ case 2: return "vec2";
+ case 3: return "vec3";
+ case 4: return "vec4";
+ }
+
+ pl_unreachable();
+}
+
+static inline const char *sh_swizzle(uint8_t mask)
+{
+ static const char * const swizzles[0x10] = {
+ NULL, "r", "g", "rg", "b", "rb", "gb", "rgb",
+ "a", "ra", "ga", "rga", "ba", "rba", "gba", "rgba",
+ };
+
+    pl_assert(mask < PL_ARRAY_SIZE(swizzles));
+ return swizzles[mask];
+}
diff --git a/src/shaders/colorspace.c b/src/shaders/colorspace.c
new file mode 100644
index 0000000..c7b3b5a
--- /dev/null
+++ b/src/shaders/colorspace.c
@@ -0,0 +1,2120 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "cache.h"
+#include "shaders.h"
+
+#include <libplacebo/shaders/colorspace.h>
+
+// Common constants for SMPTE ST.2084 (PQ)
+static const float PQ_M1 = 2610./4096 * 1./4,
+ PQ_M2 = 2523./4096 * 128,
+ PQ_C1 = 3424./4096,
+ PQ_C2 = 2413./4096 * 32,
+ PQ_C3 = 2392./4096 * 32;
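+
+// In terms of these constants, the (normalized) PQ EOTF used below is:
+//   Y = ( max(E'^(1/M2) - C1, 0) / (C2 - C3 * E'^(1/M2)) )^(1/M1)
+// with the result rescaled from the absolute 0..10000 cd/m^2 range to be
+// relative to PL_COLOR_SDR_WHITE.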
+
+// Common constants for ARIB STD-B67 (HLG)
+static const float HLG_A = 0.17883277,
+ HLG_B = 0.28466892,
+ HLG_C = 0.55991073,
+ HLG_REF = 1000.0 / PL_COLOR_SDR_WHITE;
+
+// Common constants for Panasonic V-Log
+static const float VLOG_B = 0.00873,
+ VLOG_C = 0.241514,
+ VLOG_D = 0.598206;
+
+// Common constants for Sony S-Log
+static const float SLOG_A = 0.432699,
+ SLOG_B = 0.037584,
+ SLOG_C = 0.616596 + 0.03,
+ SLOG_P = 3.538813,
+ SLOG_Q = 0.030001,
+ SLOG_K2 = 155.0 / 219.0;
+
+void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr,
+ enum pl_alpha_mode mode)
+{
+ if (repr->alpha == PL_ALPHA_PREMULTIPLIED && mode == PL_ALPHA_INDEPENDENT) {
+ GLSL("if (color.a > 1e-6) \n"
+ " color.rgb /= vec3(color.a); \n");
+ repr->alpha = PL_ALPHA_INDEPENDENT;
+ }
+
+ if (repr->alpha == PL_ALPHA_INDEPENDENT && mode == PL_ALPHA_PREMULTIPLIED) {
+ GLSL("color.rgb *= vec3(color.a); \n");
+ repr->alpha = PL_ALPHA_PREMULTIPLIED;
+ }
+}
+
+#ifdef PL_HAVE_DOVI
+static inline void reshape_mmr(pl_shader sh, ident_t mmr, bool single,
+ int min_order, int max_order)
+{
+ if (single) {
+ GLSL("const uint mmr_idx = 0u; \n");
+ } else {
+ GLSL("uint mmr_idx = uint(coeffs.y); \n");
+ }
+
+ assert(min_order <= max_order);
+ if (min_order < max_order)
+ GLSL("uint order = uint(coeffs.w); \n");
+
+ GLSL("vec4 sigX; \n"
+ "s = coeffs.x; \n"
+ "sigX.xyz = sig.xxy * sig.yzz; \n"
+ "sigX.w = sigX.x * sig.z; \n"
+ "s += dot("$"[mmr_idx + 0].xyz, sig); \n"
+ "s += dot("$"[mmr_idx + 1], sigX); \n",
+ mmr, mmr);
+
+ if (max_order >= 2) {
+ if (min_order < 2)
+ GLSL("if (order >= 2) { \n");
+
+ GLSL("vec3 sig2 = sig * sig; \n"
+ "vec4 sigX2 = sigX * sigX; \n"
+ "s += dot("$"[mmr_idx + 2].xyz, sig2); \n"
+ "s += dot("$"[mmr_idx + 3], sigX2); \n",
+ mmr, mmr);
+
+ if (max_order == 3) {
+ if (min_order < 3)
+                GLSL("if (order >= 3) { \n");
+
+ GLSL("s += dot("$"[mmr_idx + 4].xyz, sig2 * sig); \n"
+ "s += dot("$"[mmr_idx + 5], sigX2 * sigX); \n",
+ mmr, mmr);
+
+ if (min_order < 3)
+ GLSL("} \n");
+ }
+
+ if (min_order < 2)
+ GLSL("} \n");
+ }
+}
+
+static inline void reshape_poly(pl_shader sh)
+{
+ GLSL("s = (coeffs.z * s + coeffs.y) * s + coeffs.x; \n");
+}
+#endif
+
+void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data)
+{
+#ifdef PL_HAVE_DOVI
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0) || !data)
+ return;
+
+ sh_describe(sh, "reshaping");
+ GLSL("// pl_shader_reshape \n"
+ "{ \n"
+ "vec3 sig; \n"
+ "vec4 coeffs; \n"
+ "float s; \n"
+ "sig = clamp(color.rgb, 0.0, 1.0); \n");
+
+ float coeffs_data[8][4];
+ float mmr_packed_data[8*6][4];
+
+ for (int c = 0; c < 3; c++) {
+ const struct pl_reshape_data *comp = &data->comp[c];
+ if (!comp->num_pivots)
+ continue;
+
+ pl_assert(comp->num_pivots >= 2 && comp->num_pivots <= 9);
+ GLSL("s = sig[%d]; \n", c);
+
+ // Prepare coefficients for GPU
+ bool has_poly = false, has_mmr = false, mmr_single = true;
+ int mmr_idx = 0, min_order = 3, max_order = 1;
+ memset(coeffs_data, 0, sizeof(coeffs_data));
+ for (int i = 0; i < comp->num_pivots - 1; i++) {
+ switch (comp->method[i]) {
+ case 0: // polynomial
+ has_poly = true;
+ coeffs_data[i][3] = 0.0; // order=0 signals polynomial
+ for (int k = 0; k < 3; k++)
+ coeffs_data[i][k] = comp->poly_coeffs[i][k];
+ break;
+
+ case 1:
+ min_order = PL_MIN(min_order, comp->mmr_order[i]);
+ max_order = PL_MAX(max_order, comp->mmr_order[i]);
+ mmr_single = !has_mmr;
+ has_mmr = true;
+ coeffs_data[i][3] = (float) comp->mmr_order[i];
+ coeffs_data[i][0] = comp->mmr_constant[i];
+ coeffs_data[i][1] = (float) mmr_idx;
+ for (int j = 0; j < comp->mmr_order[i]; j++) {
+ // store weights per order as two packed vec4s
+ float *mmr = &mmr_packed_data[mmr_idx][0];
+ mmr[0] = comp->mmr_coeffs[i][j][0];
+ mmr[1] = comp->mmr_coeffs[i][j][1];
+ mmr[2] = comp->mmr_coeffs[i][j][2];
+ mmr[3] = 0.0; // unused
+ mmr[4] = comp->mmr_coeffs[i][j][3];
+ mmr[5] = comp->mmr_coeffs[i][j][4];
+ mmr[6] = comp->mmr_coeffs[i][j][5];
+ mmr[7] = comp->mmr_coeffs[i][j][6];
+ mmr_idx += 2;
+ }
+ break;
+
+ default:
+ pl_unreachable();
+ }
+ }
+
+ if (comp->num_pivots > 2) {
+
+ // Skip the (irrelevant) lower and upper bounds
+ float pivots_data[7];
+ memcpy(pivots_data, comp->pivots + 1,
+ (comp->num_pivots - 2) * sizeof(pivots_data[0]));
+
+ // Fill the remainder with a quasi-infinite sentinel pivot
+ for (int i = comp->num_pivots - 2; i < PL_ARRAY_SIZE(pivots_data); i++)
+ pivots_data[i] = 1e9f;
+
+ ident_t pivots = sh_var(sh, (struct pl_shader_var) {
+ .data = pivots_data,
+ .var = {
+ .name = "pivots",
+ .type = PL_VAR_FLOAT,
+ .dim_v = 1,
+ .dim_m = 1,
+ .dim_a = PL_ARRAY_SIZE(pivots_data),
+ },
+ });
+
+ ident_t coeffs = sh_var(sh, (struct pl_shader_var) {
+ .data = coeffs_data,
+ .var = {
+ .name = "coeffs",
+ .type = PL_VAR_FLOAT,
+ .dim_v = 4,
+ .dim_m = 1,
+ .dim_a = PL_ARRAY_SIZE(coeffs_data),
+ },
+ });
+
+ // Efficiently branch into the correct set of coefficients
+ GLSL("#define test(i) bvec4(s >= "$"[i]) \n"
+ "#define coef(i) "$"[i] \n"
+ "coeffs = mix(mix(mix(coef(0), coef(1), test(0)), \n"
+ " mix(coef(2), coef(3), test(2)), \n"
+ " test(1)), \n"
+ " mix(mix(coef(4), coef(5), test(4)), \n"
+ " mix(coef(6), coef(7), test(6)), \n"
+ " test(5)), \n"
+ " test(3)); \n"
+ "#undef test \n"
+ "#undef coef \n",
+ pivots, coeffs);
+
+ } else {
+
+ // No need for a single pivot, just set the coeffs directly
+ GLSL("coeffs = "$"; \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec4("coeffs"),
+ .data = coeffs_data,
+ }));
+
+ }
+
+ ident_t mmr = NULL_IDENT;
+ if (has_mmr) {
+ mmr = sh_var(sh, (struct pl_shader_var) {
+ .data = mmr_packed_data,
+ .var = {
+ .name = "mmr",
+ .type = PL_VAR_FLOAT,
+ .dim_v = 4,
+ .dim_m = 1,
+ .dim_a = mmr_idx,
+ },
+ });
+ }
+
+ if (has_mmr && has_poly) {
+ GLSL("if (coeffs.w == 0.0) { \n");
+ reshape_poly(sh);
+ GLSL("} else { \n");
+ reshape_mmr(sh, mmr, mmr_single, min_order, max_order);
+ GLSL("} \n");
+ } else if (has_poly) {
+ reshape_poly(sh);
+ } else {
+ assert(has_mmr);
+ GLSL("{ \n");
+ reshape_mmr(sh, mmr, mmr_single, min_order, max_order);
+ GLSL("} \n");
+ }
+
+ ident_t lo = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("lo"),
+ .data = &comp->pivots[0],
+ });
+ ident_t hi = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("hi"),
+ .data = &comp->pivots[comp->num_pivots - 1],
+ });
+ GLSL("color[%d] = clamp(s, "$", "$"); \n", c, lo, hi);
+ }
+
+ GLSL("} \n");
+#else
+ SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping");
+#endif
+}
+
+void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr,
+ const struct pl_color_adjustment *params)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ sh_describe(sh, "color decoding");
+ GLSL("// pl_shader_decode_color \n"
+ "{ \n");
+
+ // Do this first because the following operations are potentially nonlinear
+ pl_shader_set_alpha(sh, repr, PL_ALPHA_INDEPENDENT);
+
+ if (repr->sys == PL_COLOR_SYSTEM_XYZ ||
+ repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+ {
+ ident_t scale = SH_FLOAT(pl_color_repr_normalize(repr));
+ GLSL("color.rgb *= vec3("$"); \n", scale);
+ }
+
+ if (repr->sys == PL_COLOR_SYSTEM_XYZ) {
+ pl_shader_linearize(sh, &(struct pl_color_space) {
+ .transfer = PL_COLOR_TRC_ST428,
+ });
+ }
+
+ if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION)
+ pl_shader_dovi_reshape(sh, repr->dovi);
+
+ enum pl_color_system orig_sys = repr->sys;
+ pl_transform3x3 tr = pl_color_repr_decode(repr, params);
+
+ if (memcmp(&tr, &pl_transform3x3_identity, sizeof(tr))) {
+ ident_t cmat = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("cmat"),
+ .data = PL_TRANSPOSE_3X3(tr.mat.m),
+ });
+
+ ident_t cmat_c = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec3("cmat_c"),
+ .data = tr.c,
+ });
+
+ GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c);
+ }
+
+ switch (orig_sys) {
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ // Conversion for C'rcY'cC'bc via the BT.2020 CL system:
+ // C'bc = (B'-Y'c) / 1.9404 | C'bc <= 0
+ // = (B'-Y'c) / 1.5816 | C'bc > 0
+ //
+ // C'rc = (R'-Y'c) / 1.7184 | C'rc <= 0
+ // = (R'-Y'c) / 0.9936 | C'rc > 0
+ //
+ // as per the BT.2020 specification, table 4. This is a non-linear
+ // transformation because (constant) luminance receives non-equal
+ // contributions from the three different channels.
+ GLSL("// constant luminance conversion \n"
+ "color.br = color.br * mix(vec2(1.5816, 0.9936), \n"
+ " vec2(1.9404, 1.7184), \n"
+ " lessThanEqual(color.br, vec2(0.0))) \n"
+ " + color.gg; \n");
+ // Expand channels to camera-linear light. This shader currently just
+ // assumes everything uses the BT.2020 12-bit gamma function, since the
+ // difference between 10 and 12-bit is negligible for anything other
+ // than 12-bit content.
+ GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n"
+ " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n"
+ " vec3(1.0/0.45)), \n"
+ " lessThanEqual(vec3(0.08145), color.rgb)); \n");
+ // Calculate the green channel from the expanded RYcB, and recompress to G'
+ // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B
+ GLSL("color.g = (lin.g - 0.2627*lin.r - 0.0593*lin.b)*1.0/0.6780; \n"
+ "color.g = mix(color.g * 4.5, \n"
+ " 1.0993 * pow(color.g, 0.45) - 0.0993, \n"
+ " 0.0181 <= color.g); \n");
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_PQ:;
+ // Conversion process from the spec:
+ //
+ // 1. L'M'S' = cmat * ICtCp
+ // 2. LMS = linearize(L'M'S') (EOTF for PQ, inverse OETF for HLG)
+ // 3. RGB = lms2rgb * LMS
+ //
+ // After this we need to invert step 2 to arrive at non-linear RGB.
+ // (It's important we keep the transfer function conversion separate
+ // from the color system decoding, so we have to partially undo our
+ // work here even though we will end up linearizing later on anyway)
+
+ GLSL(// PQ EOTF
+ "color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n"
+ "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+ // LMS matrix
+ "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n"
+ " -2.50645, 1.98360, -0.0989137, \n"
+ " 0.06984, -0.192271, 1.12486) * color.rgb; \n"
+ // PQ OETF
+ "color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n"
+ "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+ " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n",
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ GLSL(// HLG OETF^-1
+ "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n"
+ " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " + vec3(%f), \n"
+ " lessThan(vec3(0.5), color.rgb)); \n"
+ // LMS matrix
+ "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n"
+ " -2.50645, 1.98360, -0.0989137, \n"
+ " 0.06984, -0.192271, 1.12486) * color.rgb; \n"
+ // HLG OETF
+ "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n"
+ " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n"
+ " lessThan(vec3(1.0), color.rgb)); \n",
+ HLG_C, HLG_A, HLG_B,
+ HLG_A, HLG_B, HLG_C);
+ break;
+
+ case PL_COLOR_SYSTEM_DOLBYVISION:;
+#ifdef PL_HAVE_DOVI
+ // Dolby Vision always outputs BT.2020-referred HPE LMS, so hard-code
+ // the inverse LMS->RGB matrix corresponding to this color space.
+ pl_matrix3x3 dovi_lms2rgb = {{
+ { 3.06441879, -2.16597676, 0.10155818},
+ {-0.65612108, 1.78554118, -0.12943749},
+ { 0.01736321, -0.04725154, 1.03004253},
+ }};
+
+ pl_matrix3x3_mul(&dovi_lms2rgb, &repr->dovi->linear);
+ ident_t mat = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("lms2rgb"),
+ .data = PL_TRANSPOSE_3X3(dovi_lms2rgb.m),
+ });
+
+ // PQ EOTF
+ GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n"
+ "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n",
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1);
+ // LMS matrix
+ GLSL("color.rgb = "$" * color.rgb; \n", mat);
+ // PQ OETF
+ GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n"
+ "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+ " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n",
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+ break;
+#else
+ SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping");
+ return;
+#endif
+
+ case PL_COLOR_SYSTEM_UNKNOWN:
+ case PL_COLOR_SYSTEM_RGB:
+ case PL_COLOR_SYSTEM_XYZ:
+ case PL_COLOR_SYSTEM_BT_601:
+ case PL_COLOR_SYSTEM_BT_709:
+ case PL_COLOR_SYSTEM_SMPTE_240M:
+ case PL_COLOR_SYSTEM_BT_2020_NC:
+ case PL_COLOR_SYSTEM_YCGCO:
+ break; // no special post-processing needed
+
+ case PL_COLOR_SYSTEM_COUNT:
+ pl_unreachable();
+ }
+
+ // Gamma adjustment. Doing this here (in non-linear light) is technically
+ // somewhat wrong, but this is just an aesthetic parameter and not really
+ // meant for colorimetric precision, so we don't care too much.
+ if (params && params->gamma == 0) {
+ // Avoid division by zero
+ GLSL("color.rgb = vec3(0.0); \n");
+ } else if (params && params->gamma != 1) {
+ ident_t gamma = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("gamma"),
+ .data = &(float){ 1 / params->gamma },
+ });
+ GLSL("color.rgb = pow(max(color.rgb, vec3(0.0)), vec3("$")); \n", gamma);
+ }
+
+ GLSL("}\n");
+}
+
+void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ sh_describe(sh, "color encoding");
+ GLSL("// pl_shader_encode_color \n"
+ "{ \n");
+
+ switch (repr->sys) {
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ // Expand R'G'B' to RGB
+ GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n"
+ " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n"
+ " vec3(1.0/0.45)), \n"
+ " lessThanEqual(vec3(0.08145), color.rgb)); \n");
+
+ // Compute Yc from RGB and compress to R'Y'cB'
+ GLSL("color.g = dot(vec3(0.2627, 0.6780, 0.0593), lin); \n"
+ "color.g = mix(color.g * 4.5, \n"
+ " 1.0993 * pow(color.g, 0.45) - 0.0993, \n"
+ " 0.0181 <= color.g); \n");
+
+ // Compute C'bc and C'rc into color.br
+ GLSL("color.br = color.br - color.gg; \n"
+ "color.br *= mix(vec2(1.0/1.5816, 1.0/0.9936), \n"
+ " vec2(1.0/1.9404, 1.0/1.7184), \n"
+ " lessThanEqual(color.br, vec2(0.0))); \n");
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_PQ:;
+ GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n"
+ "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+ "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n"
+ " 0.523925, 0.720459, 0.075440, \n"
+ " 0.063965, 0.112793, 0.900394) * color.rgb; \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n"
+ "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+ " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n",
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ GLSL("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n"
+ " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " + vec3(%f), \n"
+ " lessThan(vec3(0.5), color.rgb)); \n"
+ "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n"
+ " 0.523925, 0.720459, 0.075440, \n"
+ " 0.063965, 0.112793, 0.900394) * color.rgb; \n"
+ "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n"
+ " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n"
+ " lessThan(vec3(1.0), color.rgb)); \n",
+ HLG_C, HLG_A, HLG_B,
+ HLG_A, HLG_B, HLG_C);
+ break;
+
+ case PL_COLOR_SYSTEM_DOLBYVISION:
+ SH_FAIL(sh, "Cannot un-apply dolbyvision yet (no inverse reshaping)!");
+ return;
+
+ case PL_COLOR_SYSTEM_UNKNOWN:
+ case PL_COLOR_SYSTEM_RGB:
+ case PL_COLOR_SYSTEM_XYZ:
+ case PL_COLOR_SYSTEM_BT_601:
+ case PL_COLOR_SYSTEM_BT_709:
+ case PL_COLOR_SYSTEM_SMPTE_240M:
+ case PL_COLOR_SYSTEM_BT_2020_NC:
+ case PL_COLOR_SYSTEM_YCGCO:
+ break; // no special pre-processing needed
+
+ case PL_COLOR_SYSTEM_COUNT:
+ pl_unreachable();
+ }
+
+ // Since this is a relatively rare operation, bypass it as much as possible
+ bool skip = true;
+ skip &= PL_DEF(repr->sys, PL_COLOR_SYSTEM_RGB) == PL_COLOR_SYSTEM_RGB;
+ skip &= PL_DEF(repr->levels, PL_COLOR_LEVELS_FULL) == PL_COLOR_LEVELS_FULL;
+ skip &= !repr->bits.sample_depth || !repr->bits.color_depth ||
+ repr->bits.sample_depth == repr->bits.color_depth;
+ skip &= !repr->bits.bit_shift;
+
+ if (!skip) {
+ struct pl_color_repr copy = *repr;
+ ident_t xyzscale = NULL_IDENT;
+ if (repr->sys == PL_COLOR_SYSTEM_XYZ)
+ xyzscale = SH_FLOAT(1.0 / pl_color_repr_normalize(&copy));
+
+ pl_transform3x3 tr = pl_color_repr_decode(&copy, NULL);
+ pl_transform3x3_invert(&tr);
+
+ ident_t cmat = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("cmat"),
+ .data = PL_TRANSPOSE_3X3(tr.mat.m),
+ });
+
+ ident_t cmat_c = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec3("cmat_c"),
+ .data = tr.c,
+ });
+
+ GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c);
+
+ if (repr->sys == PL_COLOR_SYSTEM_XYZ) {
+ pl_shader_delinearize(sh, &(struct pl_color_space) {
+ .transfer = PL_COLOR_TRC_ST428,
+ });
+ GLSL("color.rgb *= vec3("$"); \n", xyzscale);
+ }
+ }
+
+ if (repr->alpha == PL_ALPHA_PREMULTIPLIED)
+ GLSL("color.rgb *= vec3(color.a); \n");
+
+ GLSL("}\n");
+}
+
+static ident_t sh_luma_coeffs(pl_shader sh, const struct pl_color_space *csp)
+{
+ pl_matrix3x3 rgb2xyz;
+ rgb2xyz = pl_get_rgb2xyz_matrix(pl_raw_primaries_get(csp->primaries));
+
+ // FIXME: Cannot use `const vec3` due to glslang bug #2025
+ ident_t coeffs = sh_fresh(sh, "luma_coeffs");
+ GLSLH("#define "$" vec3("$", "$", "$") \n", coeffs,
+ SH_FLOAT(rgb2xyz.m[1][0]), // RGB->Y vector
+ SH_FLOAT(rgb2xyz.m[1][1]),
+ SH_FLOAT(rgb2xyz.m[1][2]));
+ return coeffs;
+}
+
+void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ if (csp->transfer == PL_COLOR_TRC_LINEAR)
+ return;
+
+ float csp_min, csp_max;
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = csp,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_NORM,
+ .out_min = &csp_min,
+ .out_max = &csp_max,
+ ));
+
+ // Note that this clamp may technically violate the definition of
+ // ITU-R BT.2100, which allows for sub-blacks and super-whites to be
+ // displayed on the display where such would be possible. That said, the
+ // problem is that not all gamma curves are well-defined on the values
+ // outside this range, so we ignore it and just clamp anyway for sanity.
+ GLSL("// pl_shader_linearize \n"
+ "color.rgb = max(color.rgb, 0.0); \n");
+
+ switch (csp->transfer) {
+ case PL_COLOR_TRC_SRGB:
+ GLSL("color.rgb = mix(color.rgb * vec3(1.0/12.92), \n"
+ " pow((color.rgb + vec3(0.055))/vec3(1.055), \n"
+ " vec3(2.4)), \n"
+ " lessThan(vec3(0.04045), color.rgb)); \n");
+ goto scale_out;
+ case PL_COLOR_TRC_BT_1886: {
+ const float lb = powf(csp_min, 1/2.4f);
+ const float lw = powf(csp_max, 1/2.4f);
+ const float a = powf(lw - lb, 2.4f);
+ const float b = lb / (lw - lb);
+ GLSL("color.rgb = "$" * pow(color.rgb + vec3("$"), vec3(2.4)); \n",
+ SH_FLOAT(a), SH_FLOAT(b));
+ return;
+ }
+ case PL_COLOR_TRC_GAMMA18:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.8));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_GAMMA20:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.0));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_UNKNOWN:
+ case PL_COLOR_TRC_GAMMA22:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.2));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_GAMMA24:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.4));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_GAMMA26:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.6));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_GAMMA28:
+ GLSL("color.rgb = pow(color.rgb, vec3(2.8));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_PRO_PHOTO:
+ GLSL("color.rgb = mix(color.rgb * vec3(1.0/16.0), \n"
+ " pow(color.rgb, vec3(1.8)), \n"
+ " lessThan(vec3(0.03125), color.rgb)); \n");
+ goto scale_out;
+ case PL_COLOR_TRC_ST428:
+ GLSL("color.rgb = vec3(52.37/48.0) * pow(color.rgb, vec3(2.6));\n");
+ goto scale_out;
+ case PL_COLOR_TRC_PQ:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+ "color.rgb = max(color.rgb - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n"
+ // PQ's output range is 0-10000, but we need it to be relative to
+ // to PL_COLOR_SDR_WHITE instead, so rescale
+ "color.rgb *= vec3(%f); \n",
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, 10000.0 / PL_COLOR_SDR_WHITE);
+ return;
+ case PL_COLOR_TRC_HLG: {
+ const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1);
+ const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y));
+ // OETF^-1
+ GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n"
+ "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n"
+ " exp((color.rgb - vec3(%f)) * vec3(1.0/%f))\n"
+ " + vec3(%f), \n"
+ " lessThan(vec3(0.5), color.rgb)); \n",
+ SH_FLOAT(1 - b), SH_FLOAT(b),
+ HLG_C, HLG_A, HLG_B);
+ // OOTF
+ GLSL("color.rgb *= 1.0 / 12.0; \n"
+ "color.rgb *= "$" * pow(max(dot("$", color.rgb), 0.0), "$"); \n",
+ SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT(y - 1));
+ return;
+ }
+ case PL_COLOR_TRC_V_LOG:
+ GLSL("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n"
+ " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " - vec3(%f), \n"
+ " lessThanEqual(vec3(0.181), color.rgb)); \n",
+ VLOG_D, VLOG_C, VLOG_B);
+ return;
+ case PL_COLOR_TRC_S_LOG1:
+ GLSL("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " - vec3(%f); \n",
+ SLOG_C, SLOG_A, SLOG_B);
+ return;
+ case PL_COLOR_TRC_S_LOG2:
+ GLSL("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n"
+ " (pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n"
+ " - vec3(%f)) * vec3(1.0/%f), \n"
+ " lessThanEqual(vec3(%f), color.rgb)); \n",
+ SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q);
+ return;
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+
+scale_out:
+ if (csp_max != 1 || csp_min != 0) {
+ GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n",
+ SH_FLOAT(csp_max - csp_min), SH_FLOAT(csp_min));
+ }
+}
+
+void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ if (csp->transfer == PL_COLOR_TRC_LINEAR)
+ return;
+
+ float csp_min, csp_max;
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = csp,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_NORM,
+ .out_min = &csp_min,
+ .out_max = &csp_max,
+ ));
+
+ GLSL("// pl_shader_delinearize \n");
+ switch (csp->transfer) {
+ case PL_COLOR_TRC_UNKNOWN:
+ case PL_COLOR_TRC_SRGB:
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_GAMMA18:
+ case PL_COLOR_TRC_GAMMA20:
+ case PL_COLOR_TRC_GAMMA22:
+ case PL_COLOR_TRC_GAMMA24:
+ case PL_COLOR_TRC_GAMMA26:
+ case PL_COLOR_TRC_GAMMA28:
+ case PL_COLOR_TRC_PRO_PHOTO:
+ case PL_COLOR_TRC_ST428: ;
+ if (csp_max != 1 || csp_min != 0) {
+ GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n",
+ SH_FLOAT(1 / (csp_max - csp_min)),
+ SH_FLOAT(-csp_min / (csp_max - csp_min)));
+ }
+ break;
+ case PL_COLOR_TRC_BT_1886:
+ case PL_COLOR_TRC_PQ:
+ case PL_COLOR_TRC_HLG:
+ case PL_COLOR_TRC_V_LOG:
+ case PL_COLOR_TRC_S_LOG1:
+ case PL_COLOR_TRC_S_LOG2:
+ break; // scene-referred or absolute scale
+ case PL_COLOR_TRC_COUNT:
+ pl_unreachable();
+ }
+
+ GLSL("color.rgb = max(color.rgb, 0.0); \n");
+
+ switch (csp->transfer) {
+ case PL_COLOR_TRC_SRGB:
+ GLSL("color.rgb = mix(color.rgb * vec3(12.92), \n"
+ " vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) \n"
+ " - vec3(0.055), \n"
+ " lessThanEqual(vec3(0.0031308), color.rgb)); \n");
+ return;
+ case PL_COLOR_TRC_BT_1886: {
+ const float lb = powf(csp_min, 1/2.4f);
+ const float lw = powf(csp_max, 1/2.4f);
+ const float a = powf(lw - lb, 2.4f);
+ const float b = lb / (lw - lb);
+ GLSL("color.rgb = pow("$" * color.rgb, vec3(1.0/2.4)) - vec3("$"); \n",
+ SH_FLOAT(1.0 / a), SH_FLOAT(b));
+ return;
+ }
+ case PL_COLOR_TRC_GAMMA18:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/1.8));\n");
+ return;
+ case PL_COLOR_TRC_GAMMA20:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.0));\n");
+ return;
+ case PL_COLOR_TRC_UNKNOWN:
+ case PL_COLOR_TRC_GAMMA22:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.2));\n");
+ return;
+ case PL_COLOR_TRC_GAMMA24:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.4));\n");
+ return;
+ case PL_COLOR_TRC_GAMMA26:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.6));\n");
+ return;
+ case PL_COLOR_TRC_GAMMA28:
+ GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.8));\n");
+ return;
+ case PL_COLOR_TRC_ST428:
+ GLSL("color.rgb = pow(color.rgb * vec3(48.0/52.37), vec3(1.0/2.6));\n");
+ return;
+ case PL_COLOR_TRC_PRO_PHOTO:
+ GLSL("color.rgb = mix(color.rgb * vec3(16.0), \n"
+ " pow(color.rgb, vec3(1.0/1.8)), \n"
+ " lessThanEqual(vec3(0.001953), color.rgb)); \n");
+ return;
+ case PL_COLOR_TRC_PQ:
+ GLSL("color.rgb *= vec3(1.0/%f); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n"
+ "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+ " / (vec3(1.0) + vec3(%f) * color.rgb); \n"
+ "color.rgb = pow(color.rgb, vec3(%f)); \n",
+ 10000 / PL_COLOR_SDR_WHITE, PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2);
+ return;
+ case PL_COLOR_TRC_HLG: {
+ const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1);
+ const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y));
+ // OOTF^-1
+ GLSL("color.rgb *= 1.0 / "$"; \n"
+ "color.rgb *= 12.0 * max(1e-6, pow(dot("$", color.rgb), "$")); \n",
+ SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT((1 - y) / y));
+ // OETF
+ GLSL("color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n"
+ " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n"
+ " lessThan(vec3(1.0), color.rgb)); \n"
+ "color.rgb = "$" * color.rgb + vec3("$"); \n",
+ HLG_A, HLG_B, HLG_C,
+ SH_FLOAT(1 / (1 - b)), SH_FLOAT(-b / (1 - b)));
+ return;
+ }
+ case PL_COLOR_TRC_V_LOG:
+ GLSL("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n"
+ " vec3(%f) * log(color.rgb + vec3(%f)) \n"
+ " + vec3(%f), \n"
+ " lessThanEqual(vec3(0.01), color.rgb)); \n",
+ VLOG_C / M_LN10, VLOG_B, VLOG_D);
+ return;
+ case PL_COLOR_TRC_S_LOG1:
+ GLSL("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n",
+ SLOG_A / M_LN10, SLOG_B, SLOG_C);
+ return;
+ case PL_COLOR_TRC_S_LOG2:
+ GLSL("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n"
+ " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n"
+ " + vec3(%f), \n"
+ " lessThanEqual(vec3(0.0), color.rgb)); \n",
+ SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C);
+ return;
+ case PL_COLOR_TRC_LINEAR:
+ case PL_COLOR_TRC_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+const struct pl_sigmoid_params pl_sigmoid_default_params = { PL_SIGMOID_DEFAULTS };
+
+void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ params = PL_DEF(params, &pl_sigmoid_default_params);
+ float center = PL_DEF(params->center, pl_sigmoid_default_params.center);
+ float slope = PL_DEF(params->slope, pl_sigmoid_default_params.slope);
+
+ // This function needs to go through (0,0) and (1,1), so we compute the
+ // values at 1 and 0, and then scale/shift them, respectively.
+ float offset = 1.0 / (1 + expf(slope * center));
+ float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset;
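+    // The resulting curve is the inverse of the logistic f(x) =
+    // 1 / (1 + exp(slope * (center - x))), rescaled so that f(0) maps to 0
+    // and f(1) maps to 1; pl_shader_unsigmoidize applies the forward curve.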
+
+ GLSL("// pl_shader_sigmoidize \n"
+ "color = clamp(color, 0.0, 1.0); \n"
+ "color = vec4("$") - vec4("$") * \n"
+ " log(vec4(1.0) / (color * vec4("$") + vec4("$")) \n"
+ " - vec4(1.0)); \n",
+ SH_FLOAT(center), SH_FLOAT(1.0 / slope),
+ SH_FLOAT(scale), SH_FLOAT(offset));
+}
+
+void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ // See: pl_shader_sigmoidize
+ params = PL_DEF(params, &pl_sigmoid_default_params);
+ float center = PL_DEF(params->center, pl_sigmoid_default_params.center);
+ float slope = PL_DEF(params->slope, pl_sigmoid_default_params.slope);
+ float offset = 1.0 / (1 + expf(slope * center));
+ float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset;
+
+ GLSL("// pl_shader_unsigmoidize \n"
+ "color = clamp(color, 0.0, 1.0); \n"
+ "color = vec4("$") / \n"
+ " (vec4(1.0) + exp(vec4("$") * (vec4("$") - color))) \n"
+ " - vec4("$"); \n",
+ SH_FLOAT(1.0 / scale),
+ SH_FLOAT(slope), SH_FLOAT(center),
+ SH_FLOAT(offset / scale));
+}
+
+const struct pl_peak_detect_params pl_peak_detect_default_params = { PL_PEAK_DETECT_DEFAULTS };
+const struct pl_peak_detect_params pl_peak_detect_high_quality_params = { PL_PEAK_DETECT_HQ_DEFAULTS };
+
+static bool peak_detect_params_eq(const struct pl_peak_detect_params *a,
+ const struct pl_peak_detect_params *b)
+{
+ return a->smoothing_period == b->smoothing_period &&
+ a->scene_threshold_low == b->scene_threshold_low &&
+ a->scene_threshold_high == b->scene_threshold_high &&
+ a->percentile == b->percentile;
+ // don't compare `allow_delayed` because it doesn't change measurement
+}
+
+enum {
+ // Split the peak buffer into several independent slices to reduce pressure
+ // on global atomics
+ SLICES = 12,
+
+ // How many bits to use for storing PQ data. Be careful when setting this
+ // too high, as it may overflow `unsigned int` on large video sources.
+ //
+ // The value chosen is enough to guarantee no overflow for an 8K x 4K frame
+ // consisting entirely of 100% 10k nits PQ values, with 16x16 workgroups.
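+    //
+    // (For reference: ~8192x4320 pixels with 16x16 work groups gives roughly
+    // 1.4e5 work groups, each contributing at most PQ_MAX (~1.6e4) to the
+    // sums, for a worst case of about 2.3e9 -- still below UINT_MAX.)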
+ PQ_BITS = 14,
+ PQ_MAX = (1 << PQ_BITS) - 1,
+
+ // How many bits to use for the histogram. We bias the histogram down
+ // by half the PQ range (~90 nits), effectively clumping the SDR part
+ // of the image into a single histogram bin.
+ HIST_BITS = 7,
+ HIST_BIAS = 1 << (HIST_BITS - 1),
+ HIST_BINS = (1 << HIST_BITS) - HIST_BIAS,
+
+ // Convert from histogram bin to (starting) PQ value
+#define HIST_PQ(bin) (((bin) + HIST_BIAS) << (PQ_BITS - HIST_BITS))
+};
+
+pl_static_assert(PQ_BITS >= HIST_BITS);
+
+struct peak_buf_data {
+ unsigned frame_wg_count[SLICES]; // number of work groups processed
+ unsigned frame_wg_active[SLICES];// number of active (nonzero) work groups
+ unsigned frame_sum_pq[SLICES]; // sum of PQ Y values over all WGs (PQ_BITS)
+ unsigned frame_max_pq[SLICES]; // maximum PQ Y value among these WGs (PQ_BITS)
+ unsigned frame_hist[SLICES][HIST_BINS]; // always allocated, conditionally used
+};
+
+static const struct pl_buffer_var peak_buf_vars[] = {
+#define VAR(field) { \
+ .var = { \
+ .name = #field, \
+ .type = PL_VAR_UINT, \
+ .dim_v = 1, \
+ .dim_m = 1, \
+ .dim_a = sizeof(((struct peak_buf_data *) NULL)->field) / \
+ sizeof(unsigned), \
+ }, \
+ .layout = { \
+ .offset = offsetof(struct peak_buf_data, field), \
+ .size = sizeof(((struct peak_buf_data *) NULL)->field), \
+ .stride = sizeof(unsigned), \
+ }, \
+}
+ VAR(frame_wg_count),
+ VAR(frame_wg_active),
+ VAR(frame_sum_pq),
+ VAR(frame_max_pq),
+ VAR(frame_hist),
+#undef VAR
+};
+
+struct sh_color_map_obj {
+ // Tone map state
+ struct {
+ struct pl_tone_map_params params;
+ pl_shader_obj lut;
+ } tone;
+
+ // Gamut map state
+ struct {
+ pl_shader_obj lut;
+ } gamut;
+
+ // Peak detection state
+ struct {
+ struct pl_peak_detect_params params; // currently active parameters
+ pl_buf buf; // pending peak detection buffer
+ pl_buf readback; // readback buffer (fallback)
+ float avg_pq; // current (smoothed) values
+ float max_pq;
+ } peak;
+};
+
+// Excluding size, since this is checked by sh_lut
+static uint64_t gamut_map_signature(const struct pl_gamut_map_params *par)
+{
+ uint64_t sig = CACHE_KEY_GAMUT_LUT;
+ pl_hash_merge(&sig, pl_str0_hash(par->function->name));
+ pl_hash_merge(&sig, pl_var_hash(par->input_gamut));
+ pl_hash_merge(&sig, pl_var_hash(par->output_gamut));
+ pl_hash_merge(&sig, pl_var_hash(par->min_luma));
+ pl_hash_merge(&sig, pl_var_hash(par->max_luma));
+ pl_hash_merge(&sig, pl_var_hash(par->constants));
+ return sig;
+}
+
+static void sh_color_map_uninit(pl_gpu gpu, void *ptr)
+{
+ struct sh_color_map_obj *obj = ptr;
+ pl_shader_obj_destroy(&obj->tone.lut);
+ pl_shader_obj_destroy(&obj->gamut.lut);
+ pl_buf_destroy(gpu, &obj->peak.buf);
+ pl_buf_destroy(gpu, &obj->peak.readback);
+ memset(obj, 0, sizeof(*obj));
+}
+
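+// Exponential moving average coefficient for a given smoothing period,
+// measured in (fractional) frames; a period of 0 disables smoothing.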
+static inline float iir_coeff(float rate)
+{
+ if (!rate)
+ return 1.0f;
+ return 1.0f - expf(-1.0f / rate);
+}
+
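+// Measure the frame's peak brightness (in normalized PQ), either as the true
+// maximum or, if a percentile in (0, 100) was requested and histogram data is
+// available, as the luminance at that percentile of the histogram.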
+static float measure_peak(const struct peak_buf_data *data, float percentile)
+{
+ unsigned frame_max_pq = data->frame_max_pq[0];
+ for (int k = 1; k < SLICES; k++)
+ frame_max_pq = PL_MAX(frame_max_pq, data->frame_max_pq[k]);
+ const float frame_max = (float) frame_max_pq / PQ_MAX;
+ if (percentile <= 0 || percentile >= 100)
+ return frame_max;
+ unsigned total_pixels = 0;
+ for (int k = 0; k < SLICES; k++) {
+ for (int i = 0; i < HIST_BINS; i++)
+ total_pixels += data->frame_hist[k][i];
+ }
+ if (!total_pixels) // no histogram data available?
+ return frame_max;
+
+ const unsigned target_pixel = ceilf(percentile / 100.0f * total_pixels);
+ if (target_pixel >= total_pixels)
+ return frame_max;
+
+ unsigned sum = 0;
+ for (int i = 0; i < HIST_BINS; i++) {
+ unsigned next = sum;
+ for (int k = 0; k < SLICES; k++)
+ next += data->frame_hist[k][i];
+ if (next < target_pixel) {
+ sum = next;
+ continue;
+ }
+
+ // Upper and lower frequency boundaries of the matching histogram bin
+ const unsigned count_low = sum; // last pixel of previous bin
+ const unsigned count_high = next + 1; // first pixel of next bin
+ pl_assert(count_low < target_pixel && target_pixel < count_high);
+
+ // PQ luminance associated with count_low/high respectively
+ const float pq_low = (float) HIST_PQ(i) / PQ_MAX;
+ float pq_high = (float) HIST_PQ(i + 1) / PQ_MAX;
+ if (count_high > total_pixels) // special case for last histogram bin
+ pq_high = frame_max;
+
+ // Position of `target_pixel` inside this bin, assumes pixels are
+ // equidistributed inside a histogram bin
+ const float ratio = (float) (target_pixel - count_low) /
+ (count_high - count_low);
+ return PL_MIX(pq_low, pq_high, ratio);
+ }
+
+ pl_unreachable();
+}
+
+// if `force` is true, ensures the buffer is read, even if `allow_delayed`
+static void update_peak_buf(pl_gpu gpu, struct sh_color_map_obj *obj, bool force)
+{
+ const struct pl_peak_detect_params *params = &obj->peak.params;
+ if (!obj->peak.buf)
+ return;
+
+ if (!force && params->allow_delayed && pl_buf_poll(gpu, obj->peak.buf, 0))
+ return; // buffer not ready yet
+
+ bool ok;
+ struct peak_buf_data data = {0};
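+    // If the SSBO itself could not be created host-readable, copy through
+    // the separate host-readable fallback buffer instead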
+ if (obj->peak.readback) {
+ pl_buf_copy(gpu, obj->peak.readback, 0, obj->peak.buf, 0, sizeof(data));
+ ok = pl_buf_read(gpu, obj->peak.readback, 0, &data, sizeof(data));
+ } else {
+ ok = pl_buf_read(gpu, obj->peak.buf, 0, &data, sizeof(data));
+ }
+ if (ok && data.frame_wg_count[0] > 0) {
+ // Peak detection completed successfully
+ pl_buf_destroy(gpu, &obj->peak.buf);
+ } else {
+ // No data read? Possibly this peak obj has not been executed yet
+ if (!ok) {
+ PL_ERR(gpu, "Failed reading peak detection buffer!");
+ } else if (params->allow_delayed) {
+            PL_TRACE(gpu, "Peak detection buffer not yet ready, ignoring...");
+ } else {
+ PL_WARN(gpu, "Peak detection usage error: attempted detecting peak "
+ "and using detected peak in the same shader program, "
+ "but `params->allow_delayed` is false! Ignoring, but "
+ "expect incorrect output.");
+ }
+ if (force || !ok)
+ pl_buf_destroy(gpu, &obj->peak.buf);
+ return;
+ }
+
+ uint64_t frame_sum_pq = 0u, frame_wg_count = 0u, frame_wg_active = 0u;
+ for (int k = 0; k < SLICES; k++) {
+ frame_sum_pq += data.frame_sum_pq[k];
+ frame_wg_count += data.frame_wg_count[k];
+ frame_wg_active += data.frame_wg_active[k];
+ }
+ float avg_pq, max_pq;
+ if (frame_wg_active) {
+ avg_pq = (float) frame_sum_pq / (frame_wg_active * PQ_MAX);
+ max_pq = measure_peak(&data, params->percentile);
+ } else {
+ // Solid black frame
+ avg_pq = max_pq = PL_COLOR_HDR_BLACK;
+ }
+
+ if (!obj->peak.avg_pq) {
+        // No previous measurement yet; initialize the smoothed values directly
+ obj->peak.avg_pq = avg_pq;
+ obj->peak.max_pq = max_pq;
+ } else {
+ // Ignore small deviations from existing peak (rounding error)
+ static const float epsilon = 1.0f / PQ_MAX;
+ if (fabsf(avg_pq - obj->peak.avg_pq) < epsilon)
+ avg_pq = obj->peak.avg_pq;
+ if (fabsf(max_pq - obj->peak.max_pq) < epsilon)
+ max_pq = obj->peak.max_pq;
+ }
+
+ // Use an IIR low-pass filter to smooth out the detected values
+ const float coeff = iir_coeff(params->smoothing_period);
+ obj->peak.avg_pq += coeff * (avg_pq - obj->peak.avg_pq);
+ obj->peak.max_pq += coeff * (max_pq - obj->peak.max_pq);
+
+ // Scene change hysteresis
+ if (params->scene_threshold_low > 0 && params->scene_threshold_high > 0) {
+        const float log10_pq = 1e-2f; // experimentally determined approximation
+ const float thresh_low = params->scene_threshold_low * log10_pq;
+ const float thresh_high = params->scene_threshold_high * log10_pq;
+ const float bias = (float) frame_wg_active / frame_wg_count;
+ const float delta = bias * fabsf(avg_pq - obj->peak.avg_pq);
+ const float mix_coeff = pl_smoothstep(thresh_low, thresh_high, delta);
+ obj->peak.avg_pq = PL_MIX(obj->peak.avg_pq, avg_pq, mix_coeff);
+ obj->peak.max_pq = PL_MIX(obj->peak.max_pq, max_pq, mix_coeff);
+ }
+}
+
+bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp,
+ pl_shader_obj *state,
+ const struct pl_peak_detect_params *params)
+{
+ params = PL_DEF(params, &pl_peak_detect_default_params);
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return false;
+
+ pl_gpu gpu = SH_GPU(sh);
+ if (!gpu || gpu->limits.max_ssbo_size < sizeof(struct peak_buf_data)) {
+ PL_ERR(sh, "HDR peak detection requires a GPU with support for at "
+ "least %zu bytes of SSBO data (supported: %zu)",
+ sizeof(struct peak_buf_data), gpu ? gpu->limits.max_ssbo_size : 0);
+ return false;
+ }
+
+ const bool use_histogram = params->percentile > 0 && params->percentile < 100;
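+    // Shared memory: one uint each for the work group's running sum, maximum
+    // and black pixel count, plus the histogram bins (if enabled below)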
+ size_t shmem_req = 3 * sizeof(uint32_t);
+ if (use_histogram)
+ shmem_req += sizeof(uint32_t[HIST_BINS]);
+
+ if (!sh_try_compute(sh, 16, 16, true, shmem_req)) {
+ PL_ERR(sh, "HDR peak detection requires compute shaders with support "
+ "for at least %zu bytes of shared memory! (avail: %zu)",
+ shmem_req, sh_glsl(sh).max_shmem_size);
+ return false;
+ }
+
+ struct sh_color_map_obj *obj;
+ obj = SH_OBJ(sh, state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj,
+ sh_color_map_uninit);
+ if (!obj)
+ return false;
+
+ if (peak_detect_params_eq(&obj->peak.params, params)) {
+        update_peak_buf(gpu, obj, true); // avoid overwriting previous frame's results
+ } else {
+ pl_reset_detected_peak(*state);
+ }
+
+ pl_assert(!obj->peak.buf);
+ static const struct peak_buf_data zero = {0};
+
+retry_ssbo:
+ if (obj->peak.readback) {
+ obj->peak.buf = pl_buf_create(gpu, pl_buf_params(
+ .size = sizeof(struct peak_buf_data),
+ .storable = true,
+ .initial_data = &zero,
+ ));
+ } else {
+ obj->peak.buf = pl_buf_create(gpu, pl_buf_params(
+ .size = sizeof(struct peak_buf_data),
+ .memory_type = PL_BUF_MEM_DEVICE,
+ .host_readable = true,
+ .storable = true,
+ .initial_data = &zero,
+ ));
+ }
+
+ if (!obj->peak.buf && !obj->peak.readback) {
+ PL_WARN(sh, "Failed creating host-readable peak detection SSBO, "
+ "retrying with fallback buffer");
+ obj->peak.readback = pl_buf_create(gpu, pl_buf_params(
+ .size = sizeof(struct peak_buf_data),
+ .host_readable = true,
+ ));
+ if (obj->peak.readback)
+ goto retry_ssbo;
+ }
+
+ if (!obj->peak.buf) {
+ SH_FAIL(sh, "Failed creating peak detection SSBO!");
+ return false;
+ }
+
+ obj->peak.params = *params;
+
+ sh_desc(sh, (struct pl_shader_desc) {
+ .desc = {
+ .name = "PeakBuf",
+ .type = PL_DESC_BUF_STORAGE,
+ .access = PL_DESC_ACCESS_READWRITE,
+ },
+ .binding.object = obj->peak.buf,
+ .buffer_vars = (struct pl_buffer_var *) peak_buf_vars,
+ .num_buffer_vars = PL_ARRAY_SIZE(peak_buf_vars),
+ });
+
+ sh_describe(sh, "peak detection");
+ GLSL("// pl_shader_detect_peak \n"
+ "{ \n"
+ "const uint wg_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y; \n"
+ "uint wg_idx = gl_WorkGroupID.y * gl_NumWorkGroups.x + \n"
+ " gl_WorkGroupID.x; \n"
+ "uint slice = wg_idx %% %du; \n"
+ "vec4 color_orig = color; \n",
+ SLICES);
+
+ // For performance, we want to do as few atomic operations on global
+ // memory as possible, so use an atomic in shmem for the work group.
+ ident_t wg_sum = sh_fresh(sh, "wg_sum"),
+ wg_max = sh_fresh(sh, "wg_max"),
+ wg_black = sh_fresh(sh, "wg_black"),
+ wg_hist = NULL_IDENT;
+ GLSLH("shared uint "$", "$", "$"; \n", wg_sum, wg_max, wg_black);
+ if (use_histogram) {
+ wg_hist = sh_fresh(sh, "wg_hist");
+ GLSLH("shared uint "$"[%u]; \n", wg_hist, HIST_BINS);
+ GLSL("for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n"
+ " "$"[i] = 0u; \n",
+ HIST_BINS, wg_hist);
+ }
+ GLSL($" = 0u; "$" = 0u; "$" = 0u; \n"
+ "barrier(); \n",
+ wg_sum, wg_max, wg_black);
+
+ // Decode color into linear light representation
+ pl_color_space_infer(&csp);
+ pl_shader_linearize(sh, &csp);
+
+ // Measure luminance as N-bit PQ
+ GLSL("float luma = dot("$", color.rgb); \n"
+ "luma *= %f; \n"
+ "luma = pow(clamp(luma, 0.0, 1.0), %f); \n"
+ "luma = (%f + %f * luma) / (1.0 + %f * luma); \n"
+ "luma = pow(luma, %f); \n"
+ "luma *= smoothstep(0.0, 1e-2, luma); \n"
+ "uint y_pq = uint(%d.0 * luma); \n",
+ sh_luma_coeffs(sh, &csp),
+ PL_COLOR_SDR_WHITE / 10000.0,
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+ PQ_MAX);
+
+ // Update the work group's shared atomics
+ bool has_subgroups = sh_glsl(sh).subgroup_size > 0;
+ if (use_histogram) {
+ GLSL("int bin = (int(y_pq) >> %d) - %d; \n"
+ "bin = clamp(bin, 0, %d); \n",
+ PQ_BITS - HIST_BITS, HIST_BIAS,
+ HIST_BINS - 1);
+ if (has_subgroups) {
+ // Optimize for the very common case of identical histogram bins
+ GLSL("if (subgroupAllEqual(bin)) { \n"
+ " if (subgroupElect()) \n"
+ " atomicAdd("$"[bin], gl_SubgroupSize); \n"
+ "} else { \n"
+ " atomicAdd("$"[bin], 1u); \n"
+ "} \n",
+ wg_hist, wg_hist);
+ } else {
+ GLSL("atomicAdd("$"[bin], 1u); \n", wg_hist);
+ }
+ }
+
+ if (has_subgroups) {
+ GLSL("uint group_sum = subgroupAdd(y_pq); \n"
+ "uint group_max = subgroupMax(y_pq); \n"
+ "uvec4 b = subgroupBallot(y_pq == 0u); \n"
+ "if (subgroupElect()) { \n"
+ " atomicAdd("$", group_sum); \n"
+ " atomicMax("$", group_max); \n"
+ " atomicAdd("$", subgroupBallotBitCount(b));\n"
+ "} \n"
+ "barrier(); \n",
+ wg_sum, wg_max, wg_black);
+ } else {
+ GLSL("atomicAdd("$", y_pq); \n"
+ "atomicMax("$", y_pq); \n"
+ "if (y_pq == 0u) \n"
+ " atomicAdd("$", 1u); \n"
+ "barrier(); \n",
+ wg_sum, wg_max, wg_black);
+ }
+
+ if (use_histogram) {
+ GLSL("if (gl_LocalInvocationIndex == 0u) \n"
+ " "$"[0] -= "$"; \n"
+ "for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n"
+ " atomicAdd(frame_hist[slice * %du + i], "$"[i]); \n",
+ wg_hist, wg_black,
+ HIST_BINS,
+ HIST_BINS, wg_hist);
+ }
+
+ // Have one thread per work group update the global atomics
+ GLSL("if (gl_LocalInvocationIndex == 0u) { \n"
+ " uint num = wg_size - "$"; \n"
+ " atomicAdd(frame_wg_count[slice], 1u); \n"
+ " atomicAdd(frame_wg_active[slice], min(num, 1u)); \n"
+ " if (num > 0u) { \n"
+ " atomicAdd(frame_sum_pq[slice], "$" / num); \n"
+ " atomicMax(frame_max_pq[slice], "$"); \n"
+ " } \n"
+ "} \n"
+ "color = color_orig; \n"
+ "} \n",
+ wg_black, wg_sum, wg_max);
+
+ return true;
+}
+
+bool pl_get_detected_hdr_metadata(const pl_shader_obj state,
+ struct pl_hdr_metadata *out)
+{
+ if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP)
+ return false;
+
+ struct sh_color_map_obj *obj = state->priv;
+ update_peak_buf(state->gpu, obj, false);
+ if (!obj->peak.avg_pq)
+ return false;
+
+ out->max_pq_y = obj->peak.max_pq;
+ out->avg_pq_y = obj->peak.avg_pq;
+ return true;
+}
+
+bool pl_get_detected_peak(const pl_shader_obj state,
+ float *out_peak, float *out_avg)
+{
+ struct pl_hdr_metadata data;
+ if (!pl_get_detected_hdr_metadata(state, &data))
+ return false;
+
+    // Preserve old behavior: convert from PQ back to the normalized linear scale
+ *out_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, data.max_pq_y);
+ *out_avg = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, data.avg_pq_y);
+ return true;
+}
+
+void pl_reset_detected_peak(pl_shader_obj state)
+{
+ if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP)
+ return;
+
+ struct sh_color_map_obj *obj = state->priv;
+ pl_buf readback = obj->peak.readback;
+ pl_buf_destroy(state->gpu, &obj->peak.buf);
+ memset(&obj->peak, 0, sizeof(obj->peak));
+ obj->peak.readback = readback;
+}
+
+void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ sh_describe(sh, "feature extraction");
+ pl_shader_linearize(sh, &csp);
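+    // Reduce the color to the PQ-encoded intensity (I) component of IPT,
+    // stored in the first output channel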
+ GLSL("// pl_shader_extract_features \n"
+ "{ \n"
+ "vec3 lms = %f * "$" * color.rgb; \n"
+ "lms = pow(max(lms, 0.0), vec3(%f)); \n"
+ "lms = (vec3(%f) + %f * lms) \n"
+ " / (vec3(1.0) + %f * lms); \n"
+ "lms = pow(lms, vec3(%f)); \n"
+ "float I = dot(vec3(%f, %f, %f), lms); \n"
+ "color = vec4(I, 0.0, 0.0, 1.0); \n"
+ "} \n",
+ PL_COLOR_SDR_WHITE / 10000,
+ SH_MAT3(pl_ipt_rgb2lms(pl_raw_primaries_get(csp.primaries))),
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+ pl_ipt_lms2ipt.m[0][0], pl_ipt_lms2ipt.m[0][1], pl_ipt_lms2ipt.m[0][2]);
+}
+
+const struct pl_color_map_params pl_color_map_default_params = { PL_COLOR_MAP_DEFAULTS };
+const struct pl_color_map_params pl_color_map_high_quality_params = { PL_COLOR_MAP_HQ_DEFAULTS };
+
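+// Returns a vertex attribute holding the fragment position relative to the
+// visualization rect `rc`, spanning 0..1 inside the rect (y axis flipped)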
+static ident_t rect_pos(pl_shader sh, pl_rect2df rc)
+{
+ if (!rc.x0 && !rc.x1)
+ rc.x1 = 1.0f;
+ if (!rc.y0 && !rc.y1)
+ rc.y1 = 1.0f;
+
+ return sh_attr_vec2(sh, "tone_map_coords", &(pl_rect2df) {
+ .x0 = -rc.x0 / (rc.x1 - rc.x0),
+ .x1 = (1.0f - rc.x0) / (rc.x1 - rc.x0),
+ .y0 = -rc.y1 / (rc.y0 - rc.y1),
+ .y1 = (1.0f - rc.y1) / (rc.y0 - rc.y1),
+ });
+}
+
+static void visualize_tone_map(pl_shader sh, pl_rect2df rc, float alpha,
+ const struct pl_tone_map_params *params)
+{
+ pl_assert(params->input_scaling == PL_HDR_PQ);
+ pl_assert(params->output_scaling == PL_HDR_PQ);
+
+ GLSL("// Visualize tone mapping \n"
+ "{ \n"
+ "vec2 pos = "$"; \n"
+ "if (min(pos.x, pos.y) >= 0.0 && \n" // visualizer rect
+ " max(pos.x, pos.y) <= 1.0) \n"
+ "{ \n"
+ "float xmin = "$"; \n"
+ "float xmax = "$"; \n"
+ "float xavg = "$"; \n"
+ "float ymin = "$"; \n"
+ "float ymax = "$"; \n"
+ "float alpha = 0.8 * "$"; \n"
+ "vec3 viz = color.rgb; \n"
+ "float vv = tone_map(pos.x); \n"
+ // Color based on region
+ "if (pos.x < xmin || pos.x > xmax) { \n" // outside source
+ "} else if (pos.y < ymin || pos.y > ymax) {\n" // outside target
+ " if (pos.y < xmin || pos.y > xmax) { \n" // and also source
+ " viz = vec3(0.1, 0.1, 0.5); \n"
+ " } else { \n"
+ " viz = vec3(0.2, 0.05, 0.05); \n" // but inside source
+ " } \n"
+ "} else { \n" // inside domain
+ " if (abs(pos.x - pos.y) < 1e-3) { \n" // main diagonal
+ " viz = vec3(0.2); \n"
+ " } else if (pos.y < vv) { \n" // inside function
+ " alpha *= 0.6; \n"
+ " viz = vec3(0.05); \n"
+ " if (vv > pos.x && pos.y > pos.x) \n" // output brighter than input
+ " viz.rg = vec2(0.5, 0.7); \n"
+ " } else { \n" // outside function
+ " if (vv < pos.x && pos.y < pos.x) \n" // output darker than input
+ " viz = vec3(0.0, 0.1, 0.2); \n"
+ " } \n"
+ " if (pos.y > xmax) { \n" // inverse tone-mapping region
+ " vec3 hi = vec3(0.2, 0.5, 0.8); \n"
+ " viz = mix(viz, hi, 0.5); \n"
+ " } else if (pos.y < xmin) { \n" // black point region
+ " viz = mix(viz, vec3(0.0), 0.3); \n"
+ " } \n"
+ " if (xavg > 0.0 && abs(pos.x - xavg) < 1e-3)\n" // source avg brightness
+ " viz = vec3(0.5); \n"
+ "} \n"
+ "color.rgb = mix(color.rgb, viz, alpha); \n"
+ "} \n"
+ "} \n",
+ rect_pos(sh, rc),
+ SH_FLOAT_DYN(params->input_min),
+ SH_FLOAT_DYN(params->input_max),
+ SH_FLOAT_DYN(params->input_avg),
+ SH_FLOAT(params->output_min),
+ SH_FLOAT_DYN(params->output_max),
+ SH_FLOAT_DYN(alpha));
+}
+
+static void visualize_gamut_map(pl_shader sh, pl_rect2df rc,
+ ident_t lut, float hue, float theta,
+ const struct pl_gamut_map_params *params)
+{
+ ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms);
+ ident_t lms2rgb_src = SH_MAT3(pl_ipt_lms2rgb(&params->input_gamut));
+ ident_t lms2rgb_dst = SH_MAT3(pl_ipt_lms2rgb(&params->output_gamut));
+
+ GLSL("// Visualize gamut mapping \n"
+ "vec2 pos = "$"; \n"
+ "float pqmin = "$"; \n"
+ "float pqmax = "$"; \n"
+ "float rgbmin = "$"; \n"
+ "float rgbmax = "$"; \n"
+ "vec3 orig = ipt; \n"
+ "if (min(pos.x, pos.y) >= 0.0 && \n"
+ " max(pos.x, pos.y) <= 1.0) \n"
+ "{ \n"
+ // Source color to visualize
+ "float mid = mix(pqmin, pqmax, 0.6); \n"
+ "vec3 base = vec3(0.5, 0.0, 0.0); \n"
+ "float hue = "$", theta = "$"; \n"
+ "base.x = mix(base.x, mid, sin(theta)); \n"
+ "mat3 rot1 = mat3(1.0, 0.0, 0.0, \n"
+ " 0.0, cos(hue), sin(hue), \n"
+ " 0.0, -sin(hue), cos(hue)); \n"
+ "mat3 rot2 = mat3( cos(theta), 0.0, sin(theta), \n"
+ " 0.0, 1.0, 0.0, \n"
+ " -sin(theta), 0.0, cos(theta)); \n"
+ "vec3 dir = vec3(pos.yx - vec2(0.5), 0.0); \n"
+ "ipt = base + rot1 * rot2 * dir; \n"
+ // Convert back to RGB (for gamut boundary testing)
+ "lmspq = "$" * ipt; \n"
+ "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n"
+ "lms = max(lms - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - %f * lms); \n"
+ "lms = pow(lms, vec3(1.0/%f)); \n"
+ "lms *= %f; \n"
+ // Check against src/dst gamut boundaries
+ "vec3 rgbsrc = "$" * lms; \n"
+ "vec3 rgbdst = "$" * lms; \n"
+ "bool insrc, indst; \n"
+ "insrc = all(lessThan(rgbsrc, vec3(rgbmax))) && \n"
+ " all(greaterThan(rgbsrc, vec3(rgbmin))); \n"
+ "indst = all(lessThan(rgbdst, vec3(rgbmax))) && \n"
+ " all(greaterThan(rgbdst, vec3(rgbmin))); \n"
+ // Sample from gamut mapping 3DLUT
+ "idx.x = (ipt.x - pqmin) / (pqmax - pqmin); \n"
+ "idx.y = 2.0 * length(ipt.yz); \n"
+ "idx.z = %f * atan(ipt.z, ipt.y) + 0.5; \n"
+ "vec3 mapped = "$"(idx).xyz; \n"
+ "mapped.yz -= vec2(32768.0/65535.0); \n"
+ "float mappedhue = atan(mapped.z, mapped.y); \n"
+ "float mappedchroma = length(mapped.yz); \n"
+ "ipt = mapped; \n"
+ // Visualize gamuts
+ "if (!insrc && !indst) { \n"
+ " ipt = orig; \n"
+ "} else if (insrc && !indst) { \n"
+ " ipt.x -= 0.1; \n"
+ "} else if (indst && !insrc) { \n"
+ " ipt.x += 0.1; \n"
+ "} \n"
+ // Visualize iso-luminance and iso-hue lines
+ "vec3 line; \n"
+ "if (insrc && fract(50.0 * mapped.x) < 1e-1) { \n"
+ " float k = smoothstep(0.1, 0.0, abs(sin(theta))); \n"
+ " line.x = mix(mapped.x, 0.3, 0.5); \n"
+ " line.yz = sqrt(length(mapped.yz)) * \n"
+ " normalize(mapped.yz); \n"
+ " ipt = mix(ipt, line, k); \n"
+ "} \n"
+ "if (insrc && fract(10.0 * (mappedhue - hue)) < 1e-1) {\n"
+ " float k = smoothstep(0.3, 0.0, abs(cos(theta))); \n"
+ " line.x = mapped.x - 0.05; \n"
+ " line.yz = 1.2 * mapped.yz; \n"
+ " ipt = mix(ipt, line, k); \n"
+ "} \n"
+ "if (insrc && fract(100.0 * mappedchroma) < 1e-1) { \n"
+ " line.x = mapped.x + 0.1; \n"
+ " line.yz = 0.4 * mapped.yz; \n"
+ " ipt = mix(ipt, line, 0.5); \n"
+ "} \n"
+ "} \n",
+ rect_pos(sh, rc),
+ SH_FLOAT(params->min_luma), SH_FLOAT(params->max_luma),
+ SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->min_luma)),
+ SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->max_luma)),
+ SH_FLOAT_DYN(hue), SH_FLOAT_DYN(theta),
+ ipt2lms,
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+ 10000 / PL_COLOR_SDR_WHITE,
+ lms2rgb_src,
+ lms2rgb_dst,
+ 0.5f / M_PI,
+ lut);
+}
+
+static void fill_tone_lut(void *data, const struct sh_lut_params *params)
+{
+ const struct pl_tone_map_params *lut_params = params->priv;
+ pl_tone_map_generate(data, lut_params);
+}
+
+static void fill_gamut_lut(void *data, const struct sh_lut_params *params)
+{
+ const struct pl_gamut_map_params *lut_params = params->priv;
+ const int lut_size = params->width * params->height * params->depth;
+ void *tmp = pl_alloc(NULL, lut_size * sizeof(float) * lut_params->lut_stride);
+ pl_gamut_map_generate(tmp, lut_params);
+
+ // Convert to 16-bit unsigned integer for GPU texture
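+    // (the chroma components are biased by half the range so that negative
+    // values can be stored in the unsigned normalized format)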
+ const float *in = tmp;
+ uint16_t *out = data;
+ pl_assert(lut_params->lut_stride == 3);
+ pl_assert(params->comps == 4);
+ for (int i = 0; i < lut_size; i++) {
+ out[0] = roundf(in[0] * UINT16_MAX);
+ out[1] = roundf(in[1] * UINT16_MAX + (UINT16_MAX >> 1));
+ out[2] = roundf(in[2] * UINT16_MAX + (UINT16_MAX >> 1));
+ in += 3;
+ out += 4;
+ }
+
+ pl_free(tmp);
+}
+
+void pl_shader_color_map_ex(pl_shader sh, const struct pl_color_map_params *params,
+ const struct pl_color_map_args *args)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ struct pl_color_space src = args->src, dst = args->dst;
+ pl_color_space_infer_map(&src, &dst);
+ if (pl_color_space_equal(&src, &dst)) {
+ if (args->prelinearized)
+ pl_shader_delinearize(sh, &dst);
+ return;
+ }
+
+ struct sh_color_map_obj *obj = NULL;
+ if (args->state) {
+ pl_get_detected_hdr_metadata(*args->state, &src.hdr);
+ obj = SH_OBJ(sh, args->state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj,
+ sh_color_map_uninit);
+ if (!obj)
+ return;
+ }
+
+ params = PL_DEF(params, &pl_color_map_default_params);
+ GLSL("// pl_shader_color_map \n"
+ "{ \n");
+
+ struct pl_tone_map_params tone = {
+ .function = PL_DEF(params->tone_mapping_function, &pl_tone_map_clip),
+ .constants = params->tone_constants,
+ .param = params->tone_mapping_param,
+ .input_scaling = PL_HDR_PQ,
+ .output_scaling = PL_HDR_PQ,
+ .lut_size = PL_DEF(params->lut_size, pl_color_map_default_params.lut_size),
+ .hdr = src.hdr,
+ };
+
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = &src,
+ .metadata = params->metadata,
+ .scaling = tone.input_scaling,
+ .out_min = &tone.input_min,
+ .out_max = &tone.input_max,
+ .out_avg = &tone.input_avg,
+ ));
+
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = &dst,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = tone.output_scaling,
+ .out_min = &tone.output_min,
+ .out_max = &tone.output_max,
+ ));
+
+ pl_tone_map_params_infer(&tone);
+
+ // Round sufficiently similar values
+ if (fabs(tone.input_max - tone.output_max) < 1e-6)
+ tone.output_max = tone.input_max;
+ if (fabs(tone.input_min - tone.output_min) < 1e-6)
+ tone.output_min = tone.input_min;
+
+ if (!params->inverse_tone_mapping) {
+ // Never exceed the source unless requested, but still allow
+ // black point adaptation
+ tone.output_max = PL_MIN(tone.output_max, tone.input_max);
+ }
+
+ const int *lut3d_size_def = pl_color_map_default_params.lut3d_size;
+ struct pl_gamut_map_params gamut = {
+ .function = PL_DEF(params->gamut_mapping, &pl_gamut_map_clip),
+ .constants = params->gamut_constants,
+ .input_gamut = src.hdr.prim,
+ .output_gamut = dst.hdr.prim,
+ .lut_size_I = PL_DEF(params->lut3d_size[0], lut3d_size_def[0]),
+ .lut_size_C = PL_DEF(params->lut3d_size[1], lut3d_size_def[1]),
+ .lut_size_h = PL_DEF(params->lut3d_size[2], lut3d_size_def[2]),
+ .lut_stride = 3,
+ };
+
+ float src_peak_static;
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = &src,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_PQ,
+ .out_max = &src_peak_static,
+ ));
+
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params(
+ .color = &dst,
+ .metadata = PL_HDR_METADATA_HDR10,
+ .scaling = PL_HDR_PQ,
+ .out_min = &gamut.min_luma,
+ .out_max = &gamut.max_luma,
+ ));
+
+ // Clip the gamut mapping output to the input gamut if disabled
+ if (!params->gamut_expansion && gamut.function->bidirectional) {
+ if (pl_primaries_compatible(&gamut.input_gamut, &gamut.output_gamut)) {
+ gamut.output_gamut = pl_primaries_clip(&gamut.output_gamut,
+ &gamut.input_gamut);
+ }
+ }
+
+ // Backwards compatibility with older API
+ switch (params->gamut_mode) {
+ case PL_GAMUT_CLIP:
+ switch (params->intent) {
+ case PL_INTENT_AUTO:
+ case PL_INTENT_PERCEPTUAL:
+ case PL_INTENT_RELATIVE_COLORIMETRIC:
+ break; // leave default
+ case PL_INTENT_SATURATION:
+ gamut.function = &pl_gamut_map_saturation;
+ break;
+ case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+ gamut.function = &pl_gamut_map_absolute;
+ break;
+ }
+ break;
+ case PL_GAMUT_DARKEN:
+ gamut.function = &pl_gamut_map_darken;
+ break;
+ case PL_GAMUT_WARN:
+ gamut.function = &pl_gamut_map_highlight;
+ break;
+ case PL_GAMUT_DESATURATE:
+ gamut.function = &pl_gamut_map_desaturate;
+ break;
+ case PL_GAMUT_MODE_COUNT:
+ pl_unreachable();
+ }
+
+ bool can_fast = !params->force_tone_mapping_lut;
+ if (!args->state) {
+ // No state object provided, forcibly disable advanced methods
+ can_fast = true;
+ if (tone.function != &pl_tone_map_clip)
+ tone.function = &pl_tone_map_linear;
+ if (gamut.function != &pl_gamut_map_clip)
+ gamut.function = &pl_gamut_map_saturation;
+ }
+
+ pl_fmt gamut_fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+ if (!gamut_fmt) {
+ gamut.function = &pl_gamut_map_saturation;
+ can_fast = true;
+ }
+
+ bool need_tone_map = !pl_tone_map_params_noop(&tone);
+ bool need_gamut_map = !pl_gamut_map_params_noop(&gamut);
+
+ if (!args->prelinearized)
+ pl_shader_linearize(sh, &src);
+
+ pl_matrix3x3 rgb2lms = pl_ipt_rgb2lms(pl_raw_primaries_get(src.primaries));
+ pl_matrix3x3 lms2rgb = pl_ipt_lms2rgb(pl_raw_primaries_get(dst.primaries));
+ ident_t lms2ipt = SH_MAT3(pl_ipt_lms2ipt);
+ ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms);
+
+ if (need_gamut_map && gamut.function == &pl_gamut_map_saturation && can_fast) {
+ const pl_matrix3x3 lms2src = pl_ipt_lms2rgb(&gamut.input_gamut);
+ const pl_matrix3x3 dst2lms = pl_ipt_rgb2lms(&gamut.output_gamut);
+ sh_describe(sh, "gamut map (saturation)");
+ pl_matrix3x3_mul(&lms2rgb, &dst2lms);
+ pl_matrix3x3_mul(&lms2rgb, &lms2src);
+ need_gamut_map = false;
+ }
+
+ // Fast path: simply convert between primaries (if needed)
+ if (!need_tone_map && !need_gamut_map) {
+ if (src.primaries != dst.primaries) {
+ sh_describe(sh, "colorspace conversion");
+ pl_matrix3x3_mul(&lms2rgb, &rgb2lms);
+ GLSL("color.rgb = "$" * color.rgb; \n", SH_MAT3(lms2rgb));
+ }
+ goto done;
+ }
+
+ // Full path: convert input from normalized RGB to IPT
+ GLSL("vec3 lms = "$" * color.rgb; \n"
+ "vec3 lmspq = %f * lms; \n"
+ "lmspq = pow(max(lmspq, 0.0), vec3(%f)); \n"
+ "lmspq = (vec3(%f) + %f * lmspq) \n"
+ " / (vec3(1.0) + %f * lmspq); \n"
+ "lmspq = pow(lmspq, vec3(%f)); \n"
+ "vec3 ipt = "$" * lmspq; \n"
+ "float i_orig = ipt.x; \n",
+ SH_MAT3(rgb2lms),
+ PL_COLOR_SDR_WHITE / 10000,
+ PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2,
+ lms2ipt);
+
+ if (params->show_clipping) {
+ const float eps = 1e-6f;
+ GLSL("bool clip_hi, clip_lo; \n"
+ "clip_hi = any(greaterThan(color.rgb, vec3("$"))); \n"
+ "clip_lo = any(lessThan(color.rgb, vec3("$"))); \n"
+ "clip_hi = clip_hi || ipt.x > "$"; \n"
+ "clip_lo = clip_lo || ipt.x < "$"; \n",
+ SH_FLOAT_DYN(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_max) + eps),
+ SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_min) - eps),
+ SH_FLOAT_DYN(tone.input_max + eps),
+ SH_FLOAT(tone.input_min - eps));
+ }
+
+ if (need_tone_map) {
+ const struct pl_tone_map_function *fun = tone.function;
+ sh_describef(sh, "%s tone map (%.0f -> %.0f)", fun->name,
+ pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.input_max),
+ pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.output_max));
+
+ if (fun == &pl_tone_map_clip && can_fast) {
+
+ GLSL("#define tone_map(x) clamp((x), "$", "$") \n",
+ SH_FLOAT(tone.input_min),
+ SH_FLOAT_DYN(tone.input_max));
+
+ } else if (fun == &pl_tone_map_linear && can_fast) {
+
+ const float gain = tone.constants.exposure;
+ const float scale = tone.input_max - tone.input_min;
+
+ ident_t linfun = sh_fresh(sh, "linear_pq");
+ GLSLH("float "$"(float x) { \n"
+ // Stretch the input range (while clipping)
+ " x = "$" * x + "$"; \n"
+ " x = clamp(x, 0.0, 1.0); \n"
+ " x = "$" * x + "$"; \n"
+ " return x; \n"
+ "} \n",
+ linfun,
+ SH_FLOAT_DYN(gain / scale),
+ SH_FLOAT_DYN(-gain / scale * tone.input_min),
+ SH_FLOAT_DYN(tone.output_max - tone.output_min),
+ SH_FLOAT(tone.output_min));
+
+ GLSL("#define tone_map(x) ("$"(x)) \n", linfun);
+
+ } else {
+
+ pl_assert(obj);
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->tone.lut,
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_AUTO,
+ .method = SH_LUT_LINEAR,
+ .width = tone.lut_size,
+ .comps = 1,
+ .update = !pl_tone_map_params_equal(&tone, &obj->tone.params),
+ .dynamic = tone.input_avg > 0, // dynamic metadata
+ .fill = fill_tone_lut,
+ .priv = &tone,
+ ));
+ obj->tone.params = tone;
+ if (!lut) {
+ SH_FAIL(sh, "Failed generating tone-mapping LUT!");
+ return;
+ }
+
+ const float lut_range = tone.input_max - tone.input_min;
+ GLSL("#define tone_map(x) ("$"("$" * (x) + "$")) \n",
+ lut, SH_FLOAT_DYN(1.0f / lut_range),
+ SH_FLOAT_DYN(-tone.input_min / lut_range));
+
+ }
+
+ bool need_recovery = tone.input_max >= tone.output_max;
+ if (need_recovery && params->contrast_recovery && args->feature_map) {
+ ident_t pos, pt;
+ ident_t lowres = sh_bind(sh, args->feature_map, PL_TEX_ADDRESS_CLAMP,
+ PL_TEX_SAMPLE_LINEAR, "feature_map",
+ NULL, &pos, &pt);
+
+ // Obtain HF detail map from bicubic interpolation of LF features
+ GLSL("vec2 lpos = "$"; \n"
+ "vec2 lpt = "$"; \n"
+ "vec2 lsize = vec2(textureSize("$", 0)); \n"
+ "vec2 frac = fract(lpos * lsize + vec2(0.5)); \n"
+ "vec2 frac2 = frac * frac; \n"
+ "vec2 inv = vec2(1.0) - frac; \n"
+ "vec2 inv2 = inv * inv; \n"
+ "vec2 w0 = 1.0/6.0 * inv2 * inv; \n"
+ "vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \n"
+ "vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n"
+ "vec2 w3 = 1.0/6.0 * frac2 * frac; \n"
+ "vec4 g = vec4(w0 + w1, w2 + w3); \n"
+ "vec4 h = vec4(w1, w3) / g + inv.xyxy; \n"
+ "h.xy -= vec2(2.0); \n"
+ "vec4 p = lpos.xyxy + lpt.xyxy * h; \n"
+ "float l00 = textureLod("$", p.xy, 0.0).r; \n"
+ "float l01 = textureLod("$", p.xw, 0.0).r; \n"
+ "float l0 = mix(l01, l00, g.y); \n"
+ "float l10 = textureLod("$", p.zy, 0.0).r; \n"
+ "float l11 = textureLod("$", p.zw, 0.0).r; \n"
+ "float l1 = mix(l11, l10, g.y); \n"
+ "float luma = mix(l1, l0, g.x); \n"
+ // Mix low-resolution tone mapped image with high-resolution
+ // tone mapped image according to desired strength.
+ "float highres = clamp(ipt.x, 0.0, 1.0); \n"
+ "float lowres = clamp(luma, 0.0, 1.0); \n"
+ "float detail = highres - lowres; \n"
+ "float base = tone_map(highres); \n"
+ "float sharp = tone_map(lowres) + detail; \n"
+ "ipt.x = clamp(mix(base, sharp, "$"), "$", "$"); \n",
+ pos, pt, lowres,
+ lowres, lowres, lowres, lowres,
+ SH_FLOAT(params->contrast_recovery),
+ SH_FLOAT(tone.output_min), SH_FLOAT_DYN(tone.output_max));
+
+ } else {
+
+ GLSL("ipt.x = tone_map(ipt.x); \n");
+ }
+
+ // Avoid raising saturation excessively when raising brightness, and
+ // also desaturate when reducing brightness greatly to account for the
+ // reduction in gamut volume.
+ GLSL("vec2 hull = vec2(i_orig, ipt.x); \n"
+ "hull = ((hull - 6.0) * hull + 9.0) * hull; \n"
+ "ipt.yz *= min(i_orig / ipt.x, hull.y / hull.x); \n");
+ }
+
+ if (need_gamut_map) {
+ const struct pl_gamut_map_function *fun = gamut.function;
+ sh_describef(sh, "gamut map (%s)", fun->name);
+
+ pl_assert(obj);
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->gamut.lut,
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_TEXTURE,
+ .fmt = gamut_fmt,
+ .method = params->lut3d_tricubic ? SH_LUT_CUBIC : SH_LUT_LINEAR,
+ .width = gamut.lut_size_I,
+ .height = gamut.lut_size_C,
+ .depth = gamut.lut_size_h,
+ .comps = 4,
+ .signature = gamut_map_signature(&gamut),
+ .cache = SH_CACHE(sh),
+ .fill = fill_gamut_lut,
+ .priv = &gamut,
+ ));
+ if (!lut) {
+ SH_FAIL(sh, "Failed generating gamut-mapping LUT!");
+ return;
+ }
+
+ // 3D LUT lookup (in ICh space)
+ const float lut_range = gamut.max_luma - gamut.min_luma;
+ GLSL("vec3 idx; \n"
+ "idx.x = "$" * ipt.x + "$"; \n"
+ "idx.y = 2.0 * length(ipt.yz); \n"
+ "idx.z = %f * atan(ipt.z, ipt.y) + 0.5;\n"
+ "ipt = "$"(idx).xyz; \n"
+ "ipt.yz -= vec2(32768.0/65535.0); \n",
+ SH_FLOAT(1.0f / lut_range),
+ SH_FLOAT(-gamut.min_luma / lut_range),
+ 0.5f / M_PI, lut);
+
+ if (params->show_clipping) {
+ GLSL("clip_lo = clip_lo || any(lessThan(idx, vec3(0.0))); \n"
+ "clip_hi = clip_hi || any(greaterThan(idx, vec3(1.0))); \n");
+ }
+
+ if (params->visualize_lut) {
+ visualize_gamut_map(sh, params->visualize_rect, lut,
+ params->visualize_hue, params->visualize_theta,
+ &gamut);
+ }
+ }
+
+ // Convert IPT back to linear RGB
+ GLSL("lmspq = "$" * ipt; \n"
+ "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n"
+ "lms = max(lms - vec3(%f), 0.0) \n"
+ " / (vec3(%f) - %f * lms); \n"
+ "lms = pow(lms, vec3(1.0/%f)); \n"
+ "lms *= %f; \n"
+ "color.rgb = "$" * lms; \n",
+ ipt2lms,
+ PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1,
+ 10000 / PL_COLOR_SDR_WHITE,
+ SH_MAT3(lms2rgb));
+
+ if (params->show_clipping) {
+ GLSL("if (clip_hi) { \n"
+ " float k = dot(color.rgb, vec3(2.0 / 3.0)); \n"
+ " color.rgb = clamp(vec3(k) - color.rgb, 0.0, 1.0); \n"
+ " float cmin = min(min(color.r, color.g), color.b); \n"
+ " float cmax = max(max(color.r, color.g), color.b); \n"
+ " float delta = cmax - cmin; \n"
+ " vec3 sat = smoothstep(cmin - 1e-6, cmax, color.rgb); \n"
+ " const vec3 red = vec3(1.0, 0.0, 0.0); \n"
+ " color.rgb = mix(red, sat, smoothstep(0.0, 0.3, delta)); \n"
+ "} else if (clip_lo) { \n"
+ " vec3 hi = vec3(0.0, 0.3, 0.3); \n"
+ " color.rgb = mix(color.rgb, hi, 0.5); \n"
+ "} \n");
+ }
+
+ if (need_tone_map) {
+ if (params->visualize_lut) {
+ float alpha = need_gamut_map ? powf(cosf(params->visualize_theta), 5.0f) : 1.0f;
+ visualize_tone_map(sh, params->visualize_rect, alpha, &tone);
+ }
+ GLSL("#undef tone_map \n");
+ }
+
+done:
+ pl_shader_delinearize(sh, &dst);
+ GLSL("}\n");
+}
+
+// Backwards compatibility wrapper around `pl_shader_color_map_ex`
+void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params,
+ struct pl_color_space src, struct pl_color_space dst,
+ pl_shader_obj *state, bool prelinearized)
+{
+ pl_shader_color_map_ex(sh, params, pl_color_map_args(
+ .src = src,
+ .dst = dst,
+ .prelinearized = prelinearized,
+ .state = state,
+ .feature_map = NULL
+ ));
+}
+
+void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp,
+ const struct pl_cone_params *params)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+ if (!params || !params->cones)
+ return;
+
+ sh_describe(sh, "cone distortion");
+ GLSL("// pl_shader_cone_distort\n");
+ GLSL("{\n");
+
+ pl_color_space_infer(&csp);
+ pl_shader_linearize(sh, &csp);
+
+ pl_matrix3x3 cone_mat;
+ cone_mat = pl_get_cone_matrix(params, pl_raw_primaries_get(csp.primaries));
+ GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("cone_mat"),
+ .data = PL_TRANSPOSE_3X3(cone_mat.m),
+ }));
+
+ pl_shader_delinearize(sh, &csp);
+ GLSL("}\n");
+}
diff --git a/src/shaders/custom.c b/src/shaders/custom.c
new file mode 100644
index 0000000..3f03e57
--- /dev/null
+++ b/src/shaders/custom.c
@@ -0,0 +1,89 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/custom.h>
+
+bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params)
+{
+ if (params->compute) {
+ int bw = PL_DEF(params->compute_group_size[0], 16);
+ int bh = PL_DEF(params->compute_group_size[1], 16);
+ bool flex = !params->compute_group_size[0] ||
+ !params->compute_group_size[1];
+ if (!sh_try_compute(sh, bw, bh, flex, params->compute_shmem))
+ return false;
+ }
+
+ if (!sh_require(sh, params->input, params->output_w, params->output_h))
+ return false;
+
+ sh->output = params->output;
+
+ for (int i = 0; i < params->num_variables; i++) {
+ struct pl_shader_var sv = params->variables[i];
+ GLSLP("#define %s "$"\n", sv.var.name, sh_var(sh, sv));
+ }
+
+ for (int i = 0; i < params->num_descriptors; i++) {
+ struct pl_shader_desc sd = params->descriptors[i];
+ GLSLP("#define %s "$"\n", sd.desc.name, sh_desc(sh, sd));
+ }
+
+ for (int i = 0; i < params->num_vertex_attribs; i++) {
+ struct pl_shader_va sva = params->vertex_attribs[i];
+ GLSLP("#define %s "$"\n", sva.attr.name, sh_attr(sh, sva));
+ }
+
+ for (int i = 0; i < params->num_constants; i++) {
+ struct pl_shader_const sc = params->constants[i];
+ GLSLP("#define %s "$"\n", sc.name, sh_const(sh, sc));
+ }
+
+ if (params->prelude)
+ GLSLP("// pl_shader_custom prelude: \n%s\n", params->prelude);
+ if (params->header)
+ GLSLH("// pl_shader_custom header: \n%s\n", params->header);
+
+ if (params->description)
+ sh_describef(sh, "%s", params->description);
+
+ if (params->body) {
+ const char *output_decl = "";
+ if (params->output != params->input) {
+ switch (params->output) {
+ case PL_SHADER_SIG_NONE: break;
+ case PL_SHADER_SIG_COLOR:
+ output_decl = "vec4 color = vec4(0.0);";
+ break;
+
+ case PL_SHADER_SIG_SAMPLER:
+ pl_unreachable();
+ }
+ }
+
+ GLSL("// pl_shader_custom \n"
+ "%s \n"
+ "{ \n"
+ "%s \n"
+ "} \n",
+ output_decl, params->body);
+ }
+
+ return true;
+}
diff --git a/src/shaders/custom_mpv.c b/src/shaders/custom_mpv.c
new file mode 100644
index 0000000..4ef0817
--- /dev/null
+++ b/src/shaders/custom_mpv.c
@@ -0,0 +1,1768 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "gpu.h"
+#include "shaders.h"
+
+#include <libplacebo/shaders/colorspace.h>
+#include <libplacebo/shaders/custom.h>
+
+// Hard-coded size limits, mainly for convenience (to avoid dynamic memory)
+#define SHADER_MAX_HOOKS 16
+#define SHADER_MAX_BINDS 16
+#define MAX_SHEXP_SIZE 32
+
+enum shexp_op {
+ SHEXP_OP_ADD,
+ SHEXP_OP_SUB,
+ SHEXP_OP_MUL,
+ SHEXP_OP_DIV,
+ SHEXP_OP_MOD,
+ SHEXP_OP_NOT,
+ SHEXP_OP_GT,
+ SHEXP_OP_LT,
+ SHEXP_OP_EQ,
+};
+
+enum shexp_tag {
+ SHEXP_END = 0, // End of an RPN expression
+ SHEXP_CONST, // Push a constant value onto the stack
+ SHEXP_TEX_W, // Get the width/height of a named texture (variable)
+ SHEXP_TEX_H,
+ SHEXP_OP2, // Pop two elements and push the result of a dyadic operation
+ SHEXP_OP1, // Pop one element and push the result of a monadic operation
+ SHEXP_VAR, // Arbitrary variable (e.g. shader parameters)
+};
+
+struct shexp {
+ enum shexp_tag tag;
+ union {
+ float cval;
+ pl_str varname;
+ enum shexp_op op;
+ } val;
+};
+
+struct custom_shader_hook {
+ // Variable/literal names of textures
+ pl_str pass_desc;
+ pl_str hook_tex[SHADER_MAX_HOOKS];
+ pl_str bind_tex[SHADER_MAX_BINDS];
+ pl_str save_tex;
+
+ // Shader body itself + metadata
+ pl_str pass_body;
+ float offset[2];
+ bool offset_align;
+ int comps;
+
+ // Special expressions governing the output size and execution conditions
+ struct shexp width[MAX_SHEXP_SIZE];
+ struct shexp height[MAX_SHEXP_SIZE];
+ struct shexp cond[MAX_SHEXP_SIZE];
+
+ // Special metadata for compute shaders
+ bool is_compute;
+ int block_w, block_h; // Block size (each block corresponds to one WG)
+ int threads_w, threads_h; // How many threads form a WG
+};
+
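+// Parse a space-separated RPN expression (as used by the WIDTH, HEIGHT and
+// WHEN directives) into a fixed-size array of shexp tokens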
+static bool parse_rpn_shexpr(pl_str line, struct shexp out[MAX_SHEXP_SIZE])
+{
+ int pos = 0;
+
+ while (line.len > 0) {
+ pl_str word = pl_str_split_char(line, ' ', &line);
+ if (word.len == 0)
+ continue;
+
+ if (pos >= MAX_SHEXP_SIZE)
+ return false;
+
+ struct shexp *exp = &out[pos++];
+
+ if (pl_str_eatend0(&word, ".w") || pl_str_eatend0(&word, ".width")) {
+ exp->tag = SHEXP_TEX_W;
+ exp->val.varname = word;
+ continue;
+ }
+
+ if (pl_str_eatend0(&word, ".h") || pl_str_eatend0(&word, ".height")) {
+ exp->tag = SHEXP_TEX_H;
+ exp->val.varname = word;
+ continue;
+ }
+
+ switch (word.buf[0]) {
+ case '+': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_ADD; continue;
+ case '-': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_SUB; continue;
+ case '*': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MUL; continue;
+ case '/': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_DIV; continue;
+ case '%': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MOD; continue;
+ case '!': exp->tag = SHEXP_OP1; exp->val.op = SHEXP_OP_NOT; continue;
+ case '>': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_GT; continue;
+ case '<': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_LT; continue;
+ case '=': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_EQ; continue;
+ }
+
+ if (word.buf[0] >= '0' && word.buf[0] <= '9') {
+ exp->tag = SHEXP_CONST;
+ if (!pl_str_parse_float(word, &exp->val.cval))
+ return false;
+ continue;
+ }
+
+ // Treat as generic variable
+ exp->tag = SHEXP_VAR;
+ exp->val.varname = word;
+ }
+
+ return true;
+}
+
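+// Returns the text up to the next "//!" marker; the marker itself (if any)
+// remains at the start of the remainder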
+static inline pl_str split_magic(pl_str *body)
+{
+ pl_str ret = pl_str_split_str0(*body, "//!", body);
+ if (body->len) {
+ // Make sure the separator is included in the remainder
+ body->buf -= 3;
+ body->len += 3;
+ }
+
+ return ret;
+}
+
+static bool parse_hook(pl_log log, pl_str *body, struct custom_shader_hook *out)
+{
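+    // Defaults: output size matches the HOOKED texture, condition always true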
+ *out = (struct custom_shader_hook){
+ .pass_desc = pl_str0("unknown user shader"),
+ .width = {{ SHEXP_TEX_W, { .varname = pl_str0("HOOKED") }}},
+ .height = {{ SHEXP_TEX_H, { .varname = pl_str0("HOOKED") }}},
+ .cond = {{ SHEXP_CONST, { .cval = 1.0 }}},
+ };
+
+ int hook_idx = 0;
+ int bind_idx = 0;
+
+ // Parse all headers
+ while (true) {
+ pl_str rest;
+ pl_str line = pl_str_strip(pl_str_getline(*body, &rest));
+
+ // Check for the presence of the magic line beginning
+ if (!pl_str_eatstart0(&line, "//!"))
+ break;
+
+ *body = rest;
+
+ // Parse the supported commands
+ if (pl_str_eatstart0(&line, "HOOK")) {
+ if (hook_idx == SHADER_MAX_HOOKS) {
+ pl_err(log, "Passes may only hook up to %d textures!",
+ SHADER_MAX_HOOKS);
+ return false;
+ }
+ out->hook_tex[hook_idx++] = pl_str_strip(line);
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "BIND")) {
+ if (bind_idx == SHADER_MAX_BINDS) {
+ pl_err(log, "Passes may only bind up to %d textures!",
+ SHADER_MAX_BINDS);
+ return false;
+ }
+ out->bind_tex[bind_idx++] = pl_str_strip(line);
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "SAVE")) {
+ pl_str save_tex = pl_str_strip(line);
+ if (pl_str_equals0(save_tex, "HOOKED")) {
+ // This is a special name that means "overwrite existing"
+ // texture, which we just signal by not having any `save_tex`
+ // name set.
+ out->save_tex = (pl_str) {0};
+ } else if (pl_str_equals0(save_tex, "MAIN")) {
+ // Compatibility alias
+ out->save_tex = pl_str0("MAINPRESUB");
+ } else {
+ out->save_tex = save_tex;
+            }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "DESC")) {
+ out->pass_desc = pl_str_strip(line);
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "OFFSET")) {
+ line = pl_str_strip(line);
+ if (pl_str_equals0(line, "ALIGN")) {
+ out->offset_align = true;
+ } else {
+ if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[0]) ||
+ !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[1]) ||
+ line.len)
+ {
+ pl_err(log, "Error while parsing OFFSET!");
+ return false;
+ }
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "WIDTH")) {
+ if (!parse_rpn_shexpr(line, out->width)) {
+ pl_err(log, "Error while parsing WIDTH!");
+ return false;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "HEIGHT")) {
+ if (!parse_rpn_shexpr(line, out->height)) {
+ pl_err(log, "Error while parsing HEIGHT!");
+ return false;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "WHEN")) {
+ if (!parse_rpn_shexpr(line, out->cond)) {
+ pl_err(log, "Error while parsing WHEN!");
+ return false;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "COMPONENTS")) {
+ if (!pl_str_parse_int(pl_str_strip(line), &out->comps)) {
+ pl_err(log, "Error parsing COMPONENTS: '%.*s'", PL_STR_FMT(line));
+ return false;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "COMPUTE")) {
+ line = pl_str_strip(line);
+ bool ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_w) &&
+ pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_h);
+
+ line = pl_str_strip(line);
+ if (ok && line.len) {
+ ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_w) &&
+ pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_h) &&
+ !line.len;
+ } else {
+ out->threads_w = out->block_w;
+ out->threads_h = out->block_h;
+ }
+
+ if (!ok) {
+ pl_err(log, "Error while parsing COMPUTE!");
+ return false;
+ }
+
+ out->is_compute = true;
+ continue;
+ }
+
+ // Unknown command type
+ pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+ return false;
+ }
+
+ // The rest of the file up until the next magic line beginning (if any)
+ // shall be the shader body
+ out->pass_body = split_magic(body);
+
+ // Sanity checking
+ if (hook_idx == 0)
+ pl_warn(log, "Pass has no hooked textures (will be ignored)!");
+
+ return true;
+}
+
+static bool parse_tex(pl_gpu gpu, void *alloc, pl_str *body,
+ struct pl_shader_desc *out)
+{
+ *out = (struct pl_shader_desc) {
+ .desc = {
+ .name = "USER_TEX",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ };
+
+ struct pl_tex_params params = {
+ .w = 1, .h = 1, .d = 0,
+ .sampleable = true,
+ .debug_tag = PL_DEBUG_TAG,
+ };
+
+ while (true) {
+ pl_str rest;
+ pl_str line = pl_str_strip(pl_str_getline(*body, &rest));
+
+ if (!pl_str_eatstart0(&line, "//!"))
+ break;
+
+ *body = rest;
+
+ if (pl_str_eatstart0(&line, "TEXTURE")) {
+ out->desc.name = pl_strdup0(alloc, pl_str_strip(line));
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "SIZE")) {
+ line = pl_str_strip(line);
+ int dims = 0;
+ int dim[4]; // extra space to catch invalid extra entries
+ while (line.len && dims < PL_ARRAY_SIZE(dim)) {
+ if (!pl_str_parse_int(pl_str_split_char(line, ' ', &line), &dim[dims++])) {
+ PL_ERR(gpu, "Error while parsing SIZE!");
+ return false;
+ }
+ }
+
+ uint32_t lim = dims == 1 ? gpu->limits.max_tex_1d_dim
+ : dims == 2 ? gpu->limits.max_tex_2d_dim
+ : dims == 3 ? gpu->limits.max_tex_3d_dim
+ : 0;
+
+ // Sanity check against GPU size limits
+ switch (dims) {
+ case 3:
+ params.d = dim[2];
+ if (params.d < 1 || params.d > lim) {
+ PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+ params.d, lim);
+ return false;
+ }
+ // fall through
+ case 2:
+ params.h = dim[1];
+ if (params.h < 1 || params.h > lim) {
+ PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+ params.h, lim);
+ return false;
+ }
+ // fall through
+ case 1:
+ params.w = dim[0];
+ if (params.w < 1 || params.w > lim) {
+ PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!",
+ params.w, lim);
+ return false;
+ }
+ break;
+
+ default:
+ PL_ERR(gpu, "Invalid number of texture dimensions!");
+ return false;
+ };
+
+ // Clear out the superfluous components
+ if (dims < 3)
+ params.d = 0;
+ if (dims < 2)
+ params.h = 0;
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "FORMAT")) {
+ line = pl_str_strip(line);
+ params.format = NULL;
+ for (int n = 0; n < gpu->num_formats; n++) {
+ pl_fmt fmt = gpu->formats[n];
+ if (pl_str_equals0(line, fmt->name)) {
+ params.format = fmt;
+ break;
+ }
+ }
+
+ if (!params.format || params.format->opaque) {
+ PL_ERR(gpu, "Unrecognized/unavailable FORMAT name: '%.*s'!",
+ PL_STR_FMT(line));
+ return false;
+ }
+
+ if (!(params.format->caps & PL_FMT_CAP_SAMPLEABLE)) {
+ PL_ERR(gpu, "Chosen FORMAT '%.*s' is not sampleable!",
+ PL_STR_FMT(line));
+ return false;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "FILTER")) {
+ line = pl_str_strip(line);
+ if (pl_str_equals0(line, "LINEAR")) {
+ out->binding.sample_mode = PL_TEX_SAMPLE_LINEAR;
+ } else if (pl_str_equals0(line, "NEAREST")) {
+ out->binding.sample_mode = PL_TEX_SAMPLE_NEAREST;
+ } else {
+ PL_ERR(gpu, "Unrecognized FILTER: '%.*s'!", PL_STR_FMT(line));
+ return false;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "BORDER")) {
+ line = pl_str_strip(line);
+ if (pl_str_equals0(line, "CLAMP")) {
+ out->binding.address_mode = PL_TEX_ADDRESS_CLAMP;
+ } else if (pl_str_equals0(line, "REPEAT")) {
+ out->binding.address_mode = PL_TEX_ADDRESS_REPEAT;
+ } else if (pl_str_equals0(line, "MIRROR")) {
+ out->binding.address_mode = PL_TEX_ADDRESS_MIRROR;
+ } else {
+ PL_ERR(gpu, "Unrecognized BORDER: '%.*s'!", PL_STR_FMT(line));
+ return false;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "STORAGE")) {
+ params.storable = true;
+ out->desc.type = PL_DESC_STORAGE_IMG;
+ out->desc.access = PL_DESC_ACCESS_READWRITE;
+ out->memory = PL_MEMORY_COHERENT;
+ continue;
+ }
+
+ PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+ return false;
+ }
+
+ if (!params.format) {
+ PL_ERR(gpu, "No FORMAT specified!");
+ return false;
+ }
+
+ int caps = params.format->caps;
+ if (out->binding.sample_mode == PL_TEX_SAMPLE_LINEAR && !(caps & PL_FMT_CAP_LINEAR)) {
+ PL_ERR(gpu, "The specified texture format cannot be linear filtered!");
+ return false;
+ }
+
+ // Decode the rest of the section (up to the next //! marker) as raw hex
+ // data for the texture
+ pl_str tex, hexdata = split_magic(body);
+ if (!pl_str_decode_hex(NULL, pl_str_strip(hexdata), &tex)) {
+ PL_ERR(gpu, "Error while parsing TEXTURE body: must be a valid "
+ "hexadecimal sequence!");
+ return false;
+ }
+
+ int texels = params.w * PL_DEF(params.h, 1) * PL_DEF(params.d, 1);
+ size_t expected_len = texels * params.format->texel_size;
+ if (tex.len == 0 && params.storable) {
+ // In this case, it's okay that the texture has no initial data
+ pl_free_ptr(&tex.buf);
+ } else if (tex.len != expected_len) {
+ PL_ERR(gpu, "Shader TEXTURE size mismatch: got %zu bytes, expected %zu!",
+ tex.len, expected_len);
+ pl_free(tex.buf);
+ return false;
+ }
+
+ params.initial_data = tex.buf;
+ out->binding.object = pl_tex_create(gpu, &params);
+ pl_free(tex.buf);
+
+ if (!out->binding.object) {
+ PL_ERR(gpu, "Failed creating custom texture!");
+ return false;
+ }
+
+ return true;
+}
+
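+// Parses a //!BUFFER section. Illustrative sketch (derived from the parsing
+// logic below; the buffer and variable names are hypothetical):
+//
+//   //!BUFFER user_data
+//   //!VAR float strength
+//   //!VAR vec4 history[16]
+//   <initial contents as a hexadecimal string; may be empty for STORAGE>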
+static bool parse_buf(pl_gpu gpu, void *alloc, pl_str *body,
+ struct pl_shader_desc *out)
+{
+ *out = (struct pl_shader_desc) {
+ .desc = {
+ .name = "USER_BUF",
+ .type = PL_DESC_BUF_UNIFORM,
+ },
+ };
+
+ // Temporary, to allow deferring variable placement until all headers
+ // have been processed (in order to e.g. determine buffer type)
+ void *tmp = pl_tmp(alloc); // will be freed automatically on failure
+ PL_ARRAY(struct pl_var) vars = {0};
+
+ while (true) {
+ pl_str rest;
+ pl_str line = pl_str_strip(pl_str_getline(*body, &rest));
+
+ if (!pl_str_eatstart0(&line, "//!"))
+ break;
+
+ *body = rest;
+
+ if (pl_str_eatstart0(&line, "BUFFER")) {
+ out->desc.name = pl_strdup0(alloc, pl_str_strip(line));
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "STORAGE")) {
+ out->desc.type = PL_DESC_BUF_STORAGE;
+ out->desc.access = PL_DESC_ACCESS_READWRITE;
+ out->memory = PL_MEMORY_COHERENT;
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "VAR")) {
+ pl_str type_name = pl_str_split_char(pl_str_strip(line), ' ', &line);
+ struct pl_var var = {0};
+ for (const struct pl_named_var *nv = pl_var_glsl_types; nv->glsl_name; nv++) {
+ if (pl_str_equals0(type_name, nv->glsl_name)) {
+ var = nv->var;
+ break;
+ }
+ }
+
+ if (!var.type) {
+ // No type found
+ PL_ERR(gpu, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(type_name));
+ return false;
+ }
+
+ pl_str var_name = pl_str_split_char(line, '[', &line);
+ if (line.len > 0) {
+ // Parse array dimension
+ if (!pl_str_parse_int(pl_str_split_char(line, ']', NULL), &var.dim_a)) {
+ PL_ERR(gpu, "Failed parsing array dimension from [%.*s!",
+ PL_STR_FMT(line));
+ return false;
+ }
+
+ if (var.dim_a < 1) {
+ PL_ERR(gpu, "Invalid array dimension %d!", var.dim_a);
+ return false;
+ }
+ }
+
+ var.name = pl_strdup0(alloc, pl_str_strip(var_name));
+ PL_ARRAY_APPEND(tmp, vars, var);
+ continue;
+ }
+
+ PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+ return false;
+ }
+
+ // Try placing all of the buffer variables
+ for (int i = 0; i < vars.num; i++) {
+ if (!sh_buf_desc_append(alloc, gpu, out, NULL, vars.elem[i])) {
+ PL_ERR(gpu, "Custom buffer exceeds GPU limitations!");
+ return false;
+ }
+ }
+
+ // Decode the rest of the section (up to the next //! marker) as raw hex
+ // data for the buffer
+ pl_str data, hexdata = split_magic(body);
+ if (!pl_str_decode_hex(tmp, pl_str_strip(hexdata), &data)) {
+ PL_ERR(gpu, "Error while parsing BUFFER body: must be a valid "
+ "hexadecimal sequence!");
+ return false;
+ }
+
+ size_t buf_size = sh_buf_desc_size(out);
+ if (data.len == 0 && out->desc.type == PL_DESC_BUF_STORAGE) {
+ // In this case, it's okay that the buffer has no initial data
+ } else if (data.len != buf_size) {
+ PL_ERR(gpu, "Shader BUFFER size mismatch: got %zu bytes, expected %zu!",
+ data.len, buf_size);
+ return false;
+ }
+
+ out->binding.object = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .uniform = out->desc.type == PL_DESC_BUF_UNIFORM,
+ .storable = out->desc.type == PL_DESC_BUF_STORAGE,
+ .initial_data = data.len ? data.buf : NULL,
+ ));
+
+ if (!out->binding.object) {
+ PL_ERR(gpu, "Failed creating custom buffer!");
+ return false;
+ }
+
+ pl_free(tmp);
+ return true;
+}
+
+static bool parse_var(pl_log log, pl_str str, enum pl_var_type type, pl_var_data *out)
+{
+ if (!str.len)
+ return true;
+
+ pl_str buf = str;
+ bool ok = false;
+ switch (type) {
+ case PL_VAR_SINT:
+ ok = pl_str_parse_int(pl_str_split_char(buf, ' ', &buf), &out->i);
+ break;
+ case PL_VAR_UINT:
+ ok = pl_str_parse_uint(pl_str_split_char(buf, ' ', &buf), &out->u);
+ break;
+ case PL_VAR_FLOAT:
+ ok = pl_str_parse_float(pl_str_split_char(buf, ' ', &buf), &out->f);
+ break;
+ case PL_VAR_INVALID:
+ case PL_VAR_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ if (pl_str_strip(buf).len > 0)
+ ok = false; // left-over garbage
+
+ if (!ok) {
+ pl_err(log, "Failed parsing variable data: %.*s", PL_STR_FMT(str));
+ return false;
+ }
+
+ return true;
+}
+
+static bool check_bounds(pl_log log, enum pl_var_type type, const pl_var_data data,
+ const pl_var_data minimum, const pl_var_data maximum)
+{
+#define CHECK_BOUNDS(v, fmt) do \
+{ \
+ if (data.v < minimum.v) { \
+ pl_err(log, "Initial value "fmt" below declared minimum "fmt"!", \
+ data.v, minimum.v); \
+ return false; \
+ } \
+ if (data.v > maximum.v) { \
+ pl_err(log, "Initial value "fmt" above declared maximum "fmt"!", \
+ data.v, maximum.v); \
+ return false; \
+ } \
+} while (0)
+
+ switch (type) {
+ case PL_VAR_SINT:
+ CHECK_BOUNDS(i, "%d");
+ break;
+ case PL_VAR_UINT:
+ CHECK_BOUNDS(u, "%u");
+ break;
+ case PL_VAR_FLOAT:
+ CHECK_BOUNDS(f, "%f");
+ break;
+ case PL_VAR_INVALID:
+ case PL_VAR_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+#undef CHECK_BOUNDS
+ return true;
+}
+
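+// Parses a //!PARAM section. Illustrative sketch of the accepted directives
+// (derived from the parsing logic below; the parameter name and values are
+// hypothetical):
+//
+//   //!PARAM intensity
+//   //!DESC Effect intensity
+//   //!TYPE float
+//   //!MINIMUM 0.0
+//   //!MAXIMUM 10.0
+//   1.0
+//
+// The section body (everything up to the next //! header) holds the initial
+// value, or the list of names for TYPE ENUM parameters.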
+static bool parse_param(pl_log log, void *alloc, pl_str *body,
+ struct pl_hook_par *out)
+{
+ *out = (struct pl_hook_par) {0};
+ pl_str minimum = {0};
+ pl_str maximum = {0};
+ bool is_enum = false;
+
+ while (true) {
+ pl_str rest;
+ pl_str line = pl_str_strip(pl_str_getline(*body, &rest));
+
+ if (!pl_str_eatstart0(&line, "//!"))
+ break;
+
+ *body = rest;
+
+ if (pl_str_eatstart0(&line, "PARAM")) {
+ out->name = pl_strdup0(alloc, pl_str_strip(line));
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "DESC")) {
+ out->description = pl_strdup0(alloc, pl_str_strip(line));
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "MINIMUM")) {
+ minimum = pl_str_strip(line);
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "MAXIMUM")) {
+ maximum = pl_str_strip(line);
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "TYPE")) {
+ line = pl_str_strip(line);
+ is_enum = pl_str_eatstart0(&line, "ENUM");
+ line = pl_str_strip(line);
+ if (pl_str_eatstart0(&line, "DYNAMIC")) {
+ out->mode = PL_HOOK_PAR_DYNAMIC;
+ } else if (pl_str_eatstart0(&line, "CONSTANT")) {
+ out->mode = PL_HOOK_PAR_CONSTANT;
+ } else if (pl_str_eatstart0(&line, "DEFINE")) {
+ out->mode = PL_HOOK_PAR_DEFINE;
+ out->type = PL_VAR_SINT;
+ if (pl_str_strip(line).len > 0) {
+ pl_err(log, "TYPE DEFINE does not take any extra arguments, "
+ "unexpected: '%.*s'", PL_STR_FMT(line));
+ return false;
+ }
+ continue;
+ } else {
+ out->mode = PL_HOOK_PAR_VARIABLE;
+ }
+
+ line = pl_str_strip(line);
+ for (const struct pl_named_var *nv = pl_var_glsl_types;
+ nv->glsl_name; nv++)
+ {
+ if (pl_str_equals0(line, nv->glsl_name)) {
+ if (nv->var.dim_v > 1 || nv->var.dim_m > 1) {
+ pl_err(log, "GLSL type '%s' is incompatible with "
+ "shader parameters, must be scalar type!",
+ nv->glsl_name);
+ return false;
+ }
+
+ out->type = nv->var.type;
+ if (is_enum && out->type != PL_VAR_SINT) {
+ pl_err(log, "ENUM is only compatible with type int/DEFINE!");
+ return false;
+ }
+ goto next;
+ }
+ }
+
+ pl_err(log, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(line));
+ return false;
+ }
+
+ pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(line));
+ return false;
+
+next: ;
+ }
+
+ switch (out->type) {
+ case PL_VAR_INVALID:
+ pl_err(log, "Missing variable type!");
+ return false;
+ case PL_VAR_SINT:
+ out->minimum.i = INT_MIN;
+ out->maximum.i = INT_MAX;
+ break;
+ case PL_VAR_UINT:
+ out->minimum.u = 0;
+ out->maximum.u = UINT_MAX;
+ break;
+ case PL_VAR_FLOAT:
+ out->minimum.f = -INFINITY;
+ out->maximum.f = INFINITY;
+ break;
+ case PL_VAR_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ pl_str initial = pl_str_strip(split_magic(body));
+ if (!initial.len) {
+ pl_err(log, "Missing initial parameter value!");
+ return false;
+ }
+
+ if (is_enum) {
+ PL_ARRAY(const char *) names = {0};
+ pl_assert(out->type == PL_VAR_SINT);
+ do {
+ pl_str line = pl_str_strip(pl_str_getline(initial, &initial));
+ if (!line.len)
+ continue;
+ PL_ARRAY_APPEND(alloc, names, pl_strdup0(alloc, line));
+ } while (initial.len);
+
+ pl_assert(names.num >= 1);
+ out->initial.i = 0;
+ out->minimum.i = 0;
+ out->maximum.i = names.num - 1;
+ out->names = names.elem;
+ } else {
+ if (!parse_var(log, initial, out->type, &out->initial))
+ return false;
+ if (!parse_var(log, minimum, out->type, &out->minimum))
+ return false;
+ if (!parse_var(log, maximum, out->type, &out->maximum))
+ return false;
+ if (!check_bounds(log, out->type, out->initial, out->minimum, out->maximum))
+ return false;
+ }
+
+ out->data = pl_memdup(alloc, &out->initial, sizeof(out->initial));
+ return true;
+}
+
+static enum pl_hook_stage mp_stage_to_pl(pl_str stage)
+{
+ if (pl_str_equals0(stage, "RGB"))
+ return PL_HOOK_RGB_INPUT;
+ if (pl_str_equals0(stage, "LUMA"))
+ return PL_HOOK_LUMA_INPUT;
+ if (pl_str_equals0(stage, "CHROMA"))
+ return PL_HOOK_CHROMA_INPUT;
+ if (pl_str_equals0(stage, "ALPHA"))
+ return PL_HOOK_ALPHA_INPUT;
+ if (pl_str_equals0(stage, "XYZ"))
+ return PL_HOOK_XYZ_INPUT;
+
+ if (pl_str_equals0(stage, "CHROMA_SCALED"))
+ return PL_HOOK_CHROMA_SCALED;
+ if (pl_str_equals0(stage, "ALPHA_SCALED"))
+ return PL_HOOK_ALPHA_SCALED;
+
+ if (pl_str_equals0(stage, "NATIVE"))
+ return PL_HOOK_NATIVE;
+ if (pl_str_equals0(stage, "MAINPRESUB"))
+ return PL_HOOK_RGB;
+ if (pl_str_equals0(stage, "MAIN"))
+ return PL_HOOK_RGB; // Note: conflicts with above!
+
+ if (pl_str_equals0(stage, "LINEAR"))
+ return PL_HOOK_LINEAR;
+ if (pl_str_equals0(stage, "SIGMOID"))
+ return PL_HOOK_SIGMOID;
+ if (pl_str_equals0(stage, "PREKERNEL"))
+ return PL_HOOK_PRE_KERNEL;
+ if (pl_str_equals0(stage, "POSTKERNEL"))
+ return PL_HOOK_POST_KERNEL;
+
+ if (pl_str_equals0(stage, "SCALED"))
+ return PL_HOOK_SCALED;
+ if (pl_str_equals0(stage, "PREOUTPUT"))
+ return PL_HOOK_PRE_OUTPUT;
+ if (pl_str_equals0(stage, "OUTPUT"))
+ return PL_HOOK_OUTPUT;
+
+ return 0;
+}
+
+static pl_str pl_stage_to_mp(enum pl_hook_stage stage)
+{
+ switch (stage) {
+ case PL_HOOK_RGB_INPUT: return pl_str0("RGB");
+ case PL_HOOK_LUMA_INPUT: return pl_str0("LUMA");
+ case PL_HOOK_CHROMA_INPUT: return pl_str0("CHROMA");
+ case PL_HOOK_ALPHA_INPUT: return pl_str0("ALPHA");
+ case PL_HOOK_XYZ_INPUT: return pl_str0("XYZ");
+
+ case PL_HOOK_CHROMA_SCALED: return pl_str0("CHROMA_SCALED");
+ case PL_HOOK_ALPHA_SCALED: return pl_str0("ALPHA_SCALED");
+
+ case PL_HOOK_NATIVE: return pl_str0("NATIVE");
+ case PL_HOOK_RGB: return pl_str0("MAINPRESUB");
+
+ case PL_HOOK_LINEAR: return pl_str0("LINEAR");
+ case PL_HOOK_SIGMOID: return pl_str0("SIGMOID");
+ case PL_HOOK_PRE_KERNEL: return pl_str0("PREKERNEL");
+ case PL_HOOK_POST_KERNEL: return pl_str0("POSTKERNEL");
+
+ case PL_HOOK_SCALED: return pl_str0("SCALED");
+ case PL_HOOK_PRE_OUTPUT: return pl_str0("PREOUTPUT");
+ case PL_HOOK_OUTPUT: return pl_str0("OUTPUT");
+ };
+
+ pl_unreachable();
+}
+
+struct hook_pass {
+ enum pl_hook_stage exec_stages;
+ struct custom_shader_hook hook;
+};
+
+struct pass_tex {
+ pl_str name;
+ pl_tex tex;
+
+ // Metadata
+ pl_rect2df rect;
+ struct pl_color_repr repr;
+ struct pl_color_space color;
+ int comps;
+};
+
+struct hook_priv {
+ pl_log log;
+ pl_gpu gpu;
+ void *alloc;
+
+ PL_ARRAY(struct hook_pass) hook_passes;
+ PL_ARRAY(struct pl_hook_par) hook_params;
+
+ // Fixed (for shader-local resources)
+ PL_ARRAY(struct pl_shader_desc) descriptors;
+
+ // Dynamic per pass
+ enum pl_hook_stage save_stages;
+ PL_ARRAY(struct pass_tex) pass_textures;
+ pl_shader trc_helper;
+
+ // State for PRNG/frame count
+ int frame_count;
+ uint64_t prng_state[4];
+};
+
+static void hook_reset(void *priv)
+{
+ struct hook_priv *p = priv;
+ p->pass_textures.num = 0;
+}
+
+// Context during execution of a hook
+struct hook_ctx {
+ struct hook_priv *priv;
+ const struct pl_hook_params *params;
+ struct pass_tex hooked;
+};
+
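+// Resolves the size (in pixels) of a texture referenced by name from an RPN
+// expression; handles the special names HOOKED, NATIVE_CROPPED, OUTPUT and
+// MAIN as well as any previously saved pass textures.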
+static bool lookup_tex(struct hook_ctx *ctx, pl_str var, float size[2])
+{
+ struct hook_priv *p = ctx->priv;
+ const struct pl_hook_params *params = ctx->params;
+
+ if (pl_str_equals0(var, "HOOKED")) {
+ pl_assert(ctx->hooked.tex);
+ size[0] = ctx->hooked.tex->params.w;
+ size[1] = ctx->hooked.tex->params.h;
+ return true;
+ }
+
+ if (pl_str_equals0(var, "NATIVE_CROPPED")) {
+ size[0] = fabs(pl_rect_w(params->src_rect));
+ size[1] = fabs(pl_rect_h(params->src_rect));
+ return true;
+ }
+
+ if (pl_str_equals0(var, "OUTPUT")) {
+ size[0] = abs(pl_rect_w(params->dst_rect));
+ size[1] = abs(pl_rect_h(params->dst_rect));
+ return true;
+ }
+
+ if (pl_str_equals0(var, "MAIN"))
+ var = pl_str0("MAINPRESUB");
+
+ for (int i = 0; i < p->pass_textures.num; i++) {
+ if (pl_str_equals(var, p->pass_textures.elem[i].name)) {
+ pl_tex tex = p->pass_textures.elem[i].tex;
+ size[0] = tex->params.w;
+ size[1] = tex->params.h;
+ return true;
+ }
+ }
+
+ return false;
+}
+
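+// Resolves a named variable inside an RPN expression: either the value of a
+// //!PARAM, or one of the names of an ENUM parameter (which evaluates to its
+// index).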
+static bool lookup_var(struct hook_ctx *ctx, pl_str var, float *val)
+{
+ struct hook_priv *p = ctx->priv;
+ for (int i = 0; i < p->hook_params.num; i++) {
+ const struct pl_hook_par *hp = &p->hook_params.elem[i];
+ if (pl_str_equals0(var, hp->name)) {
+ switch (hp->type) {
+ case PL_VAR_SINT: *val = hp->data->i; return true;
+ case PL_VAR_UINT: *val = hp->data->u; return true;
+ case PL_VAR_FLOAT: *val = hp->data->f; return true;
+ case PL_VAR_INVALID:
+ case PL_VAR_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+ }
+
+ if (hp->names) {
+ for (int j = hp->minimum.i; j <= hp->maximum.i; j++) {
+ if (pl_str_equals0(var, hp->names[j])) {
+ *val = j;
+ return true;
+ }
+ }
+ }
+ }
+
+ PL_WARN(p, "Variable '%.*s' not found in RPN expression!", PL_STR_FMT(var));
+ return false;
+}
+
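+// Evaluates a parsed RPN (reverse Polish notation) expression, as used by the
+// WIDTH/HEIGHT/WHEN directives, against a small fixed-size value stack. For
+// illustration (assuming mpv's user shader RPN syntax), an expression such as
+// `HOOKED.w 2 *` pushes the hooked texture's width and the constant 2, then
+// multiplies them.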
+// Returns whether successful. 'result' is left untouched on failure
+static bool eval_shexpr(struct hook_ctx *ctx,
+ const struct shexp expr[MAX_SHEXP_SIZE],
+ float *result)
+{
+ struct hook_priv *p = ctx->priv;
+ float stack[MAX_SHEXP_SIZE] = {0};
+ int idx = 0; // points to next element to push
+
+ for (int i = 0; i < MAX_SHEXP_SIZE; i++) {
+ switch (expr[i].tag) {
+ case SHEXP_END:
+ goto done;
+
+ case SHEXP_CONST:
+            // Since our SHEXPs are bounded by MAX_SHEXP_SIZE, it should be
+            // impossible to overflow the stack
+ assert(idx < MAX_SHEXP_SIZE);
+ stack[idx++] = expr[i].val.cval;
+ continue;
+
+ case SHEXP_OP1:
+ if (idx < 1) {
+ PL_WARN(p, "Stack underflow in RPN expression!");
+ return false;
+ }
+
+ switch (expr[i].val.op) {
+ case SHEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break;
+ default: pl_unreachable();
+ }
+ continue;
+
+ case SHEXP_OP2:
+ if (idx < 2) {
+ PL_WARN(p, "Stack underflow in RPN expression!");
+ return false;
+ }
+
+ // Pop the operands in reverse order
+ float op2 = stack[--idx];
+ float op1 = stack[--idx];
+ float res = 0.0;
+ switch (expr[i].val.op) {
+ case SHEXP_OP_ADD: res = op1 + op2; break;
+ case SHEXP_OP_SUB: res = op1 - op2; break;
+ case SHEXP_OP_MUL: res = op1 * op2; break;
+ case SHEXP_OP_DIV: res = op1 / op2; break;
+ case SHEXP_OP_MOD: res = fmodf(op1, op2); break;
+ case SHEXP_OP_GT: res = op1 > op2; break;
+ case SHEXP_OP_LT: res = op1 < op2; break;
+ case SHEXP_OP_EQ: res = fabsf(op1 - op2) <= 1e-6 * fmaxf(op1, op2); break;
+ case SHEXP_OP_NOT: pl_unreachable();
+ }
+
+ if (!isfinite(res)) {
+ PL_WARN(p, "Illegal operation in RPN expression!");
+ return false;
+ }
+
+ stack[idx++] = res;
+ continue;
+
+ case SHEXP_TEX_W:
+ case SHEXP_TEX_H: {
+ pl_str name = expr[i].val.varname;
+ float size[2];
+
+ if (!lookup_tex(ctx, name, size)) {
+ PL_WARN(p, "Variable '%.*s' not found in RPN expression!",
+ PL_STR_FMT(name));
+ return false;
+ }
+
+ stack[idx++] = (expr[i].tag == SHEXP_TEX_W) ? size[0] : size[1];
+ continue;
+ }
+
+ case SHEXP_VAR: {
+ pl_str name = expr[i].val.varname;
+ float val;
+ if (!lookup_var(ctx, name, &val))
+ return false;
+ stack[idx++] = val;
+ continue;
+ }
+ }
+ }
+
+done:
+ // Return the single stack element
+ if (idx != 1) {
+ PL_WARN(p, "Malformed stack after RPN expression!");
+ return false;
+ }
+
+ *result = stack[0];
+ return true;
+}
+
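+// Step function for the PRNG backing the `random` shader variable. This
+// follows the xoshiro256+ scheme: the result takes the top 53 bits of
+// (s[0] + s[3]) and scales them by 2^-53, yielding a double uniformly
+// distributed in [0, 1). The state array must be seeded to nonzero values.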
+static double prng_step(uint64_t s[4])
+{
+ const uint64_t result = s[0] + s[3];
+ const uint64_t t = s[1] << 17;
+
+ s[2] ^= s[0];
+ s[3] ^= s[1];
+ s[1] ^= s[2];
+ s[0] ^= s[3];
+
+ s[2] ^= t;
+ s[3] = (s[3] << 45) | (s[3] >> (64 - 45));
+ return (result >> 11) * 0x1.0p-53;
+}
+
+static bool bind_pass_tex(pl_shader sh, pl_str name,
+ const struct pass_tex *ptex,
+ const pl_rect2df *rect,
+ bool hooked, bool mainpresub)
+{
+ ident_t id, pos, pt;
+
+ // Compatibility with mpv texture binding semantics
+ id = sh_bind(sh, ptex->tex, PL_TEX_ADDRESS_CLAMP, PL_TEX_SAMPLE_LINEAR,
+ "hook_tex", rect, &pos, &pt);
+ if (!id)
+ return false;
+
+ GLSLH("#define %.*s_raw "$" \n", PL_STR_FMT(name), id);
+ GLSLH("#define %.*s_pos "$" \n", PL_STR_FMT(name), pos);
+ GLSLH("#define %.*s_map "$"_map \n", PL_STR_FMT(name), pos);
+ GLSLH("#define %.*s_size vec2(textureSize("$", 0)) \n", PL_STR_FMT(name), id);
+ GLSLH("#define %.*s_pt "$" \n", PL_STR_FMT(name), pt);
+
+ float off[2] = { ptex->rect.x0, ptex->rect.y0 };
+ GLSLH("#define %.*s_off "$" \n", PL_STR_FMT(name),
+ sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("offset"),
+ .data = off,
+ }));
+
+ struct pl_color_repr repr = ptex->repr;
+ ident_t scale = SH_FLOAT(pl_color_repr_normalize(&repr));
+ GLSLH("#define %.*s_mul "$" \n", PL_STR_FMT(name), scale);
+
+ // Compatibility with mpv
+ GLSLH("#define %.*s_rot mat2(1.0, 0.0, 0.0, 1.0) \n", PL_STR_FMT(name));
+
+ // Sampling function boilerplate
+ GLSLH("#define %.*s_tex(pos) ("$" * vec4(textureLod("$", pos, 0.0))) \n",
+ PL_STR_FMT(name), scale, id);
+ GLSLH("#define %.*s_texOff(off) (%.*s_tex("$" + "$" * vec2(off))) \n",
+ PL_STR_FMT(name), PL_STR_FMT(name), pos, pt);
+
+ bool can_gather = ptex->tex->params.format->gatherable;
+ if (can_gather) {
+ GLSLH("#define %.*s_gather(pos, c) ("$" * vec4(textureGather("$", pos, c))) \n",
+ PL_STR_FMT(name), scale, id);
+ }
+
+ if (hooked) {
+ GLSLH("#define HOOKED_raw %.*s_raw \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_pos %.*s_pos \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_size %.*s_size \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_rot %.*s_rot \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_off %.*s_off \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_pt %.*s_pt \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_map %.*s_map \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_mul %.*s_mul \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_tex %.*s_tex \n", PL_STR_FMT(name));
+ GLSLH("#define HOOKED_texOff %.*s_texOff \n", PL_STR_FMT(name));
+ if (can_gather)
+ GLSLH("#define HOOKED_gather %.*s_gather \n", PL_STR_FMT(name));
+ }
+
+ if (mainpresub) {
+ GLSLH("#define MAIN_raw MAINPRESUB_raw \n");
+ GLSLH("#define MAIN_pos MAINPRESUB_pos \n");
+ GLSLH("#define MAIN_size MAINPRESUB_size \n");
+ GLSLH("#define MAIN_rot MAINPRESUB_rot \n");
+ GLSLH("#define MAIN_off MAINPRESUB_off \n");
+ GLSLH("#define MAIN_pt MAINPRESUB_pt \n");
+ GLSLH("#define MAIN_map MAINPRESUB_map \n");
+ GLSLH("#define MAIN_mul MAINPRESUB_mul \n");
+ GLSLH("#define MAIN_tex MAINPRESUB_tex \n");
+ GLSLH("#define MAIN_texOff MAINPRESUB_texOff \n");
+ if (can_gather)
+ GLSLH("#define MAIN_gather MAINPRESUB_gather \n");
+ }
+
+ return true;
+}
+
+static void save_pass_tex(struct hook_priv *p, struct pass_tex ptex)
+{
+
+ for (int i = 0; i < p->pass_textures.num; i++) {
+ if (!pl_str_equals(p->pass_textures.elem[i].name, ptex.name))
+ continue;
+
+ p->pass_textures.elem[i] = ptex;
+ return;
+ }
+
+ // No texture with this name yet, append new one
+ PL_ARRAY_APPEND(p->alloc, p->pass_textures, ptex);
+}
+
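+// Main hook entry point. For every registered pass matching the current
+// stage: evaluate its WHEN condition, bind the requested textures and
+// built-in variables, dispatch the user shader into a target texture
+// obtained from the `get_tex` callback, and save/propagate the result under
+// its SAVE name.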
+static struct pl_hook_res hook_hook(void *priv, const struct pl_hook_params *params)
+{
+ struct hook_priv *p = priv;
+ pl_str stage = pl_stage_to_mp(params->stage);
+ struct pl_hook_res res = {0};
+
+ pl_shader sh = NULL;
+ struct hook_ctx ctx = {
+ .priv = p,
+ .params = params,
+ .hooked = {
+ .name = stage,
+ .tex = params->tex,
+ .rect = params->rect,
+ .repr = params->repr,
+ .color = params->color,
+ .comps = params->components,
+ },
+ };
+
+ // Save the input texture if needed
+ if (p->save_stages & params->stage) {
+ PL_TRACE(p, "Saving input texture '%.*s' for binding",
+ PL_STR_FMT(ctx.hooked.name));
+ save_pass_tex(p, ctx.hooked);
+ }
+
+ for (int n = 0; n < p->hook_passes.num; n++) {
+ const struct hook_pass *pass = &p->hook_passes.elem[n];
+ if (!(pass->exec_stages & params->stage))
+ continue;
+
+ const struct custom_shader_hook *hook = &pass->hook;
+ PL_TRACE(p, "Executing hook pass %d on stage '%.*s': %.*s",
+ n, PL_STR_FMT(stage), PL_STR_FMT(hook->pass_desc));
+
+ // Test for execution condition
+ float run = 0;
+ if (!eval_shexpr(&ctx, hook->cond, &run))
+ goto error;
+
+ if (!run) {
+ PL_TRACE(p, "Skipping hook due to condition");
+ continue;
+ }
+
+ // Generate a new shader object
+ sh = pl_dispatch_begin(params->dispatch);
+
+ // Bind all necessary input textures
+ for (int i = 0; i < PL_ARRAY_SIZE(hook->bind_tex); i++) {
+ pl_str texname = hook->bind_tex[i];
+ if (!texname.len)
+ break;
+
+ // Convenience alias, to allow writing shaders that are oblivious
+ // of the exact stage they hooked. This simply translates to
+ // whatever stage actually fired the hook.
+ bool hooked = false, mainpresub = false;
+ if (pl_str_equals0(texname, "HOOKED")) {
+ // Continue with binding this, under the new name
+ texname = stage;
+ hooked = true;
+ }
+
+ // Compatibility alias, because MAIN and MAINPRESUB mean the same
+ // thing to libplacebo, but user shaders are still written as
+ // though they can be different concepts.
+ if (pl_str_equals0(texname, "MAIN") ||
+ pl_str_equals0(texname, "MAINPRESUB"))
+ {
+ texname = pl_str0("MAINPRESUB");
+ mainpresub = true;
+ }
+
+ for (int j = 0; j < p->descriptors.num; j++) {
+ if (pl_str_equals0(texname, p->descriptors.elem[j].desc.name)) {
+ // Directly bind this, no need to bother with all the
+ // `bind_pass_tex` boilerplate
+ ident_t id = sh_desc(sh, p->descriptors.elem[j]);
+ GLSLH("#define %.*s "$" \n", PL_STR_FMT(texname), id);
+
+ if (p->descriptors.elem[j].desc.type == PL_DESC_SAMPLED_TEX) {
+ GLSLH("#define %.*s_tex(pos) (textureLod("$", pos, 0.0)) \n",
+ PL_STR_FMT(texname), id);
+ }
+ goto next_bind;
+ }
+ }
+
+ for (int j = 0; j < p->pass_textures.num; j++) {
+ if (pl_str_equals(texname, p->pass_textures.elem[j].name)) {
+ // Note: We bind the whole texture, rather than
+ // hooked.rect, because user shaders in general are not
+ // designed to handle cropped input textures.
+ const struct pass_tex *ptex = &p->pass_textures.elem[j];
+ pl_rect2df rect = {
+ 0, 0, ptex->tex->params.w, ptex->tex->params.h,
+ };
+
+ if (hook->offset_align && pl_str_equals(texname, stage)) {
+ float sx = pl_rect_w(ctx.hooked.rect) / pl_rect_w(params->src_rect),
+ sy = pl_rect_h(ctx.hooked.rect) / pl_rect_h(params->src_rect),
+ ox = ctx.hooked.rect.x0 - sx * params->src_rect.x0,
+ oy = ctx.hooked.rect.y0 - sy * params->src_rect.y0;
+
+ PL_TRACE(p, "Aligning plane with ref: %f %f", ox, oy);
+ pl_rect2df_offset(&rect, ox, oy);
+ }
+
+ if (!bind_pass_tex(sh, texname, &p->pass_textures.elem[j],
+ &rect, hooked, mainpresub))
+ {
+ goto error;
+ }
+ goto next_bind;
+ }
+ }
+
+ // If none of the above matched, this is an unknown texture name,
+ // so silently ignore this pass to match the mpv behavior
+ PL_TRACE(p, "Skipping hook due to no texture named '%.*s'.",
+ PL_STR_FMT(texname));
+ pl_dispatch_abort(params->dispatch, &sh);
+ goto next_pass;
+
+ next_bind: ; // outer 'continue'
+ }
+
+ // Set up the input variables
+ p->frame_count++;
+ GLSLH("#define frame "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_int("frame"),
+ .data = &p->frame_count,
+ .dynamic = true,
+ }));
+
+ float random = prng_step(p->prng_state);
+ GLSLH("#define random "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("random"),
+ .data = &random,
+ .dynamic = true,
+ }));
+
+ float src_size[2] = { pl_rect_w(params->src_rect), pl_rect_h(params->src_rect) };
+ GLSLH("#define input_size "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("input_size"),
+ .data = src_size,
+ }));
+
+ float dst_size[2] = { pl_rect_w(params->dst_rect), pl_rect_h(params->dst_rect) };
+ GLSLH("#define target_size "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("target_size"),
+ .data = dst_size,
+ }));
+
+ float tex_off[2] = { params->src_rect.x0, params->src_rect.y0 };
+ GLSLH("#define tex_offset "$" \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tex_offset"),
+ .data = tex_off,
+ }));
+
+ // Custom parameters
+ for (int i = 0; i < p->hook_params.num; i++) {
+ const struct pl_hook_par *hp = &p->hook_params.elem[i];
+ switch (hp->mode) {
+ case PL_HOOK_PAR_VARIABLE:
+ case PL_HOOK_PAR_DYNAMIC:
+ GLSLH("#define %s "$" \n", hp->name,
+ sh_var(sh, (struct pl_shader_var) {
+ .var = {
+ .name = hp->name,
+ .type = hp->type,
+ .dim_v = 1,
+ .dim_m = 1,
+ .dim_a = 1,
+ },
+ .data = hp->data,
+ .dynamic = hp->mode == PL_HOOK_PAR_DYNAMIC,
+ }));
+ break;
+
+ case PL_HOOK_PAR_CONSTANT:
+ GLSLH("#define %s "$" \n", hp->name,
+ sh_const(sh, (struct pl_shader_const) {
+ .name = hp->name,
+ .type = hp->type,
+ .data = hp->data,
+ .compile_time = true,
+ }));
+ break;
+
+ case PL_HOOK_PAR_DEFINE:
+ GLSLH("#define %s %d \n", hp->name, hp->data->i);
+ break;
+
+ case PL_HOOK_PAR_MODE_COUNT:
+ pl_unreachable();
+ }
+
+ if (hp->names) {
+ for (int j = hp->minimum.i; j <= hp->maximum.i; j++)
+ GLSLH("#define %s %d \n", hp->names[j], j);
+ }
+ }
+
+ // Helper sub-shaders
+ uint64_t sh_id = SH_PARAMS(sh).id;
+ pl_shader_reset(p->trc_helper, pl_shader_params(
+ .id = ++sh_id,
+ .gpu = p->gpu,
+ ));
+ pl_shader_linearize(p->trc_helper, params->orig_color);
+ GLSLH("#define linearize "$" \n", sh_subpass(sh, p->trc_helper));
+
+ pl_shader_reset(p->trc_helper, pl_shader_params(
+ .id = ++sh_id,
+ .gpu = p->gpu,
+ ));
+ pl_shader_delinearize(p->trc_helper, params->orig_color);
+ GLSLH("#define delinearize "$" \n", sh_subpass(sh, p->trc_helper));
+
+ // Load and run the user shader itself
+ sh_append_str(sh, SH_BUF_HEADER, hook->pass_body);
+ sh_describef(sh, "%.*s", PL_STR_FMT(hook->pass_desc));
+
+ // Resolve output size and create framebuffer
+ float out_size[2] = {0};
+ if (!eval_shexpr(&ctx, hook->width, &out_size[0]) ||
+ !eval_shexpr(&ctx, hook->height, &out_size[1]))
+ {
+ goto error;
+ }
+
+ int out_w = roundf(out_size[0]),
+ out_h = roundf(out_size[1]);
+
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h))
+ goto error;
+
+ // Generate a new texture to store the render result
+ pl_tex fbo;
+ fbo = params->get_tex(params->priv, out_w, out_h);
+ if (!fbo) {
+ PL_ERR(p, "Failed dispatching hook: `get_tex` callback failed?");
+ goto error;
+ }
+
+ bool ok;
+ if (hook->is_compute) {
+
+ if (!sh_try_compute(sh, hook->threads_w, hook->threads_h, false, 0) ||
+ !fbo->params.storable)
+ {
+ PL_ERR(p, "Failed dispatching COMPUTE shader");
+ goto error;
+ }
+
+ GLSLP("#define out_image "$" \n", sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = fbo,
+ .desc = {
+ .name = "out_image",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = PL_DESC_ACCESS_WRITEONLY,
+ },
+ }));
+
+ sh->output = PL_SHADER_SIG_NONE;
+
+ GLSL("hook(); \n");
+ ok = pl_dispatch_compute(params->dispatch, pl_dispatch_compute_params(
+ .shader = &sh,
+ .dispatch_size = {
+ // Round up as many blocks as are needed to cover the image
+ PL_DIV_UP(out_w, hook->block_w),
+ PL_DIV_UP(out_h, hook->block_h),
+ 1,
+ },
+ .width = out_w,
+ .height = out_h,
+ ));
+
+ } else {
+
+ // Default non-COMPUTE shaders to explicitly use fragment shaders
+ // only, to avoid breaking things like fwidth()
+ sh->type = PL_DEF(sh->type, SH_FRAGMENT);
+
+ GLSL("vec4 color = hook(); \n");
+ ok = pl_dispatch_finish(params->dispatch, pl_dispatch_params(
+ .shader = &sh,
+ .target = fbo,
+ ));
+
+ }
+
+ if (!ok)
+ goto error;
+
+ float sx = (float) out_w / ctx.hooked.tex->params.w,
+ sy = (float) out_h / ctx.hooked.tex->params.h,
+ x0 = sx * ctx.hooked.rect.x0 + hook->offset[0],
+ y0 = sy * ctx.hooked.rect.y0 + hook->offset[1];
+
+ pl_rect2df new_rect = {
+ x0,
+ y0,
+ x0 + sx * pl_rect_w(ctx.hooked.rect),
+ y0 + sy * pl_rect_h(ctx.hooked.rect),
+ };
+
+ if (hook->offset_align) {
+ float rx = pl_rect_w(new_rect) / pl_rect_w(params->src_rect),
+ ry = pl_rect_h(new_rect) / pl_rect_h(params->src_rect),
+ ox = rx * params->src_rect.x0 - sx * ctx.hooked.rect.x0,
+ oy = ry * params->src_rect.y0 - sy * ctx.hooked.rect.y0;
+
+ pl_rect2df_offset(&new_rect, ox, oy);
+ }
+
+ // Save the result of this shader invocation
+ struct pass_tex ptex = {
+ .name = hook->save_tex.len ? hook->save_tex : stage,
+ .tex = fbo,
+ .repr = ctx.hooked.repr,
+ .color = ctx.hooked.color,
+ .comps = PL_DEF(hook->comps, ctx.hooked.comps),
+ .rect = new_rect,
+ };
+
+ // It's assumed that users will correctly normalize the input
+ pl_color_repr_normalize(&ptex.repr);
+
+ PL_TRACE(p, "Saving output texture '%.*s' from hook execution on '%.*s'",
+ PL_STR_FMT(ptex.name), PL_STR_FMT(stage));
+
+ save_pass_tex(p, ptex);
+
+ // Update the result object, unless we saved to a different name
+ if (pl_str_equals(ptex.name, stage)) {
+ ctx.hooked = ptex;
+ res = (struct pl_hook_res) {
+ .output = PL_HOOK_SIG_TEX,
+ .tex = fbo,
+ .repr = ptex.repr,
+ .color = ptex.color,
+ .components = ptex.comps,
+ .rect = new_rect,
+ };
+ }
+
+next_pass: ;
+ }
+
+ return res;
+
+error:
+ pl_dispatch_abort(params->dispatch, &sh);
+ return (struct pl_hook_res) { .failed = true };
+}
+
+const struct pl_hook *pl_mpv_user_shader_parse(pl_gpu gpu,
+ const char *shader_text,
+ size_t shader_len)
+{
+ if (!shader_len)
+ return NULL;
+
+ pl_str shader = { (uint8_t *) shader_text, shader_len };
+
+ struct pl_hook *hook = pl_zalloc_obj(NULL, hook, struct hook_priv);
+ struct hook_priv *p = PL_PRIV(hook);
+
+ *hook = (struct pl_hook) {
+ .input = PL_HOOK_SIG_TEX,
+ .priv = p,
+ .reset = hook_reset,
+ .hook = hook_hook,
+ .signature = pl_str_hash(shader),
+ };
+
+ *p = (struct hook_priv) {
+ .log = gpu->log,
+ .gpu = gpu,
+ .alloc = hook,
+ .trc_helper = pl_shader_alloc(gpu->log, NULL),
+ .prng_state = {
+ // Determined by fair die roll
+ 0xb76d71f9443c228allu, 0x93a02092fc4807e8llu,
+ 0x06d81748f838bd07llu, 0x9381ee129dddce6cllu,
+ },
+ };
+
+ shader = pl_strdup(hook, shader);
+
+ // Skip all garbage (e.g. comments) before the first header
+ int pos = pl_str_find(shader, pl_str0("//!"));
+ if (pos < 0) {
+ PL_ERR(gpu, "Shader appears to contain no headers?");
+ goto error;
+ }
+ shader = pl_str_drop(shader, pos);
+
+ // Loop over the file
+ while (shader.len > 0)
+ {
+ // Peek at the first header to dispatch the right type
+ if (pl_str_startswith0(shader, "//!TEXTURE")) {
+ struct pl_shader_desc sd;
+ if (!parse_tex(gpu, hook, &shader, &sd))
+ goto error;
+
+ PL_INFO(gpu, "Registering named texture '%s'", sd.desc.name);
+ PL_ARRAY_APPEND(hook, p->descriptors, sd);
+ continue;
+ }
+
+ if (pl_str_startswith0(shader, "//!BUFFER")) {
+ struct pl_shader_desc sd;
+ if (!parse_buf(gpu, hook, &shader, &sd))
+ goto error;
+
+ PL_INFO(gpu, "Registering named buffer '%s'", sd.desc.name);
+ PL_ARRAY_APPEND(hook, p->descriptors, sd);
+ continue;
+ }
+
+ if (pl_str_startswith0(shader, "//!PARAM")) {
+ struct pl_hook_par hp;
+ if (!parse_param(gpu->log, hook, &shader, &hp))
+ goto error;
+
+ PL_INFO(gpu, "Registering named parameter '%s'", hp.name);
+ PL_ARRAY_APPEND(hook, p->hook_params, hp);
+ continue;
+ }
+
+ struct custom_shader_hook h;
+ if (!parse_hook(gpu->log, &shader, &h))
+ goto error;
+
+ struct hook_pass pass = {
+ .exec_stages = 0,
+ .hook = h,
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(h.hook_tex); i++)
+ pass.exec_stages |= mp_stage_to_pl(h.hook_tex[i]);
+ for (int i = 0; i < PL_ARRAY_SIZE(h.bind_tex); i++) {
+ p->save_stages |= mp_stage_to_pl(h.bind_tex[i]);
+ if (pl_str_equals0(h.bind_tex[i], "HOOKED"))
+ p->save_stages |= pass.exec_stages;
+ }
+
+ // As an extra precaution, this avoids errors when trying to run
+ // conditions against planes that were never hooked. As a sole
+ // exception, OUTPUT is special because it's hard-coded to return the
+ // dst_rect even before it was hooked. (This is an apparently
+ // undocumented mpv quirk, but shaders rely on it in practice)
+ enum pl_hook_stage rpn_stages = 0;
+ for (int i = 0; i < PL_ARRAY_SIZE(h.width); i++) {
+ if (h.width[i].tag == SHEXP_TEX_W || h.width[i].tag == SHEXP_TEX_H)
+ rpn_stages |= mp_stage_to_pl(h.width[i].val.varname);
+ }
+ for (int i = 0; i < PL_ARRAY_SIZE(h.height); i++) {
+ if (h.height[i].tag == SHEXP_TEX_W || h.height[i].tag == SHEXP_TEX_H)
+ rpn_stages |= mp_stage_to_pl(h.height[i].val.varname);
+ }
+ for (int i = 0; i < PL_ARRAY_SIZE(h.cond); i++) {
+ if (h.cond[i].tag == SHEXP_TEX_W || h.cond[i].tag == SHEXP_TEX_H)
+ rpn_stages |= mp_stage_to_pl(h.cond[i].val.varname);
+ }
+
+ p->save_stages |= rpn_stages & ~PL_HOOK_OUTPUT;
+
+ PL_INFO(gpu, "Registering hook pass: %.*s", PL_STR_FMT(h.pass_desc));
+ PL_ARRAY_APPEND(hook, p->hook_passes, pass);
+ }
+
+ // We need to hook on both the exec and save stages, so that we can keep
+ // track of any textures we might need
+ hook->stages |= p->save_stages;
+ for (int i = 0; i < p->hook_passes.num; i++)
+ hook->stages |= p->hook_passes.elem[i].exec_stages;
+
+ hook->parameters = p->hook_params.elem;
+ hook->num_parameters = p->hook_params.num;
+
+ PL_MSG(gpu, PL_LOG_DEBUG, "Loaded user shader:");
+ pl_msg_source(gpu->log, PL_LOG_DEBUG, shader_text);
+
+ return hook;
+
+error:
+ pl_mpv_user_shader_destroy((const struct pl_hook **) &hook);
+ PL_MSG(gpu, PL_LOG_ERR, "Failed to parse user shader:");
+ pl_msg_source(gpu->log, PL_LOG_ERR, shader_text);
+ pl_log_stack_trace(gpu->log, PL_LOG_ERR);
+ return NULL;
+}
+
+void pl_mpv_user_shader_destroy(const struct pl_hook **hookp)
+{
+ const struct pl_hook *hook = *hookp;
+ if (!hook)
+ return;
+
+ struct hook_priv *p = PL_PRIV(hook);
+ for (int i = 0; i < p->descriptors.num; i++) {
+ switch (p->descriptors.elem[i].desc.type) {
+ case PL_DESC_BUF_UNIFORM:
+ case PL_DESC_BUF_STORAGE:
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE: {
+ pl_buf buf = p->descriptors.elem[i].binding.object;
+ pl_buf_destroy(p->gpu, &buf);
+ break;
+ }
+
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG: {
+ pl_tex tex = p->descriptors.elem[i].binding.object;
+ pl_tex_destroy(p->gpu, &tex);
+ break;
+
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ pl_unreachable();
+ }
+ }
+ }
+
+ pl_shader_free(&p->trc_helper);
+ pl_free((void *) hook);
+ *hookp = NULL;
+}
diff --git a/src/shaders/deinterlacing.c b/src/shaders/deinterlacing.c
new file mode 100644
index 0000000..5c85138
--- /dev/null
+++ b/src/shaders/deinterlacing.c
@@ -0,0 +1,260 @@
+/*
+ * This file is part of libplacebo, but also based on vf_yadif_cuda.cu:
+ * Copyright (C) 2018 Philip Langdale <philipl@overt.org>
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/deinterlacing.h>
+
+const struct pl_deinterlace_params pl_deinterlace_default_params = { PL_DEINTERLACE_DEFAULTS };
+
+void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src,
+ const struct pl_deinterlace_params *params)
+{
+ params = PL_DEF(params, &pl_deinterlace_default_params);
+
+ const struct pl_tex_params *texparams = &src->cur.top->params;
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, texparams->w, texparams->h))
+ return;
+
+ sh_describe(sh, "deinterlacing");
+ GLSL("vec4 color = vec4(0,0,0,1); \n"
+ "// pl_shader_deinterlace \n"
+ "{ \n");
+
+ uint8_t comp_mask = PL_DEF(src->component_mask, 0xFu);
+ comp_mask &= (1u << texparams->format->num_components) - 1u;
+ if (!comp_mask) {
+ SH_FAIL(sh, "pl_shader_deinterlace: empty component mask?");
+ return;
+ }
+
+ const uint8_t num_comps = sh_num_comps(comp_mask);
+ const char *swiz = sh_swizzle(comp_mask);
+ GLSL("#define T %s \n", sh_float_type(comp_mask));
+
+ ident_t pos, pt;
+ ident_t cur = sh_bind(sh, src->cur.top, PL_TEX_ADDRESS_MIRROR,
+ PL_TEX_SAMPLE_NEAREST, "cur", NULL, &pos, &pt);
+ if (!cur)
+ return;
+
+ GLSL("#define GET(TEX, X, Y) \\\n"
+ " (textureLod(TEX, pos + pt * vec2(X, Y), 0.0).%s) \n"
+ "vec2 pos = "$"; \n"
+ "vec2 pt = "$"; \n"
+ "T res; \n",
+ swiz, pos, pt);
+
+ if (src->field == PL_FIELD_NONE) {
+ GLSL("res = GET("$", 0, 0); \n", cur);
+ goto done;
+ }
+
+ // Don't modify the primary field
+ GLSL("int yh = textureSize("$", 0).y; \n"
+ "int yo = int("$".y * float(yh)); \n"
+ "if (yo %% 2 == %d) { \n"
+ " res = GET("$", 0, 0); \n"
+ "} else { \n",
+ cur, pos,
+ src->field == PL_FIELD_TOP ? 0 : 1,
+ cur);
+
+ switch (params->algo) {
+ case PL_DEINTERLACE_WEAVE:
+ GLSL("res = GET("$", 0, 0); \n", cur);
+ break;
+
+ case PL_DEINTERLACE_BOB:
+ GLSL("res = GET("$", 0, %d); \n", cur,
+ src->field == PL_FIELD_TOP ? -1 : 1);
+ break;
+
+
+ case PL_DEINTERLACE_YADIF: {
+ // Try using a compute shader for this, for the sole reason of
+ // optimizing for thread group synchronicity. Otherwise, because we
+ // alternate between lines output as-is and lines output deinterlaced,
+ // half of our thread group will be mostly idle at any point in time.
+ const int bw = PL_DEF(sh_glsl(sh).subgroup_size, 32);
+ sh_try_compute(sh, bw, 1, true, 0);
+
+ // This magic constant is hard-coded in the original implementation as
+ // '1' on an 8-bit scale. Since we work with arbitrary bit depth
+ // floating point textures, we have to convert this somehow. Hard-code
+ // it as 1/255 under the assumption that the original intent was to be
+        // roughly 1 unit of brightness increment on an 8-bit source. This
+        // may produce suboptimal results on higher-bit-depth content.
+ static const float spatial_bias = 1 / 255.0f;
+
+ // Calculate spatial prediction
+ ident_t spatial_pred = sh_fresh(sh, "spatial_predictor");
+ GLSLH("float "$"(float a, float b, float c, float d, float e, float f, float g, \n"
+ " float h, float i, float j, float k, float l, float m, float n) \n"
+ "{ \n"
+ " float spatial_pred = (d + k) / 2.0; \n"
+ " float spatial_score = abs(c - j) + abs(d - k) + abs(e - l) - %f; \n"
+
+ " float score = abs(b - k) + abs(c - l) + abs(d - m); \n"
+ " if (score < spatial_score) { \n"
+ " spatial_pred = (c + l) / 2.0; \n"
+ " spatial_score = score; \n"
+ " score = abs(a - l) + abs(b - m) + abs(c - n); \n"
+ " if (score < spatial_score) { \n"
+ " spatial_pred = (b + m) / 2.0; \n"
+ " spatial_score = score; \n"
+ " } \n"
+ " } \n"
+ " score = abs(d - i) + abs(e - j) + abs(f - k); \n"
+ " if (score < spatial_score) { \n"
+ " spatial_pred = (e + j) / 2.0; \n"
+ " spatial_score = score; \n"
+ " score = abs(e - h) + abs(f - i) + abs(g - j); \n"
+ " if (score < spatial_score) { \n"
+ " spatial_pred = (f + i) / 2.0; \n"
+ " spatial_score = score; \n"
+ " } \n"
+ " } \n"
+ " return spatial_pred; \n"
+ "} \n",
+ spatial_pred, spatial_bias);
+
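+        // a..g sample the line above and h..n the line below the missing
+        // line, at horizontal offsets -3..+3; the spatial predictor scores
+        // several edge directions by absolute difference and interpolates
+        // along the best one.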
+ GLSL("T a = GET("$", -3, -1); \n"
+ "T b = GET("$", -2, -1); \n"
+ "T c = GET("$", -1, -1); \n"
+ "T d = GET("$", 0, -1); \n"
+ "T e = GET("$", +1, -1); \n"
+ "T f = GET("$", +2, -1); \n"
+ "T g = GET("$", +3, -1); \n"
+ "T h = GET("$", -3, +1); \n"
+ "T i = GET("$", -2, +1); \n"
+ "T j = GET("$", -1, +1); \n"
+ "T k = GET("$", 0, +1); \n"
+ "T l = GET("$", +1, +1); \n"
+ "T m = GET("$", +2, +1); \n"
+ "T n = GET("$", +3, +1); \n",
+ cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur);
+
+ if (num_comps == 1) {
+ GLSL("res = "$"(a, b, c, d, e, f, g, h, i, j, k, l, m, n); \n", spatial_pred);
+ } else {
+ for (uint8_t i = 0; i < num_comps; i++) {
+ char c = "xyzw"[i];
+ GLSL("res.%c = "$"(a.%c, b.%c, c.%c, d.%c, e.%c, f.%c, g.%c, \n"
+ " h.%c, i.%c, j.%c, k.%c, l.%c, m.%c, n.%c); \n",
+ c, spatial_pred, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
+ }
+ }
+
+ // Calculate temporal prediction
+ ident_t temporal_pred = sh_fresh(sh, "temporal_predictor");
+ GLSLH("float "$"(float A, float B, float C, float D, float E, float F, \n"
+ " float G, float H, float I, float J, float K, float L, \n"
+ " float spatial_pred) \n"
+ "{ \n"
+ " float p0 = (C + H) / 2.0; \n"
+ " float p1 = F; \n"
+ " float p2 = (D + I) / 2.0; \n"
+ " float p3 = G; \n"
+ " float p4 = (E + J) / 2.0; \n"
+
+ " float tdiff0 = abs(D - I) / 2.0; \n"
+ " float tdiff1 = (abs(A - F) + abs(B - G)) / 2.0; \n"
+ " float tdiff2 = (abs(K - F) + abs(G - L)) / 2.0; \n"
+ " float diff = max(tdiff0, max(tdiff1, tdiff2)); \n",
+ temporal_pred);
+ if (!params->skip_spatial_check) {
+ GLSLH("float maxi = max(p2 - min(p3, p1), min(p0 - p1, p4 - p3)); \n"
+ "float mini = min(p2 - max(p3, p1), max(p0 - p1, p4 - p3)); \n"
+ "diff = max(diff, max(mini, -maxi)); \n");
+ }
+ GLSLH(" if (spatial_pred > p2 + diff) \n"
+ " spatial_pred = p2 + diff; \n"
+ " if (spatial_pred < p2 - diff) \n"
+ " spatial_pred = p2 - diff; \n"
+ " return spatial_pred; \n"
+ "} \n");
+
+ ident_t prev2 = cur, next2 = cur;
+ if (src->prev.top && src->prev.top != src->cur.top) {
+ pl_assert(src->prev.top->params.w == texparams->w);
+ pl_assert(src->prev.top->params.h == texparams->h);
+ prev2 = sh_bind(sh, src->prev.top, PL_TEX_ADDRESS_MIRROR,
+ PL_TEX_SAMPLE_NEAREST, "prev", NULL, NULL, NULL);
+ if (!prev2)
+ return;
+ }
+
+ if (src->next.top && src->next.top != src->cur.top) {
+ pl_assert(src->next.top->params.w == texparams->w);
+ pl_assert(src->next.top->params.h == texparams->h);
+ next2 = sh_bind(sh, src->next.top, PL_TEX_ADDRESS_MIRROR,
+ PL_TEX_SAMPLE_NEAREST, "next", NULL, NULL, NULL);
+ if (!next2)
+ return;
+ }
+
+ enum pl_field first_field = PL_DEF(src->first_field, PL_FIELD_TOP);
+ ident_t prev1 = src->field == first_field ? prev2 : cur;
+ ident_t next1 = src->field == first_field ? cur : next2;
+
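+        // A/B are sampled from `prev2`, C..E from `prev1`, F/G from the
+        // current frame, H..J from `next1` and K/L from `next2`; the
+        // temporal predictor clamps the spatial prediction to p2 +/- the
+        // largest observed temporal difference.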
+ GLSL("T A = GET("$", 0, -1); \n"
+ "T B = GET("$", 0, 1); \n"
+ "T C = GET("$", 0, -2); \n"
+ "T D = GET("$", 0, 0); \n"
+ "T E = GET("$", 0, +2); \n"
+ "T F = GET("$", 0, -1); \n"
+ "T G = GET("$", 0, +1); \n"
+ "T H = GET("$", 0, -2); \n"
+ "T I = GET("$", 0, 0); \n"
+ "T J = GET("$", 0, +2); \n"
+ "T K = GET("$", 0, -1); \n"
+ "T L = GET("$", 0, +1); \n",
+ prev2, prev2,
+ prev1, prev1, prev1,
+ cur, cur,
+ next1, next1, next1,
+ next2, next2);
+
+ if (num_comps == 1) {
+ GLSL("res = "$"(A, B, C, D, E, F, G, H, I, J, K, L, res); \n", temporal_pred);
+ } else {
+ for (uint8_t i = 0; i < num_comps; i++) {
+ char c = "xyzw"[i];
+ GLSL("res.%c = "$"(A.%c, B.%c, C.%c, D.%c, E.%c, F.%c, \n"
+ " G.%c, H.%c, I.%c, J.%c, K.%c, L.%c, \n"
+ " res.%c); \n",
+ c, temporal_pred, c, c, c, c, c, c, c, c, c, c, c, c, c);
+ }
+ }
+ break;
+ }
+
+ case PL_DEINTERLACE_ALGORITHM_COUNT:
+ pl_unreachable();
+ }
+
+ GLSL("}\n"); // End of primary/secondary field branch
+
+done:
+ GLSL("color.%s = res; \n"
+ "#undef T \n"
+ "#undef GET \n"
+ "} \n",
+ swiz);
+}
diff --git a/src/shaders/dithering.c b/src/shaders/dithering.c
new file mode 100644
index 0000000..4485d11
--- /dev/null
+++ b/src/shaders/dithering.c
@@ -0,0 +1,527 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/shaders/dithering.h>
+
+const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS };
+
+struct sh_dither_obj {
+ pl_shader_obj lut;
+};
+
+static void sh_dither_uninit(pl_gpu gpu, void *ptr)
+{
+ struct sh_dither_obj *obj = ptr;
+ pl_shader_obj_destroy(&obj->lut);
+ *obj = (struct sh_dither_obj) {0};
+}
+
+static void fill_dither_matrix(void *data, const struct sh_lut_params *params)
+{
+ pl_assert(params->width > 0 && params->height > 0 && params->comps == 1);
+
+ const struct pl_dither_params *dpar = params->priv;
+ switch (dpar->method) {
+ case PL_DITHER_ORDERED_LUT:
+ pl_assert(params->width == params->height);
+ pl_generate_bayer_matrix(data, params->width);
+ return;
+
+ case PL_DITHER_BLUE_NOISE:
+ pl_assert(params->width == params->height);
+ pl_generate_blue_noise(data, params->width);
+ return;
+
+ case PL_DITHER_ORDERED_FIXED:
+ case PL_DITHER_WHITE_NOISE:
+ case PL_DITHER_METHOD_COUNT:
+ return;
+ }
+
+ pl_unreachable();
+}
+
+static bool dither_method_is_lut(enum pl_dither_method method)
+{
+ switch (method) {
+ case PL_DITHER_BLUE_NOISE:
+ case PL_DITHER_ORDERED_LUT:
+ return true;
+ case PL_DITHER_ORDERED_FIXED:
+ case PL_DITHER_WHITE_NOISE:
+ return false;
+ case PL_DITHER_METHOD_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static inline float approx_gamma(enum pl_color_transfer trc)
+{
+ switch (trc) {
+ case PL_COLOR_TRC_UNKNOWN: return 1.0f;
+ case PL_COLOR_TRC_LINEAR: return 1.0f;
+ case PL_COLOR_TRC_PRO_PHOTO:return 1.8f;
+ case PL_COLOR_TRC_GAMMA18: return 1.8f;
+ case PL_COLOR_TRC_GAMMA20: return 2.0f;
+ case PL_COLOR_TRC_GAMMA24: return 2.4f;
+ case PL_COLOR_TRC_GAMMA26: return 2.6f;
+ case PL_COLOR_TRC_ST428: return 2.6f;
+ case PL_COLOR_TRC_GAMMA28: return 2.8f;
+
+ case PL_COLOR_TRC_SRGB:
+ case PL_COLOR_TRC_BT_1886:
+ case PL_COLOR_TRC_GAMMA22:
+ return 2.2f;
+
+ case PL_COLOR_TRC_PQ:
+ case PL_COLOR_TRC_HLG:
+ case PL_COLOR_TRC_V_LOG:
+ case PL_COLOR_TRC_S_LOG1:
+ case PL_COLOR_TRC_S_LOG2:
+ return 2.0f; // TODO: handle this better
+
+ case PL_COLOR_TRC_COUNT: break;
+ }
+
+ pl_unreachable();
+}
+
+void pl_shader_dither(pl_shader sh, int new_depth,
+ pl_shader_obj *dither_state,
+ const struct pl_dither_params *params)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ if (new_depth <= 0 || new_depth > 256) {
+ PL_WARN(sh, "Invalid dither depth: %d.. ignoring", new_depth);
+ return;
+ }
+
+ sh_describef(sh, "dithering (%d bits)", new_depth);
+ GLSL("// pl_shader_dither \n"
+ "{ \n"
+ "float bias; \n");
+
+ params = PL_DEF(params, &pl_dither_default_params);
+ if (params->lut_size < 0 || params->lut_size > 8) {
+ SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size);
+ return;
+ }
+
+ enum pl_dither_method method = params->method;
+ ident_t lut = NULL_IDENT;
+ int lut_size = 0;
+
+ if (dither_method_is_lut(method)) {
+ if (!dither_state) {
+ PL_WARN(sh, "LUT-based dither method specified but no dither state "
+ "object given, falling back to non-LUT based methods.");
+ goto fallback;
+ }
+
+ struct sh_dither_obj *obj;
+ obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER,
+ struct sh_dither_obj, sh_dither_uninit);
+ if (!obj)
+ goto fallback;
+
+ bool cache = method == PL_DITHER_BLUE_NOISE;
+ lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size);
+ lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut,
+ .var_type = PL_VAR_FLOAT,
+ .width = lut_size,
+ .height = lut_size,
+ .comps = 1,
+ .fill = fill_dither_matrix,
+ .signature = (CACHE_KEY_DITHER ^ method) * lut_size,
+ .cache = cache ? SH_CACHE(sh) : NULL,
+ .priv = (void *) params,
+ ));
+ if (!lut)
+ goto fallback;
+ }
+
+ goto done;
+
+fallback:
+ method = PL_DITHER_ORDERED_FIXED;
+ // fall through
+
+done: ;
+
+ int size = 0;
+ if (lut) {
+ size = lut_size;
+ } else if (method == PL_DITHER_ORDERED_FIXED) {
+ size = 16; // hard-coded size
+ }
+
+ if (size) {
+ // Transform the screen position to the cyclic range [0,1)
+ GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size));
+
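+        // For temporal dithering, rotate/mirror the dither pattern in an
+        // 8-phase cycle so that the noise decorrelates across frames.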
+ if (params->temporal) {
+ int phase = SH_PARAMS(sh).index % 8;
+ float r = phase * (M_PI / 2); // rotate
+ float m = phase < 4 ? 1 : -1; // mirror
+ float mat[2][2] = {
+ {cos(r), -sin(r) },
+ {sin(r) * m, cos(r) * m},
+ };
+
+ ident_t rot = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat2("dither_rot"),
+ .data = &mat[0][0],
+ .dynamic = true,
+ });
+ GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot);
+ }
+ }
+
+ switch (method) {
+ case PL_DITHER_WHITE_NOISE: {
+ ident_t prng = sh_prng(sh, params->temporal, NULL);
+ GLSL("bias = "$".x;\n", prng);
+ break;
+ }
+
+ case PL_DITHER_ORDERED_FIXED:
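+        // Procedurally evaluates (what appears to be) the entry of a 16x16
+        // Bayer-style ordered dither matrix for this pixel: fold x into x^y,
+        // interleave the bits into a Morton index, then bit-reverse it to
+        // obtain the threshold.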
+ // Bitwise ordered dither using only 32-bit uints
+ GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u; \n"
+ // Bitwise merge (morton number)
+ "xy.x = xy.x ^ xy.y; \n"
+ "xy = (xy | xy << 2) & uvec2(0x33333333); \n"
+ "xy = (xy | xy << 1) & uvec2(0x55555555); \n"
+ // Bitwise inversion
+ "uint b = xy.x + (xy.y << 1); \n"
+ "b = (b * 0x0802u & 0x22110u) | \n"
+ " (b * 0x8020u & 0x88440u); \n"
+ "b = 0x10101u * b; \n"
+ "b = (b >> 16) & 0xFFu; \n"
+ // Generate bias value
+ "bias = float(b) * 1.0/256.0; \n");
+ break;
+
+ case PL_DITHER_BLUE_NOISE:
+ case PL_DITHER_ORDERED_LUT:
+ pl_assert(lut);
+ GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size));
+ break;
+
+ case PL_DITHER_METHOD_COUNT:
+ pl_unreachable();
+ }
+
+ // Scale factor for dither rounding
+ GLSL("const float scale = %llu.0; \n", (1LLU << new_depth) - 1);
+
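+    // At very low output depths, the spacing between representable values is
+    // large enough that dithering in the nonlinear (encoded) domain would
+    // visibly shift the average brightness, so perform the low/high mix in
+    // linear light instead.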
+ const float gamma = approx_gamma(params->transfer);
+ if (gamma != 1.0f && new_depth <= 4) {
+ GLSL("const float gamma = "$"; \n"
+ "vec4 color_lin = pow(color, vec4(gamma)); \n",
+ SH_FLOAT(gamma));
+
+ if (new_depth == 1) {
+ // Special case for bit depth 1 dithering, in this case we can just
+ // ignore the low/high rounding because we know we are always
+ // dithering between 0.0 and 1.0.
+ GLSL("const vec4 low = vec4(0.0); \n"
+ "const vec4 high = vec4(1.0); \n"
+ "vec4 offset = color_lin; \n");
+ } else {
+ // Linearize the low, high and current color values
+ GLSL("vec4 low = floor(color * scale) / scale; \n"
+ "vec4 high = ceil(color * scale) / scale; \n"
+ "vec4 low_lin = pow(low, vec4(gamma)); \n"
+ "vec4 high_lin = pow(high, vec4(gamma)); \n"
+ "vec4 range = high_lin - low_lin; \n"
+ "vec4 offset = (color_lin - low_lin) / \n"
+ " max(range, 1e-6); \n");
+ }
+
+ // Mix in the correct ratio corresponding to the offset and bias
+ GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n");
+ } else {
+ // Approximate each gamma segment as a straight line, this simplifies
+ // the process of dithering down to a single scale and (biased) round.
+ GLSL("color = scale * color + vec4(bias); \n"
+ "color = floor(color) * (1.0 / scale); \n");
+ }
+
+ GLSL("} \n");
+}
+
+/* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// After a (y, x) -> (y, x + y * shift) mapping, find the rightmost column
+// that will be affected by the current column.
+static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k)
+{
+ int ret = 0;
+ for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
+ for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
+ if (k->pattern[y][x - PL_EDF_MIN_DX] != 0) {
+ int shifted_x = x + y * k->shift;
+
+                // The shift mapping guarantees that the current column (and
+                // everything left of it) won't be affected by error diffusion.
+ assert(shifted_x > 0);
+
+ ret = PL_MAX(ret, shifted_x);
+ }
+ }
+ }
+ return ret;
+}
+
+size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel,
+ int height)
+{
+    // We add PL_EDF_MAX_DY empty lines at the bottom to absorb errors
+    // propagated out past the bottom edge.
+ int rows = height + PL_EDF_MAX_DY;
+ int shifted_columns = compute_rightmost_shifted_column(kernel) + 1;
+
+    // The shared memory is an array of size rows*shifted_columns. Each
+    // element is a single uint holding all three RGB components.
+ return rows * shifted_columns * sizeof(uint32_t);
+}
+
+bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params)
+{
+ const int width = params->input_tex->params.w, height = params->input_tex->params.h;
+ const struct pl_glsl_version glsl = sh_glsl(sh);
+ const struct pl_error_diffusion_kernel *kernel =
+ PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite);
+
+ pl_assert(params->output_tex->params.w == width);
+ pl_assert(params->output_tex->params.h == height);
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height))
+ return false;
+
+ if (params->new_depth <= 0 || params->new_depth > 256) {
+ PL_WARN(sh, "Invalid dither depth: %d.. ignoring", params->new_depth);
+ return false;
+ }
+
+ // The parallel error diffusion works by applying the shift mapping first.
+ // Take the Floyd-Steinberg algorithm as an example: after applying the
+ // (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are
+ // propagated into the next few columns, which makes parallel processing of
+ // the same column possible.
+ //
+ // X 7/16 X 7/16
+ // 3/16 5/16 1/16 ==> 0 0 3/16 5/16 1/16
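+ //
+ // Worked example: with the Floyd-Steinberg kernel and shift = 2, the error
+ // that pixel (y, x) sends to (y + 1, x - 1) lands, after the mapping, in
+ // column (x - 1) + (y + 1) * 2 = (x + y * 2) + 1, i.e. one column to the
+ // right of the current pixel's shifted column. All kernel taps land
+ // strictly to the right, which is what makes processing a whole column in
+ // parallel safe.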
+
+ // Figure out the size of the rectangle containing all shifted pixels.
+ // The rectangle's height is unchanged.
+ int shifted_width = width + (height - 1) * kernel->shift;
+
+ // We process all pixels of the shifted rectangle column by column, with
+ // a single global work group of size |block_size|.
+ // Figure out how many blocks are required to process all pixels. We need
+ // this explicitly so that the number of barrier() calls matches across all
+ // invocations.
+ int block_size = PL_MIN(glsl.max_group_threads, height);
+ int blocks = PL_DIV_UP(height * shifted_width, block_size);
+
+ // Since we know how many of the following columns will be affected while
+ // the current column is being processed, we only need to store the errors
+ // of a few columns in shared memory. Using a ring buffer further saves
+ // cost when iterating to the next column.
+ //
+ int ring_buffer_rows = height + PL_EDF_MAX_DY;
+ int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1;
+ ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_UINT,
+ .name = "ring_buffer_size",
+ .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns },
+ .compile_time = true,
+ });
+
+ // Compute shared memory requirements and try enabling compute shader.
+ size_t shmem_req = ring_buffer_rows * ring_buffer_columns * sizeof(uint32_t);
+ if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) {
+ PL_ERR(sh, "Cannot execute error diffusion kernel: too old GPU or "
+ "insufficient compute shader memory!");
+ return false;
+ }
+
+ ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->input_tex,
+ .desc = {
+ .name = "input_tex",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ });
+
+ ident_t out_img = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->output_tex,
+ .desc = {
+ .name = "output_tex",
+ .type = PL_DESC_STORAGE_IMG,
+ .access = PL_DESC_ACCESS_WRITEONLY,
+ },
+ });
+
+ sh->output = PL_SHADER_SIG_NONE;
+ sh_describef(sh, "error diffusion (%s, %d bits)",
+ kernel->name, params->new_depth);
+
+ // Defines the ring buffer in shared memory.
+ GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size);
+ GLSL("// pl_shader_error_diffusion \n"
+ // Safeguard against accidental over-execution
+ "if (gl_WorkGroupID != uvec3(0)) \n"
+ " return; \n"
+ // Initialize the ring buffer.
+ "for (uint i = gl_LocalInvocationIndex; i < "$"; i+=gl_WorkGroupSize.x)\n"
+ " err_rgb8[i] = 0u; \n"
+
+ // Main block loop; the barrier ensures the previous block has been fully
+ // processed before processing of the next one starts.
+ "for (uint block_id = 0; block_id < "$"; block_id++) { \n"
+ "barrier(); \n"
+ // Compute the coordinate of the pixel we are currently processing,
+ // both before and after the shift mapping.
+ "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex; \n"
+ "const uint height = "$"; \n"
+ "int y = int(id %% height), x_shifted = int(id / height); \n"
+ "int x = x_shifted - y * %d; \n"
+ // Proceed only if we are processing a valid pixel.
+ "if (x >= 0 && x < "$") { \n"
+ // The index of the current pixel in the ring buffer.
+ "uint idx = uint(x_shifted * "$" + y) %% "$"; \n"
+ // Fetch the current pixel.
+ "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0); \n"
+ "vec3 pix = pix_orig.rgb; \n",
+ ring_buffer_size,
+ SH_UINT(blocks),
+ SH_UINT(height),
+ kernel->shift,
+ SH_INT(width),
+ SH_INT(ring_buffer_rows),
+ ring_buffer_size,
+ in_tex);
+
+ // The dithering will quantize pixel values into multiples of 1/dither_quant.
+ int dither_quant = (1 << params->new_depth) - 1;
+
+ // We encode the errors of all three RGB components into a single 32-bit
+ // unsigned integer. The error we propagate from the current pixel is in the
+ // range [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious,
+ // the sum of all errors propagated into a pixel is also within the same
+ // range. It's possible to map errors in this range into [-127, 127], and
+ // use an unsigned 8-bit integer to store each of them (using standard two's
+ // complement). The three 8-bit unsigned integers can then be encoded into a
+ // single 32-bit unsigned integer, with two 4-bit paddings to prevent
+ // overflow from the addition operations affecting the other components.
+ // There are at most 12 addition operations on each pixel, so 4 bits of
+ // padding should be enough. The overflow from the R component is discarded.
+ //
+ // The following figure shows what the encoding looks like.
+ //
+ // +------------------------------------+
+ // |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB|
+ // +------------------------------------+
+ //
+
+ // The bitshift positions for the R and G components.
+ const int bitshift_r = 24, bitshift_g = 12;
+ // The multiplier we use to map [-0.5, 0.5] to [-127, 127].
+ const int uint8_mul = 127 * 2;
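+
+ // Worked example of the packing: error bytes R = -64, G = 3 and B = 20
+ // (two's complement) are stored as 0xC0, 0x03 and 0x14, giving
+ // err_u32 = 0xC0003014. Adding the bias constant
+ // (128u << 24) | (128u << 12) | 128u = 0x80080080 yields 0x40083094 (the
+ // carry out of the R byte is discarded), and taking (byte - 128) per
+ // component recovers -64, 3 and 20 again.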
+
+ GLSL(// Add the error previously propagated into current pixel, and clear
+ // it in the ring buffer.
+ "uint err_u32 = err_rgb8[idx] + %uu; \n"
+ "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128, \n"
+ " int((err_u32 >> %d) & 0xFFu) - 128, \n"
+ " int( err_u32 & 0xFFu) - 128) / %d.0; \n"
+ "err_rgb8[idx] = 0u; \n"
+ // Write the dithered pixel.
+ "vec3 dithered = round(pix); \n"
+ "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a)); \n"
+ // Prepare for error propagation pass
+ "vec3 err_divided = (pix - dithered) * %d.0 / %d.0; \n"
+ "ivec3 tmp; \n",
+ (128u << bitshift_r) | (128u << bitshift_g) | 128u,
+ dither_quant, bitshift_r, bitshift_g, uint8_mul,
+ out_img, dither_quant,
+ uint8_mul, kernel->divisor);
+
+ // Group error propagations with the same weight factor together, in order
+ // to reduce the number of error encoding operations.
+ for (int dividend = 1; dividend <= kernel->divisor; dividend++) {
+ bool err_assigned = false;
+
+ for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
+ for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
+ if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend)
+ continue;
+
+ if (!err_assigned) {
+ err_assigned = true;
+
+ GLSL("tmp = ivec3(round(err_divided * %d.0)); \n"
+ "err_u32 = (uint(tmp.r & 0xFF) << %d) | \n"
+ " (uint(tmp.g & 0xFF) << %d) | \n"
+ " uint(tmp.b & 0xFF); \n",
+ dividend,
+ bitshift_r, bitshift_g);
+ }
+
+ int shifted_x = x + y * kernel->shift;
+
+ // Unlike the right border, errors propagated out past the left
+ // border would remain in the ring buffer and produce visible
+ // artifacts near the left border (especially for shift=3
+ // kernels), so guard against that case here.
+ if (x < 0)
+ GLSL("if (x >= %d) \n", -x);
+
+ // Calculate the new position in the ring buffer to propagate
+ // the error into.
+ int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
+ GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n",
+ ring_buffer_delta, ring_buffer_size);
+ }
+ }
+ }
+
+ GLSL("}} \n"); // end of main loop + valid pixel conditional
+ return true;
+}
diff --git a/src/shaders/film_grain.c b/src/shaders/film_grain.c
new file mode 100644
index 0000000..b1d25ff
--- /dev/null
+++ b/src/shaders/film_grain.c
@@ -0,0 +1,65 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
+bool pl_needs_film_grain(const struct pl_film_grain_params *params)
+{
+ switch (params->data.type) {
+ case PL_FILM_GRAIN_NONE: return false;
+ case PL_FILM_GRAIN_AV1: return pl_needs_fg_av1(params);
+ case PL_FILM_GRAIN_H274: return pl_needs_fg_h274(params);
+ default: pl_unreachable();
+ }
+}
+
+struct sh_grain_obj {
+ pl_shader_obj av1;
+ pl_shader_obj h274;
+};
+
+static void sh_grain_uninit(pl_gpu gpu, void *ptr)
+{
+ struct sh_grain_obj *obj = ptr;
+ pl_shader_obj_destroy(&obj->av1);
+ pl_shader_obj_destroy(&obj->h274);
+}
+
+bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state,
+ const struct pl_film_grain_params *params)
+{
+ if (!pl_needs_film_grain(params)) {
+ // FIXME: Instead of erroring, sample directly
+ SH_FAIL(sh, "pl_shader_film_grain called but no film grain needs to be "
+ "applied, test with `pl_needs_film_grain` first!");
+ return false;
+ }
+
+ struct sh_grain_obj *obj;
+ obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_FILM_GRAIN,
+ struct sh_grain_obj, sh_grain_uninit);
+ if (!obj)
+ return false;
+
+ switch (params->data.type) {
+ case PL_FILM_GRAIN_NONE: return false;
+ case PL_FILM_GRAIN_AV1: return pl_shader_fg_av1(sh, &obj->av1, params);
+ case PL_FILM_GRAIN_H274: return pl_shader_fg_h274(sh, &obj->h274, params);
+ default: pl_unreachable();
+ }
+}
diff --git a/src/shaders/film_grain.h b/src/shaders/film_grain.h
new file mode 100644
index 0000000..f6498c1
--- /dev/null
+++ b/src/shaders/film_grain.h
@@ -0,0 +1,75 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#include <libplacebo/shaders/film_grain.h>
+
+bool pl_needs_fg_av1(const struct pl_film_grain_params *);
+bool pl_needs_fg_h274(const struct pl_film_grain_params *);
+
+bool pl_shader_fg_av1(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *);
+bool pl_shader_fg_h274(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *);
+
+// Common helper function
+static inline enum pl_channel channel_map(int i, const struct pl_film_grain_params *params)
+{
+ static const enum pl_channel map_rgb[3] = {
+ [PL_CHANNEL_G] = PL_CHANNEL_Y,
+ [PL_CHANNEL_B] = PL_CHANNEL_CB,
+ [PL_CHANNEL_R] = PL_CHANNEL_CR,
+ };
+
+ static const enum pl_channel map_xyz[3] = {
+ [1] = PL_CHANNEL_Y, // Y
+ [2] = PL_CHANNEL_CB, // Z
+ [0] = PL_CHANNEL_CR, // X
+ };
+
+ if (i >= params->components)
+ return PL_CHANNEL_NONE;
+
+ int comp = params->component_mapping[i];
+ if (comp < 0 || comp > 2)
+ return PL_CHANNEL_NONE;
+
+ switch (params->repr->sys) {
+ case PL_COLOR_SYSTEM_UNKNOWN:
+ case PL_COLOR_SYSTEM_RGB:
+ return map_rgb[comp];
+ case PL_COLOR_SYSTEM_XYZ:
+ return map_xyz[comp];
+
+ case PL_COLOR_SYSTEM_BT_601:
+ case PL_COLOR_SYSTEM_BT_709:
+ case PL_COLOR_SYSTEM_SMPTE_240M:
+ case PL_COLOR_SYSTEM_BT_2020_NC:
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ case PL_COLOR_SYSTEM_BT_2100_PQ:
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ case PL_COLOR_SYSTEM_DOLBYVISION:
+ case PL_COLOR_SYSTEM_YCGCO:
+ return comp;
+
+ case PL_COLOR_SYSTEM_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
diff --git a/src/shaders/film_grain_av1.c b/src/shaders/film_grain_av1.c
new file mode 100644
index 0000000..3b11ea3
--- /dev/null
+++ b/src/shaders/film_grain_av1.c
@@ -0,0 +1,1001 @@
+/*
+ * This file is part of libplacebo, which is normally licensed under the terms
+ * of the LGPL v2.1+. However, this file (film_grain_av1.c) is also available
+ * under the terms of the more permissive MIT license:
+ *
+ * Copyright (c) 2018-2019 Niklas Haas
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
+// Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512
+static const int16_t gaussian_sequence[2048] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484
+};
+
+static inline int get_random_number(int bits, uint16_t *state)
+{
+ int r = *state;
+ uint16_t bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+ *state = (r >> 1) | (bit << 15);
+
+ return (*state >> (16 - bits)) & ((1 << bits) - 1);
+}
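+
+// For example, a call with *state == 0x0001 and bits == 8 computes feedback
+// bit 1 (only the bit-0 tap is set), advances the state to 0x8000 and
+// returns 0x8000 >> 8 = 128.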
+
+static inline int round2(int x, int shift)
+{
+ if (!shift)
+ return x;
+
+ return (x + (1 << (shift - 1))) >> shift;
+}
+
+enum {
+ BLOCK_SIZE = 32,
+ SCALING_LUT_SIZE = 256,
+
+ GRAIN_WIDTH = 82,
+ GRAIN_HEIGHT = 73,
+ // On the GPU we only need a subsection of this
+ GRAIN_WIDTH_LUT = 64,
+ GRAIN_HEIGHT_LUT = 64,
+ GRAIN_PAD_LUT = 9,
+
+ // For subsampled grain textures
+ SUB_GRAIN_WIDTH = 44,
+ SUB_GRAIN_HEIGHT = 38,
+ SUB_GRAIN_WIDTH_LUT = GRAIN_WIDTH_LUT >> 1,
+ SUB_GRAIN_HEIGHT_LUT = GRAIN_HEIGHT_LUT >> 1,
+ SUB_GRAIN_PAD_LUT = 6,
+};
+
+// Contains the shift by which the offsets are indexed
+enum offset {
+ OFFSET_TL = 24,
+ OFFSET_T = 16,
+ OFFSET_L = 8,
+ OFFSET_N = 0,
+};
+
+// Helper struct holding some common constants, computed by get_grain_scale()
+struct grain_scale {
+ int grain_center;
+ int grain_min;
+ int grain_max;
+ float texture_scale;
+ float grain_scale;
+};
+
+static inline int bit_depth(const struct pl_color_repr *repr)
+{
+ int depth = PL_DEF(repr->bits.color_depth,
+ PL_DEF(repr->bits.sample_depth, 8));
+ pl_assert(depth >= 8);
+ return PL_MIN(depth, 12);
+}
+
+static struct grain_scale get_grain_scale(const struct pl_film_grain_params *params)
+{
+ int bits = bit_depth(params->repr);
+ struct grain_scale ret = {
+ .grain_center = 128 << (bits - 8),
+ };
+
+ ret.grain_min = -ret.grain_center;
+ ret.grain_max = (256 << (bits - 8)) - 1 - ret.grain_center;
+
+ struct pl_color_repr repr = *params->repr;
+ ret.texture_scale = pl_color_repr_normalize(&repr);
+
+ // Since our color samples are normalized to the range [0, 1], we need to
+ // scale down grain values from the scale [0, 2^b - 1] to this range.
+ ret.grain_scale = 1.0 / ((1 << bits) - 1);
+
+ return ret;
+}
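+
+// For a 10-bit stream, for example, this yields grain_center = 512,
+// grain_min = -512, grain_max = 511 and grain_scale = 1/1023, i.e. grain
+// values on the full [0, 2^b - 1] signal scale are mapped onto the
+// normalized [0, 1] range used by the shader.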
+
+// Generates the basic grain table (LumaGrain in the spec).
+static void generate_grain_y(float out[GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT],
+ int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH],
+ const struct pl_film_grain_params *params)
+{
+ const struct pl_av1_grain_data *data = &params->data.params.av1;
+ struct grain_scale scale = get_grain_scale(params);
+ uint16_t seed = (uint16_t) params->data.seed;
+ int bits = bit_depth(params->repr);
+ int shift = 12 - bits + data->grain_scale_shift;
+ pl_assert(shift >= 0);
+
+ for (int y = 0; y < GRAIN_HEIGHT; y++) {
+ for (int x = 0; x < GRAIN_WIDTH; x++) {
+ int16_t value = gaussian_sequence[ get_random_number(11, &seed) ];
+ buf[y][x] = round2(value, shift);
+ }
+ }
+
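+ // Apply the auto-regressive filter. For ar_coeff_lag = L, the coefficient
+ // loop below visits the L rows above the current sample plus the samples
+ // to its left, in raster order; e.g. for L = 1 the visiting order is
+ // (dy, dx) = (-1, -1), (-1, 0), (-1, 1), (0, -1).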
+ const int ar_pad = 3;
+ int ar_lag = data->ar_coeff_lag;
+
+ for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
+ for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+ const int8_t *coeff = data->ar_coeffs_y;
+ int sum = 0;
+ for (int dy = -ar_lag; dy <= 0; dy++) {
+ for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+ if (!dx && !dy)
+ break;
+ sum += *(coeff++) * buf[y + dy][x + dx];
+ }
+ }
+
+ int16_t grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+ grain = PL_CLAMP(grain, scale.grain_min, scale.grain_max);
+ buf[y][x] = grain;
+ }
+ }
+
+ for (int y = 0; y < GRAIN_HEIGHT_LUT; y++) {
+ for (int x = 0; x < GRAIN_WIDTH_LUT; x++) {
+ int16_t grain = buf[y + GRAIN_PAD_LUT][x + GRAIN_PAD_LUT];
+ out[y][x] = grain * scale.grain_scale;
+ }
+ }
+}
+
+static void generate_grain_uv(float *out, int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH],
+ const int16_t buf_y[GRAIN_HEIGHT][GRAIN_WIDTH],
+ enum pl_channel channel, int sub_x, int sub_y,
+ const struct pl_film_grain_params *params)
+{
+ const struct pl_av1_grain_data *data = &params->data.params.av1;
+ struct grain_scale scale = get_grain_scale(params);
+ int bits = bit_depth(params->repr);
+ int shift = 12 - bits + data->grain_scale_shift;
+ pl_assert(shift >= 0);
+
+ uint16_t seed = params->data.seed;
+ if (channel == PL_CHANNEL_CB) {
+ seed ^= 0xb524;
+ } else if (channel == PL_CHANNEL_CR) {
+ seed ^= 0x49d8;
+ }
+
+ int chromaW = sub_x ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
+ int chromaH = sub_y ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+ const int8_t *coeffs[] = {
+ [PL_CHANNEL_CB] = data->ar_coeffs_uv[0],
+ [PL_CHANNEL_CR] = data->ar_coeffs_uv[1],
+ };
+
+ for (int y = 0; y < chromaH; y++) {
+ for (int x = 0; x < chromaW; x++) {
+ int16_t value = gaussian_sequence[ get_random_number(11, &seed) ];
+ buf[y][x] = round2(value, shift);
+ }
+ }
+
+ const int ar_pad = 3;
+ int ar_lag = data->ar_coeff_lag;
+
+ for (int y = ar_pad; y < chromaH; y++) {
+ for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+ const int8_t *coeff = coeffs[channel];
+ pl_assert(coeff);
+ int sum = 0;
+ for (int dy = -ar_lag; dy <= 0; dy++) {
+ for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+ // For the final (current) pixel, we need to add in the
+ // contribution from the luma grain texture
+ if (!dx && !dy) {
+ if (!data->num_points_y)
+ break;
+ int luma = 0;
+ int lumaX = ((x - ar_pad) << sub_x) + ar_pad;
+ int lumaY = ((y - ar_pad) << sub_y) + ar_pad;
+ for (int i = 0; i <= sub_y; i++) {
+ for (int j = 0; j <= sub_x; j++) {
+ luma += buf_y[lumaY + i][lumaX + j];
+ }
+ }
+ luma = round2(luma, sub_x + sub_y);
+ sum += luma * (*coeff);
+ break;
+ }
+
+ sum += *(coeff++) * buf[y + dy][x + dx];
+ }
+ }
+
+ int16_t grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+ grain = PL_CLAMP(grain, scale.grain_min, scale.grain_max);
+ buf[y][x] = grain;
+ }
+ }
+
+ int lutW = GRAIN_WIDTH_LUT >> sub_x;
+ int lutH = GRAIN_HEIGHT_LUT >> sub_y;
+ int padX = sub_x ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT;
+ int padY = sub_y ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT;
+
+ for (int y = 0; y < lutH; y++) {
+ for (int x = 0; x < lutW; x++) {
+ int16_t grain = buf[y + padY][x + padX];
+ out[y * lutW + x] = grain * scale.grain_scale;
+ }
+ }
+}
+
+static void generate_offsets(void *pbuf, const struct sh_lut_params *params)
+{
+ const struct pl_film_grain_data *data = params->priv;
+ unsigned int *buf = pbuf;
+ pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t));
+
+ for (int y = 0; y < params->height; y++) {
+ uint16_t state = data->seed;
+ state ^= ((y * 37 + 178) & 0xFF) << 8;
+ state ^= ((y * 173 + 105) & 0xFF);
+
+ for (int x = 0; x < params->width; x++) {
+ unsigned int *offsets = &buf[y * params->width + x];
+
+ uint8_t val = get_random_number(8, &state);
+ uint8_t val_l = x ? (offsets - 1)[0] : 0;
+ uint8_t val_t = y ? (offsets - params->width)[0] : 0;
+ uint8_t val_tl = x && y ? (offsets - params->width - 1)[0] : 0;
+
+ // Encode four offsets into a single 32-bit integer for the
+ // convenience of the GPU. That way only one LUT fetch is
+ // required for the entire block.
+ *offsets = ((uint32_t) val_tl << OFFSET_TL)
+ | ((uint32_t) val_t << OFFSET_T)
+ | ((uint32_t) val_l << OFFSET_L)
+ | ((uint32_t) val << OFFSET_N);
+ }
+ }
+}
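+
+// For example, in the packing above, offsets val_tl = 0x12, val_t = 0x34,
+// val_l = 0x56 and val = 0x78 combine into 0x12345678; the shader later uses
+// the OFFSET_* shifts to pick out the byte it needs, and further splits it
+// into its two 4-bit block offsets.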
+
+static void generate_scaling(void *pdata, const struct sh_lut_params *params)
+{
+ assert(params->width == SCALING_LUT_SIZE && params->comps == 1);
+ float *data = pdata;
+
+ struct {
+ int num;
+ uint8_t (*points)[2];
+ const struct pl_av1_grain_data *data;
+ } *ctx = params->priv;
+
+ float range = 1 << ctx->data->scaling_shift;
+
+ // Fill up the preceding entries with the initial value
+ for (int i = 0; i < ctx->points[0][0]; i++)
+ data[i] = ctx->points[0][1] / range;
+
+ // Linearly interpolate the values in the middle
+ for (int i = 0; i < ctx->num - 1; i++) {
+ int bx = ctx->points[i][0];
+ int by = ctx->points[i][1];
+ int dx = ctx->points[i + 1][0] - bx;
+ int dy = ctx->points[i + 1][1] - by;
+ int delta = dy * ((0x10000 + (dx >> 1)) / dx);
+ for (int x = 0; x < dx; x++) {
+ int v = by + ((x * delta + 0x8000) >> 16);
+ data[bx + x] = v / range;
+ }
+ }
+
+ // Fill up the remaining entries with the final value
+ for (int i = ctx->points[ctx->num - 1][0]; i < SCALING_LUT_SIZE; i++)
+ data[i] = ctx->points[ctx->num - 1][1] / range;
+}
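+
+// As a quick sanity check of the fixed-point interpolation above: for the
+// two points (0, 20) and (40, 60), delta = 40 * ((0x10000 + 20) / 40) =
+// 65520, so at x = 20 we get v = 20 + ((20 * 65520 + 0x8000) >> 16) =
+// 20 + 20 = 40, i.e. exactly the midpoint, before the final division by
+// `range`.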
+
+static void sample(pl_shader sh, enum offset off, ident_t lut, int idx,
+ int sub_x, int sub_y)
+{
+ int dx = (off & OFFSET_L) ? 1 : 0,
+ dy = (off & OFFSET_T) ? 1 : 0;
+
+ static const char *index_strs[] = {
+ [0] = ".x",
+ [1] = ".y",
+ };
+
+ GLSL("offset = uvec2(%du, %du) * uvec2((data >> %d) & 0xFu, \n"
+ " (data >> %d) & 0xFu);\n"
+ "pos = offset + local_id.xy + uvec2(%d, %d); \n"
+ "val = "$"(pos)%s; \n",
+ sub_x ? 1 : 2, sub_y ? 1 : 2, off + 4, off,
+ (BLOCK_SIZE >> sub_x) * dx,
+ (BLOCK_SIZE >> sub_y) * dy,
+ lut, idx >= 0 ? index_strs[idx] : "");
+}
+
+struct grain_obj_av1 {
+ // LUT objects for the offsets, grain and scaling luts
+ pl_shader_obj lut_offsets;
+ pl_shader_obj lut_grain[2];
+ pl_shader_obj lut_scaling[3];
+
+ // Previous parameters used to check reusability
+ struct pl_film_grain_data data;
+ struct pl_color_repr repr;
+ bool fg_has_y;
+ bool fg_has_u;
+ bool fg_has_v;
+
+ // Space to store the temporary arrays, reused
+ uint32_t *offsets;
+ float grain[2][GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT];
+ int16_t grain_tmp_y[GRAIN_HEIGHT][GRAIN_WIDTH];
+ int16_t grain_tmp_uv[GRAIN_HEIGHT][GRAIN_WIDTH];
+};
+
+static void av1_grain_uninit(pl_gpu gpu, void *ptr)
+{
+ struct grain_obj_av1 *obj = ptr;
+ pl_shader_obj_destroy(&obj->lut_offsets);
+ for (int i = 0; i < PL_ARRAY_SIZE(obj->lut_grain); i++)
+ pl_shader_obj_destroy(&obj->lut_grain[i]);
+ for (int i = 0; i < PL_ARRAY_SIZE(obj->lut_scaling); i++)
+ pl_shader_obj_destroy(&obj->lut_scaling[i]);
+ *obj = (struct grain_obj_av1) {0};
+}
+
+bool pl_needs_fg_av1(const struct pl_film_grain_params *params)
+{
+ const struct pl_av1_grain_data *data = &params->data.params.av1;
+ bool has_y = data->num_points_y > 0;
+ bool has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma;
+ bool has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma;
+
+ for (int i = 0; i < 3; i++) {
+ enum pl_channel channel = channel_map(i, params);
+ if (channel == PL_CHANNEL_Y && has_y)
+ return true;
+ if (channel == PL_CHANNEL_CB && has_u)
+ return true;
+ if (channel == PL_CHANNEL_CR && has_v)
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool av1_grain_data_eq(const struct pl_film_grain_data *da,
+ const struct pl_film_grain_data *db)
+{
+ const struct pl_av1_grain_data *a = &da->params.av1, *b = &db->params.av1;
+
+ // Only check the fields that are relevant for grain LUT generation
+ return da->seed == db->seed &&
+ a->chroma_scaling_from_luma == b->chroma_scaling_from_luma &&
+ a->scaling_shift == b->scaling_shift &&
+ a->ar_coeff_lag == b->ar_coeff_lag &&
+ a->ar_coeff_shift == b->ar_coeff_shift &&
+ a->grain_scale_shift == b->grain_scale_shift &&
+ !memcmp(a->ar_coeffs_y, b->ar_coeffs_y, sizeof(a->ar_coeffs_y)) &&
+ !memcmp(a->ar_coeffs_uv, b->ar_coeffs_uv, sizeof(a->ar_coeffs_uv));
+}
+
+static void fill_grain_lut(void *data, const struct sh_lut_params *params)
+{
+ struct grain_obj_av1 *obj = params->priv;
+ size_t entries = params->width * params->height * params->comps;
+ memcpy(data, obj->grain, entries * sizeof(float));
+}
+
+bool pl_shader_fg_av1(pl_shader sh, pl_shader_obj *grain_state,
+ const struct pl_film_grain_params *params)
+{
+ int sub_x = 0, sub_y = 0;
+ int tex_w = params->tex->params.w,
+ tex_h = params->tex->params.h;
+
+ if (params->luma_tex) {
+ sub_x = params->luma_tex->params.w > tex_w;
+ sub_y = params->luma_tex->params.h > tex_h;
+ }
+
+ const struct pl_av1_grain_data *data = &params->data.params.av1;
+ bool fg_has_y = data->num_points_y > 0;
+ bool fg_has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma;
+ bool fg_has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma;
+
+ bool tex_is_y = false, tex_is_cb = false, tex_is_cr = false;
+ for (int i = 0; i < 3; i++) {
+ switch (channel_map(i, params)) {
+ case PL_CHANNEL_Y: tex_is_y = true; break;
+ case PL_CHANNEL_CB: tex_is_cb = true; break;
+ case PL_CHANNEL_CR: tex_is_cr = true; break;
+ default: break;
+ };
+ }
+
+ if (tex_is_y && (sub_x || sub_y)) {
+ PL_WARN(sh, "pl_film_grain_params.channels includes PL_CHANNEL_Y but "
+ "plane is subsampled, this makes no sense. Continuing anyway "
+ "but output is likely incorrect.");
+ }
+
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, tex_w, tex_h))
+ return false;
+
+ pl_gpu gpu = SH_GPU(sh);
+ if (!gpu) {
+ PL_ERR(sh, "AV1 film grain synthesis requires a non-NULL pl_gpu!");
+ return false;
+ }
+
+ // Disable generation for unneeded component types
+ fg_has_y &= tex_is_y;
+ fg_has_u &= tex_is_cb;
+ fg_has_v &= tex_is_cr;
+
+ int bw = BLOCK_SIZE >> sub_x;
+ int bh = BLOCK_SIZE >> sub_y;
+ bool is_compute = sh_try_compute(sh, bw, bh, false, sizeof(uint32_t));
+
+ struct grain_obj_av1 *obj;
+ obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_AV1_GRAIN,
+ struct grain_obj_av1, av1_grain_uninit);
+ if (!obj)
+ return false;
+
+ // Note: In theory we could check only the parameters related to luma or
+ // only related to chroma and skip updating for changes to irrelevant
+ // parts, but this is probably not worth it since the seed is expected to
+ // change per frame anyway.
+ bool needs_update = !av1_grain_data_eq(&params->data, &obj->data) ||
+ !pl_color_repr_equal(params->repr, &obj->repr) ||
+ fg_has_y != obj->fg_has_y ||
+ fg_has_u != obj->fg_has_u ||
+ fg_has_v != obj->fg_has_v;
+
+ if (needs_update) {
+ // This is needed even for chroma, so statically generate it
+ generate_grain_y(obj->grain[0], obj->grain_tmp_y, params);
+ }
+
+ ident_t lut[3];
+ int idx[3] = {-1};
+
+ if (fg_has_y) {
+ lut[0] = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut_grain[0],
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_TEXTURE,
+ .width = GRAIN_WIDTH_LUT,
+ .height = GRAIN_HEIGHT_LUT,
+ .comps = 1,
+ .update = needs_update,
+ .dynamic = true,
+ .fill = fill_grain_lut,
+ .priv = obj,
+ ));
+
+ if (!lut[0]) {
+ SH_FAIL(sh, "Failed generating/uploading luma grain LUT!");
+ return false;
+ }
+ }
+
+ // Try merging the chroma LUTs into a single texture
+ int chroma_comps = 0;
+ if (fg_has_u) {
+ generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv,
+ obj->grain_tmp_y, PL_CHANNEL_CB, sub_x, sub_y,
+ params);
+ idx[1] = chroma_comps++;
+ }
+ if (fg_has_v) {
+ generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv,
+ obj->grain_tmp_y, PL_CHANNEL_CR, sub_x, sub_y,
+ params);
+ idx[2] = chroma_comps++;
+ }
+
+ if (chroma_comps > 0) {
+ lut[1] = lut[2] = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut_grain[1],
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_TEXTURE,
+ .width = GRAIN_WIDTH_LUT >> sub_x,
+ .height = GRAIN_HEIGHT_LUT >> sub_y,
+ .comps = chroma_comps,
+ .update = needs_update,
+ .dynamic = true,
+ .fill = fill_grain_lut,
+ .priv = obj,
+ ));
+
+ if (!lut[1]) {
+ SH_FAIL(sh, "Failed generating/uploading chroma grain LUT!");
+ return false;
+ }
+
+ if (chroma_comps == 1)
+ idx[1] = idx[2] = -1;
+ }
+
+ ident_t offsets = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut_offsets,
+ .var_type = PL_VAR_UINT,
+ .lut_type = SH_LUT_AUTO,
+ .width = PL_ALIGN2(tex_w << sub_x, 128) / 32,
+ .height = PL_ALIGN2(tex_h << sub_y, 128) / 32,
+ .comps = 1,
+ .update = needs_update,
+ .dynamic = true,
+ .fill = generate_offsets,
+ .priv = (void *) &params->data,
+ ));
+
+ if (!offsets) {
+ SH_FAIL(sh, "Failed generating/uploading block offsets LUT!");
+ return false;
+ }
+
+ // For the scaling LUTs, we assume they'll be relatively constant
+ // throughout the video so doing some extra work to avoid reinitializing
+ // them constantly is probably worth it. Probably.
+ const struct pl_av1_grain_data *obj_data = &obj->data.params.av1;
+ bool scaling_changed = false;
+ if (fg_has_y || data->chroma_scaling_from_luma) {
+ scaling_changed |= data->num_points_y != obj_data->num_points_y;
+ scaling_changed |= memcmp(data->points_y, obj_data->points_y,
+ sizeof(data->points_y));
+ }
+
+ if (fg_has_u && !data->chroma_scaling_from_luma) {
+ scaling_changed |= data->num_points_uv[0] != obj_data->num_points_uv[0];
+ scaling_changed |= memcmp(data->points_uv[0],
+ obj_data->points_uv[0],
+ sizeof(data->points_uv[0]));
+ }
+
+ if (fg_has_v && !data->chroma_scaling_from_luma) {
+ scaling_changed |= data->num_points_uv[1] != obj_data->num_points_uv[1];
+ scaling_changed |= memcmp(data->points_uv[1],
+ obj_data->points_uv[1],
+ sizeof(data->points_uv[1]));
+ }
+
+ ident_t scaling[3] = {0};
+ for (int i = 0; i < 3; i++) {
+ struct {
+ int num;
+ const uint8_t (*points)[2];
+ const struct pl_av1_grain_data *data;
+ } priv;
+
+ priv.data = data;
+ if (i == 0 || data->chroma_scaling_from_luma) {
+ priv.num = data->num_points_y;
+ priv.points = &data->points_y[0];
+ } else {
+ priv.num = data->num_points_uv[i - 1];
+ priv.points = &data->points_uv[i - 1][0];
+ }
+
+ // Skip scaling for unneeded channels
+ bool has_c[3] = { fg_has_y, fg_has_u, fg_has_v };
+ if (has_c[i] && priv.num > 0) {
+ scaling[i] = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut_scaling[i],
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_LINEAR,
+ .width = SCALING_LUT_SIZE,
+ .comps = 1,
+ .update = scaling_changed,
+ .dynamic = true,
+ .fill = generate_scaling,
+ .priv = &priv,
+ ));
+
+ if (!scaling[i]) {
+ SH_FAIL(sh, "Failed generating/uploading scaling LUTs!");
+ return false;
+ }
+ }
+ }
+
+ // Done updating LUTs
+ obj->data = params->data;
+ obj->repr = *params->repr;
+ obj->fg_has_y = fg_has_y;
+ obj->fg_has_u = fg_has_u;
+ obj->fg_has_v = fg_has_v;
+
+ sh_describe(sh, "AV1 film grain");
+ GLSL("vec4 color; \n"
+ "// pl_shader_film_grain (AV1) \n"
+ "{ \n"
+ "uvec2 offset; \n"
+ "uvec2 pos; \n"
+ "float val; \n"
+ "float grain; \n");
+
+ if (is_compute) {
+ GLSL("uvec2 block_id = gl_WorkGroupID.xy; \n"
+ "uvec2 local_id = gl_LocalInvocationID.xy; \n"
+ "uvec2 global_id = gl_GlobalInvocationID.xy; \n");
+ } else {
+ GLSL("uvec2 global_id = uvec2(gl_FragCoord); \n"
+ "uvec2 block_id = global_id / uvec2(%d, %d); \n"
+ "uvec2 local_id = global_id - uvec2(%d, %d) * block_id; \n",
+ bw, bh, bw, bh);
+ }
+
+ // Load the data vector which holds the offsets
+ if (is_compute) {
+ ident_t id = sh_fresh(sh, "data");
+ GLSLH("shared uint "$"; \n", id);
+ GLSL("if (gl_LocalInvocationIndex == 0u) \n"
+ " "$" = uint("$"(block_id)); \n"
+ "barrier(); \n"
+ "uint data = "$"; \n",
+ id, offsets, id);
+ } else {
+ GLSL("uint data = uint("$"(block_id)); \n", offsets);
+ }
+
+ struct grain_scale scale = get_grain_scale(params);
+ pl_color_repr_normalize(params->repr);
+ int bits = PL_DEF(params->repr->bits.color_depth, 8);
+ pl_assert(bits >= 8);
+
+ ident_t minValue, maxLuma, maxChroma;
+ if (pl_color_levels_guess(params->repr) == PL_COLOR_LEVELS_LIMITED) {
+ float out_scale = (1 << bits) / ((1 << bits) - 1.0);
+ minValue = SH_FLOAT(16 / 256.0 * out_scale);
+ maxLuma = SH_FLOAT(235 / 256.0 * out_scale);
+ maxChroma = SH_FLOAT(240 / 256.0 * out_scale);
+ if (!pl_color_system_is_ycbcr_like(params->repr->sys))
+ maxChroma = maxLuma;
+ } else {
+ minValue = SH_FLOAT(0.0);
+ maxLuma = SH_FLOAT(1.0);
+ maxChroma = SH_FLOAT(1.0);
+ }
+
+ // Load the color value of the tex itself
+ ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->tex,
+ .desc = (struct pl_desc) {
+ .name = "tex",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ });
+
+ ident_t tex_scale = SH_FLOAT(scale.texture_scale);
+ GLSL("color = vec4("$") * texelFetch("$", ivec2(global_id), 0); \n",
+ tex_scale, tex);
+
+ // If we need access to the external luma plane, load it now
+ if (tex_is_cb || tex_is_cr) {
+ GLSL("float averageLuma; \n");
+ if (tex_is_y) {
+ // We already have the luma channel as part of the pre-sampled color
+ for (int i = 0; i < 3; i++) {
+ if (channel_map(i, params) == PL_CHANNEL_Y) {
+ GLSL("averageLuma = color["$"]; \n", SH_INT(i));
+ break;
+ }
+ }
+ } else {
+ // Luma channel not present in image, attach it separately
+ pl_assert(params->luma_tex);
+ ident_t luma = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->luma_tex,
+ .desc = (struct pl_desc) {
+ .name = "luma",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ });
+
+ GLSL("pos = global_id * uvec2(%du, %du); \n"
+ "averageLuma = texelFetch("$", ivec2(pos), 0)["$"]; \n"
+ "averageLuma *= "$"; \n",
+ 1 << sub_x, 1 << sub_y,
+ luma, SH_INT(params->luma_comp),
+ tex_scale);
+ }
+ }
+
+ ident_t grain_min = SH_FLOAT(scale.grain_min * scale.grain_scale);
+ ident_t grain_max = SH_FLOAT(scale.grain_max * scale.grain_scale);
+
+ for (int i = 0; i < params->components; i++) {
+ enum pl_channel c = channel_map(i, params);
+ if (c == PL_CHANNEL_NONE)
+ continue;
+ if (!scaling[c])
+ continue;
+
+ sample(sh, OFFSET_N, lut[c], idx[c], sub_x, sub_y);
+ GLSL("grain = val; \n");
+
+ if (data->overlap) {
+ const char *weights[] = { "vec2(27.0, 17.0)", "vec2(23.0, 22.0)" };
+
+ // X-direction overlapping
+ GLSL("if (block_id.x > 0u && local_id.x < %du) { \n"
+ "vec2 w = %s / 32.0; \n"
+ "if (local_id.x == 1u) w.xy = w.yx; \n",
+ 2 >> sub_x, weights[sub_x]);
+ sample(sh, OFFSET_L, lut[c], idx[c], sub_x, sub_y);
+ GLSL("grain = dot(vec2(val, grain), w); \n"
+ "} \n");
+
+ // Y-direction overlapping
+ GLSL("if (block_id.y > 0u && local_id.y < %du) { \n"
+ "vec2 w = %s / 32.0; \n"
+ "if (local_id.y == 1u) w.xy = w.yx; \n",
+ 2 >> sub_y, weights[sub_y]);
+
+ // We need to special-case the top left pixels since these need to
+ // pre-blend the top-left offset block before blending vertically
+ GLSL(" if (block_id.x > 0u && local_id.x < %du) {\n"
+ " vec2 w2 = %s / 32.0; \n"
+ " if (local_id.x == 1u) w2.xy = w2.yx; \n",
+ 2 >> sub_x, weights[sub_x]);
+ sample(sh, OFFSET_TL, lut[c], idx[c], sub_x, sub_y);
+ GLSL(" float tmp = val; \n");
+ sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y);
+ GLSL(" val = dot(vec2(tmp, val), w2); \n"
+ " } else { \n");
+ sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y);
+ GLSL(" } \n"
+ "grain = dot(vec2(val, grain), w); \n"
+ "} \n");
+
+ // Correctly clip the interpolated grain
+ GLSL("grain = clamp(grain, "$", "$"); \n", grain_min, grain_max);
+ }
+
+ if (c == PL_CHANNEL_Y) {
+ GLSL("color[%d] += "$"(color[%d]) * grain; \n"
+ "color[%d] = clamp(color[%d], "$", "$"); \n",
+ i, scaling[c], i,
+ i, i, minValue, maxLuma);
+ } else {
+ GLSL("val = averageLuma; \n");
+ if (!data->chroma_scaling_from_luma) {
+ // We need to load some extra variables for the mixing. Do this
+ // using sh_var instead of hard-coding them to avoid shader
+ // recompilation when these values change.
+ ident_t mult = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("mult"),
+ .data = &(float[2]){
+ data->uv_mult_luma[c - 1] / 64.0,
+ data->uv_mult[c - 1] / 64.0,
+ },
+ });
+
+ int c_offset = (unsigned) data->uv_offset[c - 1] << (bits - 8);
+ ident_t offset = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("offset"),
+ .data = &(float) { c_offset * scale.grain_scale },
+ });
+
+ GLSL("val = dot(vec2(val, color[%d]), "$"); \n"
+ "val += "$"; \n",
+ i, mult, offset);
+ }
+ GLSL("color[%d] += "$"(val) * grain; \n"
+ "color[%d] = clamp(color[%d], "$", "$"); \n",
+ i, scaling[c],
+ i, i, minValue, maxChroma);
+ }
+ }
+
+ GLSL("} \n");
+ return true;
+}
diff --git a/src/shaders/film_grain_h274.c b/src/shaders/film_grain_h274.c
new file mode 100644
index 0000000..6d524da
--- /dev/null
+++ b/src/shaders/film_grain_h274.c
@@ -0,0 +1,815 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "shaders.h"
+#include "shaders/film_grain.h"
+
+static const int8_t Gaussian_LUT[2048+4];
+static const uint32_t Seed_LUT[256];
+static const int8_t R64T[64][64];
+
+static void prng_shift(uint32_t *state)
+{
+ // Primitive polynomial x^31 + x^3 + 1 (modulo 2)
+ uint32_t x = *state;
+ uint8_t feedback = 1u ^ (x >> 2) ^ (x >> 30);
+ *state = (x << 1) | (feedback & 1u);
+}
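+
+// For example, starting from *state == 1 the feedback bit is 1 ^ 0 ^ 0 = 1,
+// so the state advances to (1 << 1) | 1 = 3. generate_slice() below calls
+// this once per group of four Gaussian samples.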
+
+
+static void generate_slice(float *out, size_t out_width, uint8_t h, uint8_t v,
+ int8_t grain[64][64], int16_t tmp[64][64])
+{
+ const uint8_t freq_h = ((h + 3) << 2) - 1;
+ const uint8_t freq_v = ((v + 3) << 2) - 1;
+ uint32_t seed = Seed_LUT[h + v * 13];
+
+ // Initialize with random gaussian values, using the output array as a
+ // temporary buffer for these intermediate values.
+ //
+ // Note: To make the subsequent matrix multiplication cache friendlier, we
+ // store each *column* of the starting image in a *row* of `grain`
+ for (int y = 0; y <= freq_v; y++) {
+ for (int x = 0; x <= freq_h; x += 4) {
+ uint16_t offset = seed % 2048;
+ grain[x + 0][y] = Gaussian_LUT[offset + 0];
+ grain[x + 1][y] = Gaussian_LUT[offset + 1];
+ grain[x + 2][y] = Gaussian_LUT[offset + 2];
+ grain[x + 3][y] = Gaussian_LUT[offset + 3];
+ prng_shift(&seed);
+ }
+ }
+
+ grain[0][0] = 0;
+
+ // 64x64 inverse integer transform
+ for (int y = 0; y < 64; y++) {
+ for (int x = 0; x <= freq_h; x++) {
+ int32_t sum = 0;
+ for (int p = 0; p <= freq_v; p++)
+ sum += R64T[y][p] * grain[x][p];
+ tmp[y][x] = (sum + 128) >> 8;
+ }
+ }
+
+ for (int y = 0; y < 64; y++) {
+ for (int x = 0; x < 64; x++) {
+ int32_t sum = 0;
+ for (int p = 0; p <= freq_h; p++)
+ sum += tmp[y][p] * R64T[x][p]; // R64T^T = R64
+ sum = (sum + 128) >> 8;
+ grain[y][x] = PL_CLAMP(sum, -127, 127);
+ }
+ }
+
+ static const uint8_t deblock_factors[13] = {
+ 64, 71, 77, 84, 90, 96, 103, 109, 116, 122, 128, 128, 128
+ };
+
+ // Deblock horizontal edges by simple attenuation of values
+ const uint8_t deblock_coeff = deblock_factors[v];
+ for (int y = 0; y < 64; y++) {
+ switch (y % 8) {
+ case 0: case 7:
+ // Deblock
+ for (int x = 0; x < 64; x++)
+ out[x] = ((grain[y][x] * deblock_coeff) >> 7) / 255.0;
+ break;
+
+ case 1: case 2:
+ case 3: case 4:
+ case 5: case 6:
+ // No deblock
+ for (int x = 0; x < 64; x++)
+ out[x] = grain[y][x] / 255.0;
+ break;
+
+ default: pl_unreachable();
+ }
+
+ out += out_width;
+ }
+}
+
+static void fill_grain_lut(void *data, const struct sh_lut_params *params)
+{
+ struct {
+ int8_t grain[64][64];
+ int16_t tmp[64][64];
+ } *tmp = pl_alloc_ptr(NULL, tmp);
+
+ float *out = data;
+ assert(params->var_type == PL_VAR_FLOAT);
+
+ for (int h = 0; h < 13; h++) {
+ for (int v = 0; v < 13; v++) {
+ float *slice = out + (h * 64) * params->width + (v * 64);
+ generate_slice(slice, params->width, h, v, tmp->grain, tmp->tmp);
+ }
+ }
+
+ pl_free(tmp);
+}
+
+bool pl_needs_fg_h274(const struct pl_film_grain_params *params)
+{
+ const struct pl_h274_grain_data *data = &params->data.params.h274;
+ if (data->model_id != 0)
+ return false;
+
+ for (int i = 0; i < 3; i++) {
+ enum pl_channel channel = channel_map(i, params);
+ if (channel < 0 || channel >= 3)
+ continue;
+ if (data->component_model_present[channel])
+ return true;
+ }
+
+ return false;
+}
+
+bool pl_shader_fg_h274(pl_shader sh, pl_shader_obj *grain_state,
+ const struct pl_film_grain_params *params)
+{
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, params->tex->params.w, params->tex->params.h))
+ return false;
+
+ size_t shmem_req = 0;
+ ident_t group_sum = NULL_IDENT;
+
+ const struct pl_glsl_version glsl = sh_glsl(sh);
+ if (glsl.subgroup_size < 8*8) {
+ group_sum = sh_fresh(sh, "group_sum");
+ shmem_req += sizeof(int);
+ GLSLH("shared int "$"; \n", group_sum);
+ GLSL($" = 0; barrier(); \n", group_sum);
+ }
+
+ if (!sh_try_compute(sh, 8, 8, false, shmem_req)) {
+ SH_FAIL(sh, "H.274 film grain synthesis requires compute shaders!");
+ return false;
+ }
+
+ ident_t db = sh_lut(sh, sh_lut_params(
+ .object = grain_state,
+ .var_type = PL_VAR_FLOAT,
+ .lut_type = SH_LUT_TEXTURE,
+ .width = 13 * 64,
+ .height = 13 * 64,
+ .comps = 1,
+ .fill = fill_grain_lut,
+ .signature = CACHE_KEY_H274, // doesn't depend on anything
+ .cache = SH_CACHE(sh),
+ ));
+
+ sh_describe(sh, "H.274 film grain");
+ GLSL("vec4 color; \n"
+ "// pl_shader_film_grain (H.274) \n"
+ "{ \n");
+
+ // Load the color value of the tex itself
+ ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+ .binding.object = params->tex,
+ .desc = (struct pl_desc) {
+ .name = "tex",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ });
+
+ GLSL("ivec2 pos = ivec2(gl_GlobalInvocationID); \n"
+ "color = vec4("$") * texelFetch("$", pos, 0); \n",
+ SH_FLOAT(pl_color_repr_normalize(params->repr)), tex);
+
+ const struct pl_h274_grain_data *data = &params->data.params.h274;
+ ident_t scale_factor = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_float("scale_factor"),
+ .data = &(float){ 1.0 / (1 << (data->log2_scale_factor + 6)) },
+ });
+
+ // pcg3d (http://www.jcgt.org/published/0009/03/02/)
+ GLSL("uvec3 pcg = uvec3("$", gl_WorkGroupID.xy / 2u); \n"
+ "pcg = pcg * 1664525u + 1013904223u; \n"
+ "pcg.x += pcg.y * pcg.z; \n"
+ "pcg.y += pcg.z * pcg.x; \n"
+ "pcg.z += pcg.x * pcg.y; \n"
+ "pcg ^= pcg >> 16u; \n"
+ "pcg.x += pcg.y * pcg.z; \n"
+ "pcg.y += pcg.z * pcg.x; \n"
+ "pcg.z += pcg.x * pcg.y; \n",
+ sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_uint("seed"),
+ .data = &(unsigned int){ params->data.seed },
+ }));
+
+ for (int idx = 0; idx < params->components; idx++) {
+ enum pl_channel c = channel_map(idx, params);
+ if (c == PL_CHANNEL_NONE)
+ continue;
+ if (!data->component_model_present[c])
+ continue;
+
+ GLSL("// component %d\n{\n", c);
+
+ // Compute the local 8x8 average
+ GLSL("float avg = color[%d] / 64.0; \n", c);
+
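+ // The 8x8 average is accumulated in fixed point (scaled by `precision`),
+ // since atomicAdd() on shared variables is only defined for integer types
+ // in standard GLSL.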
+ const int precision = 10000000;
+ if (glsl.subgroup_size) {
+ GLSL("avg = subgroupAdd(avg); \n");
+
+ if (glsl.subgroup_size < 8*8) {
+ GLSL("if (subgroupElect()) \n"
+ " atomicAdd("$", int(avg * %d.0)); \n"
+ "barrier(); \n"
+ "avg = float("$") / %d.0; \n",
+ group_sum, precision, group_sum, precision);
+ }
+ } else {
+ GLSL("atomicAdd("$", int(avg * %d.0)); \n"
+ "barrier(); \n"
+ "avg = float("$") / %d.0; \n",
+ group_sum, precision, group_sum, precision);
+ }
+
+ // Hard-coded unrolled loop, to avoid having to load a dynamically
+ // sized array into the shader, and to optimize for the very common
+ // case of there being only a single intensity interval
+ GLSL("uint val; \n");
+ for (int i = 0; i < data->num_intensity_intervals[c]; i++) {
+ ident_t bounds = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("bounds"),
+ .data = &(float[2]) {
+ data->intensity_interval_lower_bound[c][i] / 255.0,
+ data->intensity_interval_upper_bound[c][i] / 255.0,
+ },
+ });
+
+ const uint8_t num_values = data->num_model_values[c];
+ uint8_t h = num_values > 1 ? data->comp_model_value[c][i][1] : 8;
+ uint8_t v = num_values > 2 ? data->comp_model_value[c][i][2] : h;
+ h = PL_CLAMP(h, 2, 14) - 2;
+ v = PL_CLAMP(v, 2, 14) - 2;
+ // FIXME: double h/v for subsampled planes!
+
+ // Reduce scale for chroma planes
+ int16_t scale = data->comp_model_value[c][i][0];
+ if (c > 0 && pl_color_system_is_ycbcr_like(params->repr->sys))
+ scale >>= 1;
+
+ pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t));
+ ident_t values = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_uint("comp_model_value"),
+ .data = &(unsigned int) {
+ (uint16_t) scale << 16 | h << 8 | v,
+ },
+ });
+
+ GLSL("if (avg >= "$".x && avg <= "$".y) \n"
+ " val = "$"; else \n",
+ bounds, bounds, values);
+ }
+ GLSL(" val = 0u; \n");
+
+ // Extract the grain parameters from comp_model_value
+ GLSL("uvec2 offset = uvec2((val & 0xFF00u) >> 2, \n"
+ " (val & 0xFFu) << 6); \n"
+ "float scale = "$" * float(int(val >> 16)); \n"
+ // Add randomness
+ "uint rand = pcg[%d]; \n"
+ "offset.x += (rand >> 16u) %% 52u; \n"
+ "offset.y += (rand & 0xFFFFu) %% 56u; \n"
+ "offset.x &= 0xFFFCu; \n"
+ "offset.y &= 0xFFF8u; \n"
+ "if ((rand & 1u) == 1u) scale = -scale; \n"
+ // Add local offset and compute grain
+ "offset += 8u * (gl_WorkGroupID.xy %% 2u); \n"
+ "offset += gl_LocalInvocationID.xy; \n"
+ "float grain = "$"(offset); \n"
+ "color[%d] += scale * grain; \n",
+ scale_factor, c, db, c);
+
+ // TODO: Deblocking?
+
+ GLSL("}\n");
+ }
+
+ GLSL("} \n");
+ return true;
+}
+
+// These tables are all taken from the SMPTE RDD 5-2006 specification
+static const int8_t Gaussian_LUT[2048+4] = {
+ -11, 12, 103, -11, 42, -35, 12, 59, 77, 98, -87, 3, 65, -78, 45, 56, -51, 21,
+ 13, -11, -20, -19, 33, -127, 17, -6, -105, 18, 19, 71, 48, -10, -38, 42,
+ -2, 75, -67, 52, -90, 33, -47, 21, -3, -56, 49, 1, -57, -42, -1, 120, -127,
+ -108, -49, 9, 14, 127, 122, 109, 52, 127, 2, 7, 114, 19, 30, 12, 77, 112,
+ 82, -61, -127, 111, -52, -29, 2, -49, -24, 58, -29, -73, 12, 112, 67, 79,
+ -3, -114, -87, -6, -5, 40, 58, -81, 49, -27, -31, -34, -105, 50, 16, -24,
+ -35, -14, -15, -127, -55, -22, -55, -127, -112, 5, -26, -72, 127, 127, -2,
+ 41, 87, -65, -16, 55, 19, 91, -81, -65, -64, 35, -7, -54, 99, -7, 88, 125,
+ -26, 91, 0, 63, 60, -14, -23, 113, -33, 116, 14, 26, 51, -16, 107, -8, 53,
+ 38, -34, 17, -7, 4, -91, 6, 63, 63, -15, 39, -36, 19, 55, 17, -51, 40, 33,
+ -37, 126, -39, -118, 17, -30, 0, 19, 98, 60, 101, -12, -73, -17, -52, 98,
+ 3, 3, 60, 33, -3, -2, 10, -42, -106, -38, 14, 127, 16, -127, -31, -86, -39,
+ -56, 46, -41, 75, 23, -19, -22, -70, 74, -54, -2, 32, -45, 17, -92, 59,
+ -64, -67, 56, -102, -29, -87, -34, -92, 68, 5, -74, -61, 93, -43, 14, -26,
+ -38, -126, -17, 16, -127, 64, 34, 31, 93, 17, -51, -59, 71, 77, 81, 127,
+ 127, 61, 33, -106, -93, 0, 0, 75, -69, 71, 127, -19, -111, 30, 23, 15, 2,
+ 39, 92, 5, 42, 2, -6, 38, 15, 114, -30, -37, 50, 44, 106, 27, 119, 7, -80,
+ 25, -68, -21, 92, -11, -1, 18, 41, -50, 79, -127, -43, 127, 18, 11, -21,
+ 32, -52, 27, -88, -90, -39, -19, -10, 24, -118, 72, -24, -44, 2, 12, 86,
+ -107, 39, -33, -127, 47, 51, -24, -22, 46, 0, 15, -35, -69, -2, -74, 24,
+ -6, 0, 29, -3, 45, 32, -32, 117, -45, 79, -24, -17, -109, -10, -70, 88,
+ -48, 24, -91, 120, -37, 50, -127, 58, 32, -82, -10, -17, -7, 46, -127, -15,
+ 89, 127, 17, 98, -39, -33, 37, 42, -40, -32, -21, 105, -19, 19, 19, -59,
+ -9, 30, 0, -127, 34, 127, -84, 75, 24, -40, -49, -127, -107, -14, 45, -75,
+ 1, 30, -20, 41, -68, -40, 12, 127, -3, 5, 20, -73, -59, -127, -3, -3, -53,
+ -6, -119, 93, 120, -80, -50, 0, 20, -46, 67, 78, -12, -22, -127, 36, -41,
+ 56, 119, -5, -116, -22, 68, -14, -90, 24, -82, -44, -127, 107, -25, -37,
+ 40, -7, -7, -82, 5, -87, 44, -34, 9, -127, 39, 70, 49, -63, 74, -49, 109,
+ -27, -89, -47, -39, 44, 49, -4, 60, -42, 80, 9, -127, -9, -56, -49, 125,
+ -66, 47, 36, 117, 15, -11, -96, 109, 94, -17, -56, 70, 8, -14, -5, 50, 37,
+ -45, 120, -30, -76, 40, -46, 6, 3, 69, 17, -78, 1, -79, 6, 127, 43, 26,
+ 127, -127, 28, -55, -26, 55, 112, 48, 107, -1, -77, -1, 53, -9, -22, -43,
+ 123, 108, 127, 102, 68, 46, 5, 1, 123, -13, -55, -34, -49, 89, 65, -105,
+ -5, 94, -53, 62, 45, 30, 46, 18, -35, 15, 41, 47, -98, -24, 94, -75, 127,
+ -114, 127, -68, 1, -17, 51, -95, 47, 12, 34, -45, -75, 89, -107, -9, -58,
+ -29, -109, -24, 127, -61, -13, 77, -45, 17, 19, 83, -24, 9, 127, -66, 54,
+ 4, 26, 13, 111, 43, -113, -22, 10, -24, 83, 67, -14, 75, -123, 59, 127,
+ -12, 99, -19, 64, -38, 54, 9, 7, 61, -56, 3, -57, 113, -104, -59, 3, -9,
+ -47, 74, 85, -55, -34, 12, 118, 28, 93, -72, 13, -99, -72, -20, 30, 72,
+ -94, 19, -54, 64, -12, -63, -25, 65, 72, -10, 127, 0, -127, 103, -20, -73,
+ -112, -103, -6, 28, -42, -21, -59, -29, -26, 19, -4, -51, 94, -58, -95,
+ -37, 35, 20, -69, 127, -19, -127, -22, -120, -53, 37, 74, -127, -1, -12,
+ -119, -53, -28, 38, 69, 17, 16, -114, 89, 62, 24, 37, -23, 49, -101, -32,
+ -9, -95, -53, 5, 93, -23, -49, -8, 51, 3, -75, -90, -10, -39, 127, -86,
+ -22, 20, 20, 113, 75, 52, -31, 92, -63, 7, -12, 46, 36, 101, -43, -17, -53,
+ -7, -38, -76, -31, -21, 62, 31, 62, 20, -127, 31, 64, 36, 102, -85, -10,
+ 77, 80, 58, -79, -8, 35, 8, 80, -24, -9, 3, -17, 72, 127, 83, -87, 55, 18,
+ -119, -123, 36, 10, 127, 56, -55, 113, 13, 26, 32, -13, -48, 22, -13, 5,
+ 58, 27, 24, 26, -11, -36, 37, -92, 78, 81, 9, 51, 14, 67, -13, 0, 32, 45,
+ -76, 32, -39, -22, -49, -127, -27, 31, -9, 36, 14, 71, 13, 57, 12, -53,
+ -86, 53, -44, -35, 2, 127, 12, -66, -44, 46, -115, 3, 10, 56, -35, 119,
+ -19, -61, 52, -59, -127, -49, -23, 4, -5, 17, -82, -6, 127, 25, 79, 67, 64,
+ -25, 14, -64, -37, -127, -28, 21, -63, 66, -53, -41, 109, -62, 15, -22, 13,
+ 29, -63, 20, 27, 95, -44, -59, -116, -10, 79, -49, 22, -43, -16, 46, -47,
+ -120, -36, -29, -52, -44, 29, 127, -13, 49, -9, -127, 75, -28, -23, 88, 59,
+ 11, -95, 81, -59, 58, 60, -26, 40, -92, -3, -22, -58, -45, -59, -22, -53,
+ 71, -29, 66, -32, -23, 14, -17, -66, -24, -28, -62, 47, 38, 17, 16, -37,
+ -24, -11, 8, -27, -19, 59, 45, -49, -47, -4, -22, -81, 30, -67, -127, 74,
+ 102, 5, -18, 98, 34, -66, 42, -52, 7, -59, 24, -58, -19, -24, -118, -73,
+ 91, 15, -16, 79, -32, -79, -127, -36, 41, 77, -83, 2, 56, 22, -75, 127,
+ -16, -21, 12, 31, 56, -113, -127, 90, 55, 61, 12, 55, -14, -113, -14, 32,
+ 49, -67, -17, 91, -10, 1, 21, 69, -70, 99, -19, -112, 66, -90, -10, -9,
+ -71, 127, 50, -81, -49, 24, 61, -61, -111, 7, -41, 127, 88, -66, 108, -127,
+ -6, 36, -14, 41, -50, 14, 14, 73, -101, -28, 77, 127, -8, -100, 88, 38,
+ 121, 88, -125, -60, 13, -94, -115, 20, -67, -87, -94, -119, 44, -28, -30,
+ 18, 5, -53, -61, 20, -43, 11, -77, -60, 13, 29, 3, 6, -72, 38, -60, -11,
+ 108, -53, 41, 66, -12, -127, -127, -49, 24, 29, 46, 36, 91, 34, -33, 116,
+ -51, -34, -52, 91, 7, -83, 73, -26, -103, 24, -10, 76, 84, 5, 68, -80, -13,
+ -17, -32, -48, 20, 50, 26, 10, 63, -104, -14, 37, 127, 114, 97, 35, 1, -33,
+ -55, 127, -124, -33, 61, -7, 119, -32, -127, -53, -42, 63, 3, -5, -26, 70,
+ -58, -33, -44, -43, 34, -56, -127, 127, 25, -35, -11, 16, -81, 29, -58, 40,
+ -127, -127, 20, -47, -11, -36, -63, -52, -32, -82, 78, -76, -73, 8, 27,
+ -72, -9, -74, -85, -86, -57, 25, 78, -10, -97, 35, -65, 8, -59, 14, 1, -42,
+ 32, -88, -44, 17, -3, -9, 59, 40, 12, -108, -40, 24, 34, 18, -28, 2, 51,
+ -110, -4, 100, 1, 65, 22, 0, 127, 61, 45, 25, -31, 6, 9, -7, -48, 99, 16,
+ 44, -2, -40, 32, -39, -52, 10, -110, -19, 56, -127, 69, 26, 51, 92, 40, 61,
+ -52, 45, -38, 13, 85, 122, 27, 66, 45, -111, -83, -3, 31, 37, 19, -36, 58,
+ 71, 39, -78, -47, 58, -78, 8, -62, -36, -14, 61, 42, -127, 71, -4, 24, -54,
+ 52, -127, 67, -4, -42, 30, -63, 59, -3, -1, -18, -46, -92, -81, -96, -14,
+ -53, -10, -11, -77, 13, 1, 8, -67, -127, 127, -28, 26, -14, 18, -13, -26,
+ 2, 10, -46, -32, -15, 27, -31, -59, 59, 77, -121, 28, 40, -54, -62, -31,
+ -21, -37, -32, -6, -127, -25, -60, 70, -127, 112, -127, 127, 88, -7, 116,
+ 110, 53, 87, -127, 3, 16, 23, 74, -106, -51, 3, 74, -82, -112, -74, 65, 81,
+ 25, 53, 127, -45, -50, -103, -41, -65, -29, 79, -67, 64, -33, -30, -8, 127,
+ 0, -13, -51, 67, -14, 5, -92, 29, -35, -8, -90, -57, -3, 36, 43, 44, -31,
+ -69, -7, 36, 39, -51, 43, -81, 58, 6, 127, 12, 57, 66, 46, 59, -43, -42,
+ 41, -15, -120, 24, 3, -11, 19, -13, 51, 28, 3, 55, -48, -12, -1, 2, 97,
+ -19, 29, 42, 13, 43, 78, -44, 56, -108, -43, -19, 127, 15, -11, -18, -81,
+ 83, -37, 77, -109, 15, 65, -50, 43, 12, 13, 27, 28, 61, 57, 30, 26, 106,
+ -18, 56, 13, 97, 4, -8, -62, -103, 94, 108, -44, 52, 27, -47, -9, 105, -53,
+ 46, 89, 103, -33, 38, -34, 55, 51, 70, -94, -35, -87, -107, -19, -31, 9,
+ -19, 79, -14, 77, 5, -19, -107, 85, 21, -45, -39, -42, 9, -29, 74, 47, -75,
+ 60, -127, 120, -112, -57, -32, 41, 7, 79, 76, 66, 57, 41, -25, 31, 37, -47,
+ -36, 43, -73, -37, 63, 127, -69, -52, 90, -33, -61, 60, -55, 44, 15, 4,
+ -67, 13, -92, 64, 29, -39, -3, 83, -2, -38, -85, -86, 58, 35, -69, -61, 29,
+ -37, -95, -78, 4, 30, -4, -32, -80, -22, -9, -77, 46, 7, -93, -71, 65, 9,
+ -50, 127, -70, 26, -12, -39, -114, 63, -127, -100, 4, -32, 111, 22, -60,
+ 65, -101, 26, -42, 21, -59, -27, -74, 2, -94, 6, 126, 5, 76, -88, -9, -43,
+ -101, 127, 1, 125, 92, -63, 52, 56, 4, 81, -127, 127, 80, 127, -29, 30,
+ 116, -74, -17, -57, 105, 48, 45, 25, -72, 48, -38, -108, 31, -34, 4, -11,
+ 41, -127, 52, -104, -43, -37, 52, 2, 47, 87, -9, 77, 27, -41, -25, 90, 86,
+ -56, 75, 10, 33, 78, 58, 127, 127, -7, -73, 49, -33, -106, -35, 38, 57, 53,
+ -17, -4, 83, 52, -108, 54, -125, 28, 23, 56, -43, -88, -17, -6, 47, 23, -9,
+ 0, -13, 111, 75, 27, -52, -38, -34, 39, 30, 66, 39, 38, -64, 38, 3, 21,
+ -32, -51, -28, 54, -38, -87, 20, 52, 115, 18, -81, -70, 0, -14, -46, -46,
+ -3, 125, 16, -14, 23, -82, -84, -69, -20, -65, -127, 9, 81, -49, 61, 7,
+ -36, -45, -42, 57, -26, 47, 20, -85, 46, -13, 41, -37, -75, -60, 86, -78,
+ -127, 12, 50, 2, -3, 13, 47, 5, 19, -78, -55, -27, 65, -71, 12, -108, 20,
+ -16, 11, -31, 63, -55, 37, 75, -17, 127, -73, -33, -28, -120, 105, 68, 106,
+ -103, -106, 71, 61, 2, 23, -3, 33, -5, -15, -67, -15, -23, -54, 15, -63,
+ 76, 58, -110, 1, 83, -27, 22, 75, -39, -17, -11, 64, -17, -127, -54, -66,
+ 31, 96, 116, 3, -114, -7, -108, -63, 97, 9, 50, 8, 75, -28, 72, 112, -36,
+ -112, 95, -50, 23, -13, -19, 55, 21, 23, 92, 91, 22, -49, 16, -75, 23, 9,
+ -49, -97, -37, 49, -36, 36, -127, -86, 43, 127, -24, -24, 84, 83, -35, -34,
+ -12, 109, 102, -38, 51, -68, 34, 19, -22, 49, -32, 127, 40, 24, -93, -4,
+ -3, 105, 3, -58, -18, 8, 127, -18, 125, 68, 69, -62, 30, -36, 54, -57, -24,
+ 17, 43, -36, -27, -57, -67, -21, -10, -49, 68, 12, 65, 4, 48, 55, 127, -75,
+ 44, 89, -66, -13, -78, -82, -91, 22, 30, 33, -40, -87, -34, 96, -91, 39,
+ 10, -64, -3, -12, 127, -50, -37, -56, 23, -35, -36, -54, 90, -91, 2, 50,
+ 77, -6, -127, 16, 46, -5, -73, 0, -56, -18, -72, 28, 93, 60, 49, 20, 18,
+ 111, -111, 32, -83, 47, 47, -10, 35, -88, 43, 57, -98, 127, -17, 0, 1, -39,
+ -127, -2, 0, 63, 93, 0, 36, -66, -61, -19, 39, -127, 58, 50, -17, 127, 88,
+ -43, -108, -51, -16, 7, -36, 68, 46, -14, 107, 40, 57, 7, 19, 8, 3, 88,
+ -90, -92, -18, -21, -24, 13, 7, -4, -78, -91, -4, 8, -35, -5, 19, 2, -111,
+ 4, -66, -81, 122, -20, -34, -37, -84, 127, 68, 46, 17, 47,
+
+ // Repeat the beginning of the array to allow wrapping reads
+ -11, 12, 103, -11,
+};
+
+static const uint32_t Seed_LUT[256] = {
+ 747538460, 1088979410, 1744950180, 1767011913, 1403382928,
+ 521866116, 1060417601, 2110622736, 1557184770, 105289385, 585624216,
+ 1827676546, 1191843873, 1018104344, 1123590530, 663361569, 2023850500,
+ 76561770, 1226763489, 80325252, 1992581442, 502705249, 740409860,
+ 516219202, 557974537, 1883843076, 720112066, 1640137737, 1820967556,
+ 40667586, 155354121, 1820967557, 1115949072, 1631803309, 98284748,
+ 287433856, 2119719977, 988742797, 1827432592, 579378475, 1017745956,
+ 1309377032, 1316535465, 2074315269, 1923385360, 209722667, 1546228260,
+ 168102420, 135274561, 355958469, 248291472, 2127839491, 146920100,
+ 585982612, 1611702337, 696506029, 1386498192, 1258072451, 1212240548,
+ 1043171860, 1217404993, 1090770605, 1386498193, 169093201, 541098240,
+ 1468005469, 456510673, 1578687785, 1838217424, 2010752065, 2089828354,
+ 1362717428, 970073673, 854129835, 714793201, 1266069081, 1047060864,
+ 1991471829, 1098097741, 913883585, 1669598224, 1337918685, 1219264706,
+ 1799741108, 1834116681, 683417731, 1120274457, 1073098457, 1648396544,
+ 176642749, 31171789, 718317889, 1266977808, 1400892508, 549749008,
+ 1808010512, 67112961, 1005669825, 903663673, 1771104465, 1277749632,
+ 1229754427, 950632997, 1979371465, 2074373264, 305357524, 1049387408,
+ 1171033360, 1686114305, 2147468765, 1941195985, 117709841, 809550080,
+ 991480851, 1816248997, 1561503561, 329575568, 780651196, 1659144592,
+ 1910793616, 604016641, 1665084765, 1530186961, 1870928913, 809550081,
+ 2079346113, 71307521, 876663040, 1073807360, 832356664, 1573927377,
+ 204073344, 2026918147, 1702476788, 2043881033, 57949587, 2001393952,
+ 1197426649, 1186508931, 332056865, 950043140, 890043474, 349099312,
+ 148914948, 236204097, 2022643605, 1441981517, 498130129, 1443421481,
+ 924216797, 1817491777, 1913146664, 1411989632, 929068432, 495735097,
+ 1684636033, 1284520017, 432816184, 1344884865, 210843729, 676364544,
+ 234449232, 12112337, 1350619139, 1753272996, 2037118872, 1408560528,
+ 533334916, 1043640385, 357326099, 201376421, 110375493, 541106497,
+ 416159637, 242512193, 777294080, 1614872576, 1535546636, 870600145,
+ 910810409, 1821440209, 1605432464, 1145147393, 951695441, 1758494976,
+ 1506656568, 1557150160, 608221521, 1073840384, 217672017, 684818688,
+ 1750138880, 16777217, 677990609, 953274371, 1770050213, 1359128393,
+ 1797602707, 1984616737, 1865815816, 2120835200, 2051677060, 1772234061,
+ 1579794881, 1652821009, 1742099468, 1887260865, 46468113, 1011925248,
+ 1134107920, 881643832, 1354774993, 472508800, 1892499769, 1752793472,
+ 1962502272, 687898625, 883538000, 1354355153, 1761673473, 944820481,
+ 2020102353, 22020353, 961597696, 1342242816, 964808962, 1355809701,
+ 17016649, 1386540177, 647682692, 1849012289, 751668241, 1557184768,
+ 127374604, 1927564752, 1045744913, 1614921984, 43588881, 1016185088,
+ 1544617984, 1090519041, 136122424, 215038417, 1563027841, 2026918145,
+ 1688778833, 701530369, 1372639488, 1342242817, 2036945104, 953274369,
+ 1750192384, 16842753, 964808960, 1359020032, 1358954497
+};
+
+// Note: This is pre-transposed, i.e. stored column-major order
+static const int8_t R64T[64][64] = {
+ {
+ 32, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 42,
+ 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 36, 35, 34, 34, 33,
+ 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
+ 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1,
+ }, {
+ 32, 45, 45, 44, 43, 42, 41, 39, 38, 36, 34, 31, 29, 26, 23, 20,
+ 17, 14, 11, 8, 4, 1, -2, -6, -9, -12, -15, -18, -21, -24, -27, -30,
+ -32, -34, -36, -38, -40, -41, -43, -44, -44, -45, -45, -45, -45, -45, -44, -43,
+ -42, -40, -39, -37, -35, -33, -30, -28, -25, -22, -19, -16, -13, -10, -7, -3,
+ }, {
+ 32, 45, 44, 42, 40, 37, 34, 30, 25, 20, 15, 10, 4, -1, -7, -12,
+ -17, -22, -27, -31, -35, -38, -41, -43, -44, -45, -45, -45, -43, -41, -39, -36,
+ -32, -28, -23, -18, -13, -8, -2, 3, 9, 14, 19, 24, 29, 33, 36, 39,
+ 42, 44, 45, 45, 45, 44, 43, 40, 38, 34, 30, 26, 21, 16, 11, 6,
+ }, {
+ 32, 45, 43, 39, 35, 30, 23, 16, 9, 1, -7, -14, -21, -28, -34, -38,
+ -42, -44, -45, -45, -43, -40, -36, -31, -25, -18, -11, -3, 4, 12, 19, 26,
+ 32, 37, 41, 44, 45, 45, 44, 41, 38, 33, 27, 20, 13, 6, -2, -10,
+ -17, -24, -30, -36, -40, -43, -45, -45, -44, -42, -39, -34, -29, -22, -15, -8,
+ }, {
+ 32, 44, 41, 36, 29, 20, 11, 1, -9, -18, -27, -34, -40, -44, -45, -45,
+ -42, -37, -30, -22, -13, -3, 7, 16, 25, 33, 39, 43, 45, 45, 43, 38,
+ 32, 24, 15, 6, -4, -14, -23, -31, -38, -42, -45, -45, -43, -39, -34, -26,
+ -17, -8, 2, 12, 21, 30, 36, 41, 44, 45, 44, 40, 35, 28, 19, 10,
+ }, {
+ 32, 44, 39, 31, 21, 10, -2, -14, -25, -34, -41, -45, -45, -42, -36, -28,
+ -17, -6, 7, 18, 29, 37, 43, 45, 44, 40, 34, 24, 13, 1, -11, -22,
+ -32, -39, -44, -45, -43, -38, -30, -20, -9, 3, 15, 26, 35, 41, 45, 45,
+ 42, 36, 27, 16, 4, -8, -19, -30, -38, -43, -45, -44, -40, -33, -23, -12,
+ }, {
+ 32, 43, 36, 26, 13, -1, -15, -28, -38, -44, -45, -42, -35, -24, -11, 3,
+ 17, 30, 39, 44, 45, 41, 34, 22, 9, -6, -19, -31, -40, -45, -45, -40,
+ -32, -20, -7, 8, 21, 33, 41, 45, 44, 39, 30, 18, 4, -10, -23, -34,
+ -42, -45, -44, -38, -29, -16, -2, 12, 25, 36, 43, 45, 43, 37, 27, 14,
+ }, {
+ 32, 42, 34, 20, 4, -12, -27, -38, -44, -45, -39, -28, -13, 3, 19, 33,
+ 42, 45, 43, 34, 21, 6, -11, -26, -38, -44, -45, -39, -29, -14, 2, 18,
+ 32, 41, 45, 43, 35, 22, 7, -10, -25, -37, -44, -45, -40, -30, -15, 1,
+ 17, 31, 41, 45, 43, 36, 23, 8, -9, -24, -36, -44, -45, -40, -30, -16,
+ }, {
+ 32, 41, 30, 14, -4, -22, -36, -44, -44, -37, -23, -6, 13, 30, 41, 45,
+ 42, 31, 15, -3, -21, -36, -44, -45, -38, -24, -7, 12, 29, 40, 45, 42,
+ 32, 16, -2, -20, -35, -44, -45, -38, -25, -8, 11, 28, 40, 45, 43, 33,
+ 17, -1, -19, -34, -43, -45, -39, -26, -9, 10, 27, 39, 45, 43, 34, 18,
+ }, {
+ 32, 40, 27, 8, -13, -31, -43, -45, -38, -22, -2, 18, 35, 44, 44, 34,
+ 17, -3, -23, -38, -45, -42, -30, -12, 9, 28, 41, 45, 40, 26, 7, -14,
+ -32, -43, -45, -37, -21, -1, 19, 36, 44, 44, 34, 16, -4, -24, -39, -45,
+ -42, -30, -11, 10, 29, 41, 45, 39, 25, 6, -15, -33, -43, -45, -36, -20,
+ }, {
+ 32, 39, 23, 1, -21, -38, -45, -40, -25, -3, 19, 37, 45, 41, 27, 6,
+ -17, -36, -45, -42, -29, -8, 15, 34, 44, 43, 30, 10, -13, -33, -44, -44,
+ -32, -12, 11, 31, 43, 44, 34, 14, -9, -30, -43, -45, -35, -16, 7, 28,
+ 42, 45, 36, 18, -4, -26, -41, -45, -38, -20, 2, 24, 40, 45, 39, 22,
+ }, {
+ 32, 38, 19, -6, -29, -43, -44, -31, -9, 16, 36, 45, 40, 22, -2, -26,
+ -42, -45, -34, -12, 13, 34, 45, 41, 25, 1, -23, -40, -45, -36, -15, 10,
+ 32, 44, 43, 28, 4, -20, -39, -45, -38, -18, 7, 30, 43, 44, 30, 8,
+ -17, -37, -45, -39, -21, 3, 27, 42, 44, 33, 11, -14, -35, -45, -41, -24,
+ }, {
+ 32, 37, 15, -12, -35, -45, -39, -18, 9, 33, 45, 40, 21, -6, -30, -44,
+ -42, -24, 2, 28, 43, 43, 27, 1, -25, -42, -44, -30, -4, 22, 41, 45,
+ 32, 8, -19, -39, -45, -34, -11, 16, 38, 45, 36, 14, -13, -36, -45, -38,
+ -17, 10, 34, 45, 40, 20, -7, -31, -44, -41, -23, 3, 29, 44, 43, 26,
+ }, {
+ 32, 36, 11, -18, -40, -45, -30, -3, 25, 43, 43, 24, -4, -31, -45, -39,
+ -17, 12, 36, 45, 35, 10, -19, -40, -44, -30, -2, 26, 43, 42, 23, -6,
+ -32, -45, -39, -16, 13, 37, 45, 34, 9, -20, -41, -44, -29, -1, 27, 44,
+ 42, 22, -7, -33, -45, -38, -15, 14, 38, 45, 34, 8, -21, -41, -44, -28,
+ }, {
+ 32, 34, 7, -24, -43, -41, -19, 12, 38, 45, 30, 1, -29, -45, -39, -14,
+ 17, 40, 44, 26, -4, -33, -45, -36, -9, 22, 43, 42, 21, -10, -36, -45,
+ -32, -3, 27, 44, 40, 16, -15, -39, -44, -28, 2, 31, 45, 37, 11, -20,
+ -42, -43, -23, 8, 35, 45, 34, 6, -25, -44, -41, -18, 13, 38, 45, 30,
+ }, {
+ 32, 33, 2, -30, -45, -36, -7, 26, 44, 38, 11, -22, -43, -40, -15, 18,
+ 42, 42, 19, -14, -40, -44, -23, 10, 38, 45, 27, -6, -35, -45, -30, 1,
+ 32, 45, 34, 3, -29, -45, -36, -8, 25, 44, 39, 12, -21, -43, -41, -16,
+ 17, 41, 43, 20, -13, -39, -44, -24, 9, 37, 45, 28, -4, -34, -45, -31,
+ }, {
+ 32, 31, -2, -34, -45, -28, 7, 37, 44, 24, -11, -39, -43, -20, 15, 41,
+ 42, 16, -19, -43, -40, -12, 23, 44, 38, 8, -27, -45, -35, -3, 30, 45,
+ 32, -1, -34, -45, -29, 6, 36, 45, 25, -10, -39, -44, -21, 14, 41, 42,
+ 17, -18, -43, -40, -13, 22, 44, 38, 9, -26, -45, -36, -4, 30, 45, 33,
+ }, {
+ 32, 30, -7, -38, -43, -18, 19, 44, 38, 6, -30, -45, -29, 8, 39, 43,
+ 17, -20, -44, -37, -4, 31, 45, 28, -9, -39, -43, -16, 21, 44, 36, 3,
+ -32, -45, -27, 10, 40, 42, 15, -22, -44, -36, -2, 33, 45, 26, -11, -40,
+ -42, -14, 23, 45, 35, 1, -34, -45, -25, 12, 41, 41, 13, -24, -45, -34,
+ }, {
+ 32, 28, -11, -41, -40, -8, 30, 45, 25, -14, -43, -38, -4, 33, 45, 22,
+ -17, -44, -36, -1, 35, 44, 19, -20, -44, -34, 2, 37, 43, 16, -23, -45,
+ -32, 6, 39, 42, 13, -26, -45, -30, 9, 40, 41, 10, -29, -45, -27, 12,
+ 42, 39, 7, -31, -45, -24, 15, 43, 38, 3, -34, -45, -21, 18, 44, 36,
+ }, {
+ 32, 26, -15, -44, -35, 3, 39, 41, 9, -31, -45, -20, 21, 45, 30, -10,
+ -42, -38, -2, 36, 43, 14, -27, -45, -25, 16, 44, 34, -4, -39, -41, -8,
+ 32, 45, 19, -22, -45, -30, 11, 42, 38, 1, -36, -43, -13, 28, 45, 24,
+ -17, -44, -34, 6, 40, 40, 7, -33, -44, -18, 23, 45, 29, -12, -43, -37,
+ }, {
+ 32, 24, -19, -45, -29, 14, 44, 33, -9, -42, -36, 3, 40, 39, 2, -37,
+ -42, -8, 34, 44, 13, -30, -45, -18, 25, 45, 23, -20, -45, -28, 15, 44,
+ 32, -10, -43, -36, 4, 40, 39, 1, -38, -41, -7, 34, 43, 12, -30, -45,
+ -17, 26, 45, 22, -21, -45, -27, 16, 44, 31, -11, -43, -35, 6, 41, 38,
+ }, {
+ 32, 22, -23, -45, -21, 24, 45, 20, -25, -45, -19, 26, 45, 18, -27, -45,
+ -17, 28, 45, 16, -29, -45, -15, 30, 44, 14, -30, -44, -13, 31, 44, 12,
+ -32, -44, -11, 33, 43, 10, -34, -43, -9, 34, 43, 8, -35, -42, -7, 36,
+ 42, 6, -36, -41, -4, 37, 41, 3, -38, -40, -2, 38, 40, 1, -39, -39,
+ }, {
+ 32, 20, -27, -45, -13, 33, 43, 6, -38, -39, 2, 41, 35, -10, -44, -30,
+ 17, 45, 23, -24, -45, -16, 30, 44, 9, -36, -41, -1, 40, 37, -7, -43,
+ -32, 14, 45, 26, -21, -45, -19, 28, 44, 12, -34, -42, -4, 38, 39, -3,
+ -42, -34, 11, 44, 29, -18, -45, -22, 25, 45, 15, -31, -43, -8, 36, 40,
+ }, {
+ 32, 18, -30, -43, -4, 39, 36, -10, -44, -26, 23, 45, 13, -34, -41, 1,
+ 42, 33, -15, -45, -21, 28, 44, 8, -38, -38, 7, 44, 29, -20, -45, -16,
+ 32, 42, 2, -40, -35, 12, 45, 24, -25, -45, -11, 36, 40, -3, -43, -31,
+ 17, 45, 19, -30, -43, -6, 39, 37, -9, -44, -27, 22, 45, 14, -34, -41,
+ }, {
+ 32, 16, -34, -40, 4, 44, 27, -24, -44, -8, 39, 36, -13, -45, -19, 31,
+ 42, -1, -43, -30, 21, 45, 11, -37, -38, 10, 45, 22, -29, -43, -2, 41,
+ 32, -18, -45, -14, 35, 39, -7, -44, -25, 26, 44, 6, -40, -34, 15, 45,
+ 17, -33, -41, 3, 43, 28, -23, -45, -9, 38, 36, -12, -45, -20, 30, 42,
+ }, {
+ 32, 14, -36, -37, 13, 45, 15, -36, -38, 12, 45, 16, -35, -38, 11, 45,
+ 17, -34, -39, 10, 45, 18, -34, -39, 9, 45, 19, -33, -40, 8, 45, 20,
+ -32, -40, 7, 45, 21, -31, -41, 6, 44, 22, -30, -41, 4, 44, 23, -30,
+ -42, 3, 44, 24, -29, -42, 2, 44, 25, -28, -43, 1, 43, 26, -27, -43,
+ }, {
+ 32, 12, -39, -33, 21, 44, 2, -43, -25, 30, 41, -8, -45, -16, 36, 36,
+ -17, -45, -7, 41, 29, -26, -43, 3, 44, 20, -34, -38, 13, 45, 11, -39,
+ -32, 22, 44, 1, -43, -24, 30, 40, -9, -45, -15, 37, 35, -18, -45, -6,
+ 42, 28, -27, -42, 4, 45, 19, -34, -38, 14, 45, 10, -40, -31, 23, 44,
+ }, {
+ 32, 10, -41, -28, 29, 40, -11, -45, -9, 41, 27, -30, -40, 12, 45, 8,
+ -42, -26, 30, 39, -13, -45, -7, 42, 25, -31, -39, 14, 45, 6, -43, -24,
+ 32, 38, -15, -45, -4, 43, 23, -33, -38, 16, 45, 3, -43, -22, 34, 37,
+ -17, -45, -2, 44, 21, -34, -36, 18, 44, 1, -44, -20, 35, 36, -19, -44,
+ }, {
+ 32, 8, -43, -22, 35, 34, -23, -42, 9, 45, 7, -43, -21, 36, 34, -24,
+ -42, 10, 45, 6, -43, -20, 36, 33, -25, -41, 11, 45, 4, -44, -19, 37,
+ 32, -26, -41, 12, 45, 3, -44, -18, 38, 31, -27, -40, 13, 45, 2, -44,
+ -17, 38, 30, -28, -40, 14, 45, 1, -44, -16, 39, 30, -29, -39, 15, 45,
+ }, {
+ 32, 6, -44, -16, 40, 26, -34, -34, 25, 40, -15, -44, 4, 45, 7, -44,
+ -17, 39, 27, -33, -35, 24, 41, -14, -44, 3, 45, 8, -43, -18, 39, 28,
+ -32, -36, 23, 41, -13, -45, 2, 45, 9, -43, -19, 38, 29, -31, -36, 22,
+ 42, -12, -45, 1, 45, 10, -43, -20, 38, 30, -30, -37, 21, 42, -11, -45,
+ }, {
+ 32, 3, -45, -10, 43, 16, -41, -22, 38, 28, -34, -33, 29, 37, -23, -40,
+ 17, 43, -11, -45, 4, 45, 2, -45, -9, 44, 15, -41, -21, 38, 27, -34,
+ -32, 30, 36, -24, -40, 18, 43, -12, -44, 6, 45, 1, -45, -8, 44, 14,
+ -42, -20, 39, 26, -35, -31, 30, 36, -25, -39, 19, 42, -13, -44, 7, 45,
+ }, {
+ 32, 1, -45, -3, 45, 6, -45, -8, 44, 10, -44, -12, 43, 14, -43, -16,
+ 42, 18, -41, -20, 40, 22, -39, -24, 38, 26, -36, -28, 35, 30, -34, -31,
+ 32, 33, -30, -34, 29, 36, -27, -37, 25, 38, -23, -39, 21, 40, -19, -41,
+ 17, 42, -15, -43, 13, 44, -11, -44, 9, 45, -7, -45, 4, 45, -2, -45,
+ }, {
+ 32, -1, -45, 3, 45, -6, -45, 8, 44, -10, -44, 12, 43, -14, -43, 16,
+ 42, -18, -41, 20, 40, -22, -39, 24, 38, -26, -36, 28, 35, -30, -34, 31,
+ 32, -33, -30, 34, 29, -36, -27, 37, 25, -38, -23, 39, 21, -40, -19, 41,
+ 17, -42, -15, 43, 13, -44, -11, 44, 9, -45, -7, 45, 4, -45, -2, 45,
+ }, {
+ 32, -3, -45, 10, 43, -16, -41, 22, 38, -28, -34, 33, 29, -37, -23, 40,
+ 17, -43, -11, 45, 4, -45, 2, 45, -9, -44, 15, 41, -21, -38, 27, 34,
+ -32, -30, 36, 24, -40, -18, 43, 12, -44, -6, 45, -1, -45, 8, 44, -14,
+ -42, 20, 39, -26, -35, 31, 30, -36, -25, 39, 19, -42, -13, 44, 7, -45,
+ }, {
+ 32, -6, -44, 16, 40, -26, -34, 34, 25, -40, -15, 44, 4, -45, 7, 44,
+ -17, -39, 27, 33, -35, -24, 41, 14, -44, -3, 45, -8, -43, 18, 39, -28,
+ -32, 36, 23, -41, -13, 45, 2, -45, 9, 43, -19, -38, 29, 31, -36, -22,
+ 42, 12, -45, -1, 45, -10, -43, 20, 38, -30, -30, 37, 21, -42, -11, 45,
+ }, {
+ 32, -8, -43, 22, 35, -34, -23, 42, 9, -45, 7, 43, -21, -36, 34, 24,
+ -42, -10, 45, -6, -43, 20, 36, -33, -25, 41, 11, -45, 4, 44, -19, -37,
+ 32, 26, -41, -12, 45, -3, -44, 18, 38, -31, -27, 40, 13, -45, 2, 44,
+ -17, -38, 30, 28, -40, -14, 45, -1, -44, 16, 39, -30, -29, 39, 15, -45,
+ }, {
+ 32, -10, -41, 28, 29, -40, -11, 45, -9, -41, 27, 30, -40, -12, 45, -8,
+ -42, 26, 30, -39, -13, 45, -7, -42, 25, 31, -39, -14, 45, -6, -43, 24,
+ 32, -38, -15, 45, -4, -43, 23, 33, -38, -16, 45, -3, -43, 22, 34, -37,
+ -17, 45, -2, -44, 21, 34, -36, -18, 44, -1, -44, 20, 35, -36, -19, 44,
+ }, {
+ 32, -12, -39, 33, 21, -44, 2, 43, -25, -30, 41, 8, -45, 16, 36, -36,
+ -17, 45, -7, -41, 29, 26, -43, -3, 44, -20, -34, 38, 13, -45, 11, 39,
+ -32, -22, 44, -1, -43, 24, 30, -40, -9, 45, -15, -37, 35, 18, -45, 6,
+ 42, -28, -27, 42, 4, -45, 19, 34, -38, -14, 45, -10, -40, 31, 23, -44,
+ }, {
+ 32, -14, -36, 37, 13, -45, 15, 36, -38, -12, 45, -16, -35, 38, 11, -45,
+ 17, 34, -39, -10, 45, -18, -34, 39, 9, -45, 19, 33, -40, -8, 45, -20,
+ -32, 40, 7, -45, 21, 31, -41, -6, 44, -22, -30, 41, 4, -44, 23, 30,
+ -42, -3, 44, -24, -29, 42, 2, -44, 25, 28, -43, -1, 43, -26, -27, 43,
+ }, {
+ 32, -16, -34, 40, 4, -44, 27, 24, -44, 8, 39, -36, -13, 45, -19, -31,
+ 42, 1, -43, 30, 21, -45, 11, 37, -38, -10, 45, -22, -29, 43, -2, -41,
+ 32, 18, -45, 14, 35, -39, -7, 44, -25, -26, 44, -6, -40, 34, 15, -45,
+ 17, 33, -41, -3, 43, -28, -23, 45, -9, -38, 36, 12, -45, 20, 30, -42,
+ }, {
+ 32, -18, -30, 43, -4, -39, 36, 10, -44, 26, 23, -45, 13, 34, -41, -1,
+ 42, -33, -15, 45, -21, -28, 44, -8, -38, 38, 7, -44, 29, 20, -45, 16,
+ 32, -42, 2, 40, -35, -12, 45, -24, -25, 45, -11, -36, 40, 3, -43, 31,
+ 17, -45, 19, 30, -43, 6, 39, -37, -9, 44, -27, -22, 45, -14, -34, 41,
+ }, {
+ 32, -20, -27, 45, -13, -33, 43, -6, -38, 39, 2, -41, 35, 10, -44, 30,
+ 17, -45, 23, 24, -45, 16, 30, -44, 9, 36, -41, 1, 40, -37, -7, 43,
+ -32, -14, 45, -26, -21, 45, -19, -28, 44, -12, -34, 42, -4, -38, 39, 3,
+ -42, 34, 11, -44, 29, 18, -45, 22, 25, -45, 15, 31, -43, 8, 36, -40,
+ }, {
+ 32, -22, -23, 45, -21, -24, 45, -20, -25, 45, -19, -26, 45, -18, -27, 45,
+ -17, -28, 45, -16, -29, 45, -15, -30, 44, -14, -30, 44, -13, -31, 44, -12,
+ -32, 44, -11, -33, 43, -10, -34, 43, -9, -34, 43, -8, -35, 42, -7, -36,
+ 42, -6, -36, 41, -4, -37, 41, -3, -38, 40, -2, -38, 40, -1, -39, 39,
+ }, {
+ 32, -24, -19, 45, -29, -14, 44, -33, -9, 42, -36, -3, 40, -39, 2, 37,
+ -42, 8, 34, -44, 13, 30, -45, 18, 25, -45, 23, 20, -45, 28, 15, -44,
+ 32, 10, -43, 36, 4, -40, 39, -1, -38, 41, -7, -34, 43, -12, -30, 45,
+ -17, -26, 45, -22, -21, 45, -27, -16, 44, -31, -11, 43, -35, -6, 41, -38,
+ }, {
+ 32, -26, -15, 44, -35, -3, 39, -41, 9, 31, -45, 20, 21, -45, 30, 10,
+ -42, 38, -2, -36, 43, -14, -27, 45, -25, -16, 44, -34, -4, 39, -41, 8,
+ 32, -45, 19, 22, -45, 30, 11, -42, 38, -1, -36, 43, -13, -28, 45, -24,
+ -17, 44, -34, -6, 40, -40, 7, 33, -44, 18, 23, -45, 29, 12, -43, 37,
+ }, {
+ 32, -28, -11, 41, -40, 8, 30, -45, 25, 14, -43, 38, -4, -33, 45, -22,
+ -17, 44, -36, 1, 35, -44, 19, 20, -44, 34, 2, -37, 43, -16, -23, 45,
+ -32, -6, 39, -42, 13, 26, -45, 30, 9, -40, 41, -10, -29, 45, -27, -12,
+ 42, -39, 7, 31, -45, 24, 15, -43, 38, -3, -34, 45, -21, -18, 44, -36,
+ }, {
+ 32, -30, -7, 38, -43, 18, 19, -44, 38, -6, -30, 45, -29, -8, 39, -43,
+ 17, 20, -44, 37, -4, -31, 45, -28, -9, 39, -43, 16, 21, -44, 36, -3,
+ -32, 45, -27, -10, 40, -42, 15, 22, -44, 36, -2, -33, 45, -26, -11, 40,
+ -42, 14, 23, -45, 35, -1, -34, 45, -25, -12, 41, -41, 13, 24, -45, 34,
+ }, {
+ 32, -31, -2, 34, -45, 28, 7, -37, 44, -24, -11, 39, -43, 20, 15, -41,
+ 42, -16, -19, 43, -40, 12, 23, -44, 38, -8, -27, 45, -35, 3, 30, -45,
+ 32, 1, -34, 45, -29, -6, 36, -45, 25, 10, -39, 44, -21, -14, 41, -42,
+ 17, 18, -43, 40, -13, -22, 44, -38, 9, 26, -45, 36, -4, -30, 45, -33,
+ }, {
+ 32, -33, 2, 30, -45, 36, -7, -26, 44, -38, 11, 22, -43, 40, -15, -18,
+ 42, -42, 19, 14, -40, 44, -23, -10, 38, -45, 27, 6, -35, 45, -30, -1,
+ 32, -45, 34, -3, -29, 45, -36, 8, 25, -44, 39, -12, -21, 43, -41, 16,
+ 17, -41, 43, -20, -13, 39, -44, 24, 9, -37, 45, -28, -4, 34, -45, 31,
+ }, {
+ 32, -34, 7, 24, -43, 41, -19, -12, 38, -45, 30, -1, -29, 45, -39, 14,
+ 17, -40, 44, -26, -4, 33, -45, 36, -9, -22, 43, -42, 21, 10, -36, 45,
+ -32, 3, 27, -44, 40, -16, -15, 39, -44, 28, 2, -31, 45, -37, 11, 20,
+ -42, 43, -23, -8, 35, -45, 34, -6, -25, 44, -41, 18, 13, -38, 45, -30,
+ }, {
+ 32, -36, 11, 18, -40, 45, -30, 3, 25, -43, 43, -24, -4, 31, -45, 39,
+ -17, -12, 36, -45, 35, -10, -19, 40, -44, 30, -2, -26, 43, -42, 23, 6,
+ -32, 45, -39, 16, 13, -37, 45, -34, 9, 20, -41, 44, -29, 1, 27, -44,
+ 42, -22, -7, 33, -45, 38, -15, -14, 38, -45, 34, -8, -21, 41, -44, 28,
+ }, {
+ 32, -37, 15, 12, -35, 45, -39, 18, 9, -33, 45, -40, 21, 6, -30, 44,
+ -42, 24, 2, -28, 43, -43, 27, -1, -25, 42, -44, 30, -4, -22, 41, -45,
+ 32, -8, -19, 39, -45, 34, -11, -16, 38, -45, 36, -14, -13, 36, -45, 38,
+ -17, -10, 34, -45, 40, -20, -7, 31, -44, 41, -23, -3, 29, -44, 43, -26,
+ }, {
+ 32, -38, 19, 6, -29, 43, -44, 31, -9, -16, 36, -45, 40, -22, -2, 26,
+ -42, 45, -34, 12, 13, -34, 45, -41, 25, -1, -23, 40, -45, 36, -15, -10,
+ 32, -44, 43, -28, 4, 20, -39, 45, -38, 18, 7, -30, 43, -44, 30, -8,
+ -17, 37, -45, 39, -21, -3, 27, -42, 44, -33, 11, 14, -35, 45, -41, 24,
+ }, {
+ 32, -39, 23, -1, -21, 38, -45, 40, -25, 3, 19, -37, 45, -41, 27, -6,
+ -17, 36, -45, 42, -29, 8, 15, -34, 44, -43, 30, -10, -13, 33, -44, 44,
+ -32, 12, 11, -31, 43, -44, 34, -14, -9, 30, -43, 45, -35, 16, 7, -28,
+ 42, -45, 36, -18, -4, 26, -41, 45, -38, 20, 2, -24, 40, -45, 39, -22,
+ }, {
+ 32, -40, 27, -8, -13, 31, -43, 45, -38, 22, -2, -18, 35, -44, 44, -34,
+ 17, 3, -23, 38, -45, 42, -30, 12, 9, -28, 41, -45, 40, -26, 7, 14,
+ -32, 43, -45, 37, -21, 1, 19, -36, 44, -44, 34, -16, -4, 24, -39, 45,
+ -42, 30, -11, -10, 29, -41, 45, -39, 25, -6, -15, 33, -43, 45, -36, 20,
+ }, {
+ 32, -41, 30, -14, -4, 22, -36, 44, -44, 37, -23, 6, 13, -30, 41, -45,
+ 42, -31, 15, 3, -21, 36, -44, 45, -38, 24, -7, -12, 29, -40, 45, -42,
+ 32, -16, -2, 20, -35, 44, -45, 38, -25, 8, 11, -28, 40, -45, 43, -33,
+ 17, 1, -19, 34, -43, 45, -39, 26, -9, -10, 27, -39, 45, -43, 34, -18,
+ }, {
+ 32, -42, 34, -20, 4, 12, -27, 38, -44, 45, -39, 28, -13, -3, 19, -33,
+ 42, -45, 43, -34, 21, -6, -11, 26, -38, 44, -45, 39, -29, 14, 2, -18,
+ 32, -41, 45, -43, 35, -22, 7, 10, -25, 37, -44, 45, -40, 30, -15, -1,
+ 17, -31, 41, -45, 43, -36, 23, -8, -9, 24, -36, 44, -45, 40, -30, 16,
+ }, {
+ 32, -43, 36, -26, 13, 1, -15, 28, -38, 44, -45, 42, -35, 24, -11, -3,
+ 17, -30, 39, -44, 45, -41, 34, -22, 9, 6, -19, 31, -40, 45, -45, 40,
+ -32, 20, -7, -8, 21, -33, 41, -45, 44, -39, 30, -18, 4, 10, -23, 34,
+ -42, 45, -44, 38, -29, 16, -2, -12, 25, -36, 43, -45, 43, -37, 27, -14,
+ }, {
+ 32, -44, 39, -31, 21, -10, -2, 14, -25, 34, -41, 45, -45, 42, -36, 28,
+ -17, 6, 7, -18, 29, -37, 43, -45, 44, -40, 34, -24, 13, -1, -11, 22,
+ -32, 39, -44, 45, -43, 38, -30, 20, -9, -3, 15, -26, 35, -41, 45, -45,
+ 42, -36, 27, -16, 4, 8, -19, 30, -38, 43, -45, 44, -40, 33, -23, 12,
+ }, {
+ 32, -44, 41, -36, 29, -20, 11, -1, -9, 18, -27, 34, -40, 44, -45, 45,
+ -42, 37, -30, 22, -13, 3, 7, -16, 25, -33, 39, -43, 45, -45, 43, -38,
+ 32, -24, 15, -6, -4, 14, -23, 31, -38, 42, -45, 45, -43, 39, -34, 26,
+ -17, 8, 2, -12, 21, -30, 36, -41, 44, -45, 44, -40, 35, -28, 19, -10,
+ }, {
+ 32, -45, 43, -39, 35, -30, 23, -16, 9, -1, -7, 14, -21, 28, -34, 38,
+ -42, 44, -45, 45, -43, 40, -36, 31, -25, 18, -11, 3, 4, -12, 19, -26,
+ 32, -37, 41, -44, 45, -45, 44, -41, 38, -33, 27, -20, 13, -6, -2, 10,
+ -17, 24, -30, 36, -40, 43, -45, 45, -44, 42, -39, 34, -29, 22, -15, 8,
+ }, {
+ 32, -45, 44, -42, 40, -37, 34, -30, 25, -20, 15, -10, 4, 1, -7, 12,
+ -17, 22, -27, 31, -35, 38, -41, 43, -44, 45, -45, 45, -43, 41, -39, 36,
+ -32, 28, -23, 18, -13, 8, -2, -3, 9, -14, 19, -24, 29, -33, 36, -39,
+ 42, -44, 45, -45, 45, -44, 43, -40, 38, -34, 30, -26, 21, -16, 11, -6,
+ }, {
+ 32, -45, 45, -44, 43, -42, 41, -39, 38, -36, 34, -31, 29, -26, 23, -20,
+ 17, -14, 11, -8, 4, -1, -2, 6, -9, 12, -15, 18, -21, 24, -27, 30,
+ -32, 34, -36, 38, -40, 41, -43, 44, -44, 45, -45, 45, -45, 45, -44, 43,
+ -42, 40, -39, 37, -35, 33, -30, 28, -25, 22, -19, 16, -13, 10, -7, 3,
+ }, {
+ 32, -45, 45, -45, 45, -45, 45, -45, 44, -44, 44, -44, 43, -43, 43, -42,
+ 42, -41, 41, -40, 40, -39, 39, -38, 38, -37, 36, -36, 35, -34, 34, -33,
+ 32, -31, 30, -30, 29, -28, 27, -26, 25, -24, 23, -22, 21, -20, 19, -18,
+ 17, -16, 15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 4, -3, 2, -1,
+ }
+};
diff --git a/src/shaders/icc.c b/src/shaders/icc.c
new file mode 100644
index 0000000..6a16cfd
--- /dev/null
+++ b/src/shaders/icc.c
@@ -0,0 +1,781 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/tone_mapping.h>
+#include <libplacebo/shaders/icc.h>
+
+const struct pl_icc_params pl_icc_default_params = { PL_ICC_DEFAULTS };
+
+#ifdef PL_HAVE_LCMS
+
+#include <lcms2.h>
+#include <lcms2_plugin.h>
+
+struct icc_priv {
+ pl_log log;
+ pl_cache cache; // for backwards compatibility
+ cmsContext cms;
+ cmsHPROFILE profile;
+ cmsHPROFILE approx; // approximation profile
+ float a, b, scale; // approximation tone curve parameters and scaling
+ cmsCIEXYZ black;
+ float gamma_stddev;
+ uint64_t lut_sig;
+};
+
+static void error_callback(cmsContext cms, cmsUInt32Number code,
+ const char *msg)
+{
+ pl_log log = cmsGetContextUserData(cms);
+ pl_err(log, "lcms2: [%d] %s", (int) code, msg);
+}
+
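+// These adapters wire the deprecated cache_save/cache_load callbacks from
+// pl_icc_params into the newer pl_cache get/set interface; see the
+// backwards compatibility path at the end of icc_init().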
+static void set_callback(void *priv, pl_cache_obj obj)
+{
+ pl_icc_object icc = priv;
+ icc->params.cache_save(icc->params.cache_priv, obj.key, obj.data, obj.size);
+}
+
+static pl_cache_obj get_callback(void *priv, uint64_t key)
+{
+ pl_icc_object icc = priv;
+ int s_r = icc->params.size_r, s_g = icc->params.size_g, s_b = icc->params.size_b;
+ size_t data_size = s_r * s_g * s_b * sizeof(uint16_t[4]);
+ void *data = pl_alloc(NULL, data_size);
+ bool ok = icc->params.cache_load(icc->params.cache_priv, key, data, data_size);
+ if (!ok) {
+ pl_free(data);
+ return (pl_cache_obj) {0};
+ }
+
+ return (pl_cache_obj) {
+ .key = key,
+ .data = data,
+ .size = data_size,
+ .free = pl_free,
+ };
+}
+
+void pl_icc_close(pl_icc_object *picc)
+{
+ pl_icc_object icc = *picc;
+ if (!icc)
+ return;
+
+ struct icc_priv *p = PL_PRIV(icc);
+ cmsCloseProfile(p->approx);
+ cmsCloseProfile(p->profile);
+ cmsDeleteContext(p->cms);
+ pl_cache_destroy(&p->cache);
+ pl_free_ptr((void **) picc);
+}
+
+static bool detect_csp(pl_icc_object icc, struct pl_raw_primaries *prim,
+ float *out_gamma)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ cmsHTRANSFORM tf;
+ cmsHPROFILE xyz = cmsCreateXYZProfileTHR(p->cms);
+ if (!xyz)
+ return false;
+
+ // We need to use an unadapted observer to get the raw values
+ cmsFloat64Number prev_adapt = cmsSetAdaptationStateTHR(p->cms, 0.0);
+ tf = cmsCreateTransformTHR(p->cms, p->profile, TYPE_RGB_8, xyz, TYPE_XYZ_DBL,
+ INTENT_ABSOLUTE_COLORIMETRIC,
+ /* Note: These flags mostly don't do anything
+ * anyway, but specify them regardless */
+ cmsFLAGS_NOCACHE |
+ cmsFLAGS_NOOPTIMIZE);
+ cmsSetAdaptationStateTHR(p->cms, prev_adapt);
+ cmsCloseProfile(xyz);
+ if (!tf)
+ return false;
+
+ enum {
+ RED,
+ GREEN,
+ BLUE,
+ WHITE,
+ BLACK,
+ GRAY,
+ RAMP,
+ };
+
+ static const uint8_t test[][3] = {
+ [RED] = { 0xFF, 0, 0 },
+ [GREEN] = { 0, 0xFF, 0 },
+ [BLUE] = { 0, 0, 0xFF },
+ [WHITE] = { 0xFF, 0xFF, 0xFF },
+ [BLACK] = { 0x00, 0x00, 0x00 },
+ [GRAY] = { 0x80, 0x80, 0x80 },
+
+ // Grayscale ramp (excluding endpoints)
+#define V(d) { d, d, d }
+ V(0x01), V(0x02), V(0x03), V(0x04), V(0x05), V(0x06), V(0x07),
+ V(0x08), V(0x09), V(0x0A), V(0x0B), V(0x0C), V(0x0D), V(0x0E), V(0x0F),
+ V(0x10), V(0x11), V(0x12), V(0x13), V(0x14), V(0x15), V(0x16), V(0x17),
+ V(0x18), V(0x19), V(0x1A), V(0x1B), V(0x1C), V(0x1D), V(0x1E), V(0x1F),
+ V(0x20), V(0x21), V(0x22), V(0x23), V(0x24), V(0x25), V(0x26), V(0x27),
+ V(0x28), V(0x29), V(0x2A), V(0x2B), V(0x2C), V(0x2D), V(0x2E), V(0x2F),
+ V(0x30), V(0x31), V(0x32), V(0x33), V(0x34), V(0x35), V(0x36), V(0x37),
+ V(0x38), V(0x39), V(0x3A), V(0x3B), V(0x3C), V(0x3D), V(0x3E), V(0x3F),
+ V(0x40), V(0x41), V(0x42), V(0x43), V(0x44), V(0x45), V(0x46), V(0x47),
+ V(0x48), V(0x49), V(0x4A), V(0x4B), V(0x4C), V(0x4D), V(0x4E), V(0x4F),
+ V(0x50), V(0x51), V(0x52), V(0x53), V(0x54), V(0x55), V(0x56), V(0x57),
+ V(0x58), V(0x59), V(0x5A), V(0x5B), V(0x5C), V(0x5D), V(0x5E), V(0x5F),
+ V(0x60), V(0x61), V(0x62), V(0x63), V(0x64), V(0x65), V(0x66), V(0x67),
+ V(0x68), V(0x69), V(0x6A), V(0x6B), V(0x6C), V(0x6D), V(0x6E), V(0x6F),
+ V(0x70), V(0x71), V(0x72), V(0x73), V(0x74), V(0x75), V(0x76), V(0x77),
+ V(0x78), V(0x79), V(0x7A), V(0x7B), V(0x7C), V(0x7D), V(0x7E), V(0x7F),
+ V(0x80), V(0x81), V(0x82), V(0x83), V(0x84), V(0x85), V(0x86), V(0x87),
+ V(0x88), V(0x89), V(0x8A), V(0x8B), V(0x8C), V(0x8D), V(0x8E), V(0x8F),
+ V(0x90), V(0x91), V(0x92), V(0x93), V(0x94), V(0x95), V(0x96), V(0x97),
+ V(0x98), V(0x99), V(0x9A), V(0x9B), V(0x9C), V(0x9D), V(0x9E), V(0x9F),
+ V(0xA0), V(0xA1), V(0xA2), V(0xA3), V(0xA4), V(0xA5), V(0xA6), V(0xA7),
+ V(0xA8), V(0xA9), V(0xAA), V(0xAB), V(0xAC), V(0xAD), V(0xAE), V(0xAF),
+ V(0xB0), V(0xB1), V(0xB2), V(0xB3), V(0xB4), V(0xB5), V(0xB6), V(0xB7),
+ V(0xB8), V(0xB9), V(0xBA), V(0xBB), V(0xBC), V(0xBD), V(0xBE), V(0xBF),
+ V(0xC0), V(0xC1), V(0xC2), V(0xC3), V(0xC4), V(0xC5), V(0xC6), V(0xC7),
+ V(0xC8), V(0xC9), V(0xCA), V(0xCB), V(0xCC), V(0xCD), V(0xCE), V(0xCF),
+ V(0xD0), V(0xD1), V(0xD2), V(0xD3), V(0xD4), V(0xD5), V(0xD6), V(0xD7),
+ V(0xD8), V(0xD9), V(0xDA), V(0xDB), V(0xDC), V(0xDD), V(0xDE), V(0xDF),
+ V(0xE0), V(0xE1), V(0xE2), V(0xE3), V(0xE4), V(0xE5), V(0xE6), V(0xE7),
+ V(0xE8), V(0xE9), V(0xEA), V(0xEB), V(0xEC), V(0xED), V(0xEE), V(0xEF),
+ V(0xF0), V(0xF1), V(0xF2), V(0xF3), V(0xF4), V(0xF5), V(0xF6), V(0xF7),
+ V(0xF8), V(0xF9), V(0xFA), V(0xFB), V(0xFC), V(0xFD), V(0xFE),
+#undef V
+ };
+
+ cmsCIEXYZ dst[PL_ARRAY_SIZE(test)] = {0};
+ cmsDoTransform(tf, test, dst, PL_ARRAY_SIZE(dst));
+ cmsDeleteTransform(tf);
+
+ // Read primaries from transformed RGBW values
+ prim->red = pl_cie_from_XYZ(dst[RED].X, dst[RED].Y, dst[RED].Z);
+ prim->green = pl_cie_from_XYZ(dst[GREEN].X, dst[GREEN].Y, dst[GREEN].Z);
+ prim->blue = pl_cie_from_XYZ(dst[BLUE].X, dst[BLUE].Y, dst[BLUE].Z);
+ prim->white = pl_cie_from_XYZ(dst[WHITE].X, dst[WHITE].Y, dst[WHITE].Z);
+
+ // Rough estimate of overall gamma and starting point for curve black point
+ const float y_approx = dst[GRAY].Y ? log(dst[GRAY].Y) / log(0.5) : 1.0f;
+ const float kb = fmaxf(dst[BLACK].Y, 0.0f);
+ float b = powf(kb, 1 / y_approx);
+
+ // Estimate mean and stddev of gamma (Welford's method)
+ float M = 0.0, S = 0.0;
+ int k = 1;
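+ // Welford update: M_k = M_{k-1} + (y_k - M_{k-1}) / k and
+ // S_k = S_{k-1} + (y_k - M_{k-1}) * (y_k - M_k), so that after the loop
+ // the standard deviation estimate is sqrt(S / (k - 1)).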
+ for (int i = RAMP; i < PL_ARRAY_SIZE(dst); i++) { // exclude primaries
+ if (dst[i].Y <= 0 || dst[i].Y >= 1)
+ continue;
+ float src = (1 - b) * (test[i][0] / 255.0) + b;
+ float y = log(dst[i].Y) / log(src);
+ float tmpM = M;
+ M += (y - tmpM) / k;
+ S += (y - tmpM) * (y - M);
+ k++;
+
+ // Update estimate of black point according to current gamma estimate
+ b = powf(kb, 1 / M);
+ }
+ S = sqrt(S / (k - 1));
+
+ PL_INFO(p, "Detected profile approximation gamma %.3f", M);
+ if (S > 0.5) {
+ PL_WARN(p, "Detected profile gamma (%.3f) very far from pure power "
+ "response (stddev=%.1f), suspected unusual or broken profile. "
+ "Using anyway, but results may be poor.", M, S);
+ } else if (!(M > 0)) {
+ PL_ERR(p, "Arithmetic error in ICC profile gamma estimation? "
+ "Please open an issue");
+ return false;
+ }
+
+ *out_gamma = M;
+ p->gamma_stddev = S;
+ return true;
+}
+
+static bool detect_contrast(pl_icc_object icc, struct pl_hdr_metadata *hdr,
+ struct pl_icc_params *params, float max_luma)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ cmsCIEXYZ *white = cmsReadTag(p->profile, cmsSigLuminanceTag);
+ enum pl_rendering_intent intent = params->intent;
+ /* LittleCMS refuses to detect the black point when using the absolute
+ * colorimetric intent, so fall back to relative colorimetric since we
+ * only care about the brightness value here */
+ if (intent == PL_INTENT_ABSOLUTE_COLORIMETRIC)
+ intent = PL_INTENT_RELATIVE_COLORIMETRIC;
+ if (!cmsDetectDestinationBlackPoint(&p->black, p->profile, intent, 0)) {
+ /*
+ * v4 ICC profiles have a black point tag but only for
+ * perceptual/saturation intents. So we change the rendering intent
+ * to perceptual if we are provided a v4 ICC profile.
+ */
+ if (cmsGetEncodedICCversion(p->profile) >= 0x4000000 && intent != PL_INTENT_PERCEPTUAL) {
+ params->intent = PL_INTENT_PERCEPTUAL;
+ return detect_contrast(icc, hdr, params, max_luma);
+ }
+
+ PL_ERR(p, "Failed detecting ICC profile black point!");
+ return false;
+ }
+
+ if (white) {
+ PL_DEBUG(p, "Detected raw white point X=%.2f Y=%.2f Z=%.2f cd/m^2",
+ white->X, white->Y, white->Z);
+ }
+ PL_DEBUG(p, "Detected raw black point X=%.6f%% Y=%.6f%% Z=%.6f%%",
+ p->black.X * 100, p->black.Y * 100, p->black.Z * 100);
+
+ if (max_luma <= 0)
+ max_luma = white ? white->Y : PL_COLOR_SDR_WHITE;
+
+ hdr->max_luma = max_luma;
+ hdr->min_luma = p->black.Y * max_luma;
+ hdr->min_luma = PL_MAX(hdr->min_luma, 1e-6); // prevent true 0
+ PL_INFO(p, "Using ICC contrast %.0f:1", hdr->max_luma / hdr->min_luma);
+ return true;
+}
+
+static void infer_clut_size(struct pl_icc_object_t *icc)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ struct pl_icc_params *params = &icc->params;
+ if (params->size_r && params->size_g && params->size_b) {
+ PL_DEBUG(p, "Using fixed 3DLUT size: %dx%dx%d",
+ (int) params->size_r, (int) params->size_g, (int) params->size_b);
+ return;
+ }
+
+#define REQUIRE_SIZE(N) \
+ params->size_r = PL_MAX(params->size_r, N); \
+ params->size_g = PL_MAX(params->size_g, N); \
+ params->size_b = PL_MAX(params->size_b, N)
+
+ // Default size for sanity
+ REQUIRE_SIZE(9);
+
+ // Ensure enough precision to track the (absolute) black point
+ if (p->black.Y > 1e-4) {
+ float black_rel = powf(p->black.Y, 1.0f / icc->gamma);
+ int min_size = 2 * (int) ceilf(1.0f / black_rel);
+ REQUIRE_SIZE(min_size);
+ }
+
+ // Ensure enough precision to track the gamma curve
+ if (p->gamma_stddev > 1e-2) {
+ REQUIRE_SIZE(65);
+ } else if (p->gamma_stddev > 1e-3) {
+ REQUIRE_SIZE(33);
+ } else if (p->gamma_stddev > 1e-4) {
+ REQUIRE_SIZE(17);
+ }
+
+ // Ensure enough precision to track any internal CLUTs
+ cmsPipeline *pipe = NULL;
+ switch (icc->params.intent) {
+ case PL_INTENT_SATURATION:
+ pipe = cmsReadTag(p->profile, cmsSigBToA2Tag);
+ if (pipe)
+ break;
+ // fall through
+ case PL_INTENT_RELATIVE_COLORIMETRIC:
+ case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+ default:
+ pipe = cmsReadTag(p->profile, cmsSigBToA1Tag);
+ if (pipe)
+ break;
+ // fall through
+ case PL_INTENT_PERCEPTUAL:
+ pipe = cmsReadTag(p->profile, cmsSigBToA0Tag);
+ break;
+ }
+
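+ // No BToA (PCS -> device) pipeline was found for this intent, so fall
+ // back to the corresponding AToB (device -> PCS) pipeline instead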
+ if (!pipe) {
+ switch (icc->params.intent) {
+ case PL_INTENT_SATURATION:
+ pipe = cmsReadTag(p->profile, cmsSigAToB2Tag);
+ if (pipe)
+ break;
+ // fall through
+ case PL_INTENT_RELATIVE_COLORIMETRIC:
+ case PL_INTENT_ABSOLUTE_COLORIMETRIC:
+ default:
+ pipe = cmsReadTag(p->profile, cmsSigAToB1Tag);
+ if (pipe)
+ break;
+ // fall through
+ case PL_INTENT_PERCEPTUAL:
+ pipe = cmsReadTag(p->profile, cmsSigAToB0Tag);
+ break;
+ }
+ }
+
+ if (pipe) {
+ for (cmsStage *stage = cmsPipelineGetPtrToFirstStage(pipe);
+ stage; stage = cmsStageNext(stage))
+ {
+ switch (cmsStageType(stage)) {
+ case cmsSigCLutElemType: ;
+ _cmsStageCLutData *data = cmsStageData(stage);
+ if (data->Params->nInputs != 3)
+ continue;
+ params->size_r = PL_MAX(params->size_r, data->Params->nSamples[0]);
+ params->size_g = PL_MAX(params->size_g, data->Params->nSamples[1]);
+ params->size_b = PL_MAX(params->size_b, data->Params->nSamples[2]);
+ break;
+
+ default:
+ continue;
+ }
+ }
+ }
+
+ // Clamp the output size to make sure the resulting LUT is not too large
+ params->size_r = PL_MIN(params->size_r, 129);
+ params->size_g = PL_MIN(params->size_g, 129);
+ params->size_b = PL_MIN(params->size_b, 129);
+
+ // Constrain the total LUT size to roughly 1M entries
+ const size_t max_size = 1000000;
+ size_t total_size = params->size_r * params->size_g * params->size_b;
+ if (total_size > max_size) {
+ float factor = powf((float) max_size / total_size, 1/3.0f);
+ params->size_r = ceilf(factor * params->size_r);
+ params->size_g = ceilf(factor * params->size_g);
+ params->size_b = ceilf(factor * params->size_b);
+ }
+
+ PL_INFO(p, "Chosen 3DLUT size: %dx%dx%d",
+ (int) params->size_r, (int) params->size_g, (int) params->size_b);
+}
+
+static bool icc_init(struct pl_icc_object_t *icc)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ struct pl_icc_params *params = &icc->params;
+ if (params->intent < 0 || params->intent > PL_INTENT_ABSOLUTE_COLORIMETRIC)
+ params->intent = cmsGetHeaderRenderingIntent(p->profile);
+
+ struct pl_raw_primaries *out_prim = &icc->csp.hdr.prim;
+ if (!detect_csp(icc, out_prim, &icc->gamma))
+ return false;
+ if (!detect_contrast(icc, &icc->csp.hdr, params, params->max_luma))
+ return false;
+ infer_clut_size(icc);
+
+ const struct pl_raw_primaries *best = NULL;
+ for (enum pl_color_primaries prim = 1; prim < PL_COLOR_PRIM_COUNT; prim++) {
+ const struct pl_raw_primaries *raw = pl_raw_primaries_get(prim);
+ if (!icc->csp.primaries && pl_raw_primaries_similar(raw, out_prim)) {
+ icc->containing_primaries = prim;
+ icc->csp.primaries = prim;
+ best = raw;
+ break;
+ }
+
+ if (pl_primaries_superset(raw, out_prim) &&
+ (!best || pl_primaries_superset(best, raw)))
+ {
+ icc->containing_primaries = prim;
+ best = raw;
+ }
+ }
+
+ if (!best) {
+ PL_WARN(p, "ICC profile too wide to handle, colors may be clipped!");
+ icc->containing_primaries = PL_COLOR_PRIM_ACES_AP0;
+ best = pl_raw_primaries_get(icc->containing_primaries);
+ }
+
+ // Create approximation profile. Use a tone-curve based on a BT.1886-style
+ // pure power curve, with an approximation gamma matched to the ICC
+ // profile. We stretch the luminance range *before* the input to the gamma
+ // function, to avoid numerical issues near the black point. (This removes
+ // the need for a separate linear section)
+ //
+ // Y = scale * (aX + b)^y, where Y = PCS luma and X = encoded value ([0-1])
+ p->scale = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_NORM, icc->csp.hdr.max_luma);
+ p->b = powf(icc->csp.hdr.min_luma / icc->csp.hdr.max_luma, 1.0f / icc->gamma);
+ p->a = (1 - p->b);
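+ // (LittleCMS parametric curve type 2 is Y = (aX + b)^gamma, with the
+ // parameters passed in the order { gamma, a, b }; the extra `scale`
+ // factor is applied in the shader instead)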
+ cmsToneCurve *curve = cmsBuildParametricToneCurve(p->cms, 2,
+ (double[3]) { icc->gamma, p->a, p->b });
+ if (!curve)
+ return false;
+
+ cmsCIExyY wp_xyY = { best->white.x, best->white.y, 1.0 };
+ cmsCIExyYTRIPLE prim_xyY = {
+ .Red = { best->red.x, best->red.y, 1.0 },
+ .Green = { best->green.x, best->green.y, 1.0 },
+ .Blue = { best->blue.x, best->blue.y, 1.0 },
+ };
+
+ p->approx = cmsCreateRGBProfileTHR(p->cms, &wp_xyY, &prim_xyY,
+ (cmsToneCurve *[3]){ curve, curve, curve });
+ cmsFreeToneCurve(curve);
+ if (!p->approx)
+ return false;
+
+ // We need to create an ICC V2 profile because ICC V4 perceptual profiles
+ // have normalized semantics, but we want colorimetric mapping with BPC
+ cmsSetHeaderRenderingIntent(p->approx, icc->params.intent);
+ cmsSetProfileVersion(p->approx, 2.2);
+
+ // Hash all parameters affecting the generated 3DLUT
+ p->lut_sig = CACHE_KEY_ICC_3DLUT;
+ pl_hash_merge(&p->lut_sig, icc->signature);
+ pl_hash_merge(&p->lut_sig, params->intent);
+ pl_hash_merge(&p->lut_sig, params->size_r);
+ pl_hash_merge(&p->lut_sig, params->size_g);
+ pl_hash_merge(&p->lut_sig, params->size_b);
+ pl_hash_merge(&p->lut_sig, params->force_bpc);
+ union { double d; uint64_t u; } v = { .d = icc->csp.hdr.max_luma };
+ pl_hash_merge(&p->lut_sig, v.u);
+ // min luma depends only on the max luma and profile
+
+ // Backwards compatibility with old caching API
+ if ((params->cache_save || params->cache_load) && !params->cache) {
+ p->cache = pl_cache_create(pl_cache_params(
+ .log = p->log,
+ .set = params->cache_save ? set_callback : NULL,
+ .get = params->cache_load ? get_callback : NULL,
+ .priv = icc,
+ ));
+ }
+
+ return true;
+}
+
+pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile,
+ const struct pl_icc_params *params)
+{
+ if (!profile->len)
+ return NULL;
+
+ struct pl_icc_object_t *icc = pl_zalloc_obj(NULL, icc, struct icc_priv);
+ struct icc_priv *p = PL_PRIV(icc);
+ icc->params = params ? *params : pl_icc_default_params;
+ icc->signature = profile->signature;
+ p->log = log;
+ p->cms = cmsCreateContext(NULL, (void *) log);
+ if (!p->cms) {
+ PL_ERR(p, "Failed creating LittleCMS context!");
+ goto error;
+ }
+
+ cmsSetLogErrorHandlerTHR(p->cms, error_callback);
+ PL_INFO(p, "Opening ICC profile..");
+ p->profile = cmsOpenProfileFromMemTHR(p->cms, profile->data, profile->len);
+ if (!p->profile) {
+ PL_ERR(p, "Failed opening ICC profile");
+ goto error;
+ }
+
+ if (cmsGetColorSpace(p->profile) != cmsSigRgbData) {
+ PL_ERR(p, "Invalid ICC profile: not RGB");
+ goto error;
+ }
+
+ if (!icc_init(icc))
+ goto error;
+
+ return icc;
+
+error:
+ pl_icc_close((pl_icc_object *) &icc);
+ return NULL;
+}
+
+static bool icc_reopen(pl_icc_object kicc, const struct pl_icc_params *params)
+{
+ struct pl_icc_object_t *icc = (struct pl_icc_object_t *) kicc;
+ struct icc_priv *p = PL_PRIV(icc);
+ cmsCloseProfile(p->approx);
+ pl_cache_destroy(&p->cache);
+
+ *icc = (struct pl_icc_object_t) {
+ .params = *params,
+ .signature = icc->signature,
+ };
+
+ *p = (struct icc_priv) {
+ .log = p->log,
+ .cms = p->cms,
+ .profile = p->profile,
+ };
+
+ PL_DEBUG(p, "Reinitializing ICC profile in-place");
+ return icc_init(icc);
+}
+
+bool pl_icc_update(pl_log log, pl_icc_object *out_icc,
+ const struct pl_icc_profile *profile,
+ const struct pl_icc_params *params)
+{
+ params = PL_DEF(params, &pl_icc_default_params);
+ pl_icc_object icc = *out_icc;
+ if (!icc && !profile)
+ return false; // nothing to update
+
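+ // Three possible cases: (1) a new or changed profile requires closing
+ // and re-opening from scratch, (2) the same profile with compatible
+ // parameters is a no-op, (3) the same profile with changed parameters
+ // is reinitialized in-place further below.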
+ uint64_t sig = profile ? profile->signature : icc->signature;
+ if (!icc || icc->signature != sig) {
+ pl_assert(profile);
+ pl_icc_close(&icc);
+ *out_icc = icc = pl_icc_open(log, profile, params);
+ return icc != NULL;
+ }
+
+ int size_r = PL_DEF(params->size_r, icc->params.size_r);
+ int size_g = PL_DEF(params->size_g, icc->params.size_g);
+ int size_b = PL_DEF(params->size_b, icc->params.size_b);
+ bool compat = params->intent == icc->params.intent &&
+ params->max_luma == icc->params.max_luma &&
+ params->force_bpc == icc->params.force_bpc &&
+ size_r == icc->params.size_r &&
+ size_g == icc->params.size_g &&
+ size_b == icc->params.size_b;
+ if (compat)
+ return true;
+
+ // ICC signature is the same but parameters are different, re-open in-place
+ if (!icc_reopen(icc, params)) {
+ pl_icc_close(&icc);
+ *out_icc = NULL;
+ return false;
+ }
+
+ return true;
+}
+
+static void fill_lut(void *datap, const struct sh_lut_params *params, bool decode)
+{
+ pl_icc_object icc = params->priv;
+ struct icc_priv *p = PL_PRIV(icc);
+ cmsHPROFILE srcp = decode ? p->profile : p->approx;
+ cmsHPROFILE dstp = decode ? p->approx : p->profile;
+ int s_r = params->width, s_g = params->height, s_b = params->depth;
+
+ pl_clock_t start = pl_clock_now();
+ cmsHTRANSFORM tf = cmsCreateTransformTHR(p->cms, srcp, TYPE_RGB_16,
+ dstp, TYPE_RGBA_16,
+ icc->params.intent,
+ cmsFLAGS_BLACKPOINTCOMPENSATION |
+ cmsFLAGS_NOCACHE | cmsFLAGS_NOOPTIMIZE);
+ if (!tf)
+ return;
+
+ pl_clock_t after_transform = pl_clock_now();
+ pl_log_cpu_time(p->log, start, after_transform, "creating ICC transform");
+
+ uint16_t *tmp = pl_alloc(NULL, s_r * 3 * sizeof(tmp[0]));
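+ // Fill the 3DLUT one scanline at a time: R varies fastest, then G, then
+ // B, with four uint16_t components (RGBA) per output entry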
+ for (int b = 0; b < s_b; b++) {
+ for (int g = 0; g < s_g; g++) {
+ // Transform a single line of the output buffer
+ for (int r = 0; r < s_r; r++) {
+ tmp[r * 3 + 0] = r * 65535 / (s_r - 1);
+ tmp[r * 3 + 1] = g * 65535 / (s_g - 1);
+ tmp[r * 3 + 2] = b * 65535 / (s_b - 1);
+ }
+
+ size_t offset = (b * s_g + g) * s_r * 4;
+ uint16_t *data = ((uint16_t *) datap) + offset;
+ cmsDoTransform(tf, tmp, data, s_r);
+
+ if (!icc->params.force_bpc)
+ continue;
+
+ // Fix the black point manually. Work-around for "improper"
+ // profiles, as black point compensation should already have
+ // taken care of this normally.
+ const uint16_t knee = 16u << 8;
+ if (tmp[0] >= knee || tmp[1] >= knee)
+ continue;
+ for (int r = 0; r < s_r; r++) {
+ uint16_t s = (2 * tmp[1] + tmp[2] + tmp[r * 3]) >> 2;
+ if (s >= knee)
+ break;
+ for (int c = 0; c < 3; c++)
+ data[r * 4 + c] = (s * data[r * 4 + c] + (knee - s) * s) >> 12;
+ }
+ }
+ }
+
+ pl_log_cpu_time(p->log, after_transform, pl_clock_now(), "generating ICC 3DLUT");
+ cmsDeleteTransform(tf);
+ pl_free(tmp);
+}
+
+static void fill_decode(void *datap, const struct sh_lut_params *params)
+{
+ fill_lut(datap, params, true);
+}
+
+static void fill_encode(void *datap, const struct sh_lut_params *params)
+{
+ fill_lut(datap, params, false);
+}
+
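+// Cache preference order: an explicitly provided params->cache, then the
+// compatibility cache created from the legacy callbacks, then the cache
+// attached to the shader itself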
+static pl_cache get_cache(pl_icc_object icc, pl_shader sh)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ return PL_DEF(icc->params.cache, PL_DEF(p->cache, SH_CACHE(sh)));
+}
+
+void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj,
+ struct pl_color_space *out_csp)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+ if (!fmt) {
+ SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!");
+ return;
+ }
+
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = lut_obj,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_TETRAHEDRAL,
+ .fmt = fmt,
+ .width = icc->params.size_r,
+ .height = icc->params.size_g,
+ .depth = icc->params.size_b,
+ .comps = 4,
+ .signature = p->lut_sig,
+ .fill = fill_decode,
+ .cache = get_cache(icc, sh),
+ .priv = (void *) icc,
+ ));
+
+ if (!lut) {
+ SH_FAIL(sh, "pl_icc_decode: failed generating LUT object");
+ return;
+ }
+
+ // Y = scale * (aX + b)^y
+ sh_describe(sh, "ICC 3DLUT");
+ GLSL("// pl_icc_decode \n"
+ "{ \n"
+ "color.rgb = "$"(color.rgb).rgb; \n"
+ "color.rgb = "$" * color.rgb + vec3("$"); \n"
+ "color.rgb = pow(color.rgb, vec3("$")); \n"
+ "color.rgb = "$" * color.rgb; \n"
+ "} \n",
+ lut,
+ SH_FLOAT(p->a), SH_FLOAT(p->b),
+ SH_FLOAT(icc->gamma),
+ SH_FLOAT(p->scale));
+
+ if (out_csp) {
+ *out_csp = (struct pl_color_space) {
+ .primaries = icc->containing_primaries,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ .hdr = icc->csp.hdr,
+ };
+ }
+}
+
+void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj)
+{
+ struct icc_priv *p = PL_PRIV(icc);
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR);
+ if (!fmt) {
+ SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!");
+ return;
+ }
+
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = lut_obj,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_TETRAHEDRAL,
+ .fmt = fmt,
+ .width = icc->params.size_r,
+ .height = icc->params.size_g,
+ .depth = icc->params.size_b,
+ .comps = 4,
+ .signature = ~p->lut_sig, // avoid confusion with decoding LUTs
+ .fill = fill_encode,
+ .cache = get_cache(icc, sh),
+ .priv = (void *) icc,
+ ));
+
+ if (!lut) {
+ SH_FAIL(sh, "pl_icc_encode: failed generating LUT object");
+ return;
+ }
+
+ // X = 1/a * (Y/scale)^(1/y) - b/a
+ sh_describe(sh, "ICC 3DLUT");
+ GLSL("// pl_icc_encode \n"
+ "{ \n"
+ "color.rgb = max(color.rgb, 0.0); \n"
+ "color.rgb = 1.0/"$" * color.rgb; \n"
+ "color.rgb = pow(color.rgb, vec3("$")); \n"
+ "color.rgb = 1.0/"$" * color.rgb - "$"; \n"
+ "color.rgb = "$"(color.rgb).rgb; \n"
+ "} \n",
+ SH_FLOAT(p->scale),
+ SH_FLOAT(1.0f / icc->gamma),
+ SH_FLOAT(p->a), SH_FLOAT(p->b / p->a),
+ lut);
+}
+
+#else // !PL_HAVE_LCMS
+
+void pl_icc_close(pl_icc_object *picc) {}
+pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile,
+ const struct pl_icc_params *pparams)
+{
+ pl_err(log, "libplacebo compiled without LittleCMS 2 support!");
+ return NULL;
+}
+
+bool pl_icc_update(pl_log log, pl_icc_object *obj,
+ const struct pl_icc_profile *profile,
+ const struct pl_icc_params *params)
+{
+ static bool warned;
+ if (!warned) {
+ pl_err(log, "libplacebo compiled without LittleCMS 2 support!");
+ warned = true;
+ }
+ *obj = NULL;
+ return false;
+}
+
+void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj,
+ struct pl_color_space *out_csp)
+{
+ pl_unreachable(); // can't get a pl_icc_object
+}
+
+void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj)
+{
+ pl_unreachable();
+}
+
+#endif
diff --git a/src/shaders/lut.c b/src/shaders/lut.c
new file mode 100644
index 0000000..b0124fc
--- /dev/null
+++ b/src/shaders/lut.c
@@ -0,0 +1,820 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <ctype.h>
+
+#include "shaders.h"
+
+#include <libplacebo/shaders/lut.h>
+
+static inline bool isnumeric(char c)
+{
+ return (c >= '0' && c <= '9') || c == '-';
+}
+
+void pl_lut_free(struct pl_custom_lut **lut)
+{
+ pl_free_ptr(lut);
+}
+
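+// Parses a LUT from the .cube text format. A minimal illustrative example of
+// the syntax accepted by this parser (values are arbitrary):
+//
+//   TITLE "example"
+//   LUT_3D_SIZE 2
+//   DOMAIN_MIN 0.0 0.0 0.0
+//   DOMAIN_MAX 1.0 1.0 1.0
+//   0.0 0.0 0.0
+//   1.0 0.0 0.0
+//   ...
+//
+// 1D LUTs use LUT_1D_SIZE instead. Entries are rescaled from the declared
+// domain to the range [0,1].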
+struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *cstr, size_t cstr_len)
+{
+ struct pl_custom_lut *lut = pl_zalloc_ptr(NULL, lut);
+ pl_str str = (pl_str) { (uint8_t *) cstr, cstr_len };
+ lut->signature = pl_str_hash(str);
+ int entries = 0;
+
+ float min[3] = { 0.0, 0.0, 0.0 };
+ float max[3] = { 1.0, 1.0, 1.0 };
+
+ // Parse header
+ while (str.len && !isnumeric(str.buf[0])) {
+ pl_str line = pl_str_strip(pl_str_getline(str, &str));
+ if (!line.len)
+ continue; // skip empty line
+
+ if (pl_str_eatstart0(&line, "TITLE")) {
+ pl_info(log, "Loading LUT: %.*s", PL_STR_FMT(pl_str_strip(line)));
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "LUT_3D_SIZE")) {
+ line = pl_str_strip(line);
+ int size;
+ if (!pl_str_parse_int(line, &size)) {
+ pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line));
+ goto error;
+ }
+ if (size <= 0 || size > 1024) {
+ pl_err(log, "Invalid 3DLUT size: %dx%d%x", size, size, size);
+ goto error;
+ }
+
+ lut->size[0] = lut->size[1] = lut->size[2] = size;
+ entries = size * size * size;
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "LUT_1D_SIZE")) {
+ line = pl_str_strip(line);
+ int size;
+ if (!pl_str_parse_int(line, &size)) {
+ pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line));
+ goto error;
+ }
+ if (size <= 0 || size > 65536) {
+ pl_err(log, "Invalid 1DLUT size: %d", size);
+ goto error;
+ }
+
+ lut->size[0] = size;
+ lut->size[1] = lut->size[2] = 0;
+ entries = size;
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "DOMAIN_MIN")) {
+ line = pl_str_strip(line);
+ if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[0]) ||
+ !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[1]) ||
+ !pl_str_parse_float(line, &min[2]))
+ {
+ pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line));
+ goto error;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "DOMAIN_MAX")) {
+ line = pl_str_strip(line);
+ if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[0]) ||
+ !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[1]) ||
+ !pl_str_parse_float(line, &max[2]))
+ {
+ pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line));
+ goto error;
+ }
+ continue;
+ }
+
+ if (pl_str_eatstart0(&line, "#")) {
+ pl_debug(log, "Unhandled .cube comment: %.*s",
+ PL_STR_FMT(pl_str_strip(line)));
+ continue;
+ }
+
+ pl_warn(log, "Unhandled .cube line: %.*s", PL_STR_FMT(pl_str_strip(line)));
+ }
+
+ if (!entries) {
+ pl_err(log, "Missing LUT size specification?");
+ goto error;
+ }
+
+ for (int i = 0; i < 3; i++) {
+ if (max[i] - min[i] < 1e-6) {
+ pl_err(log, "Invalid domain range: [%f, %f]", min[i], max[i]);
+ goto error;
+ }
+ }
+
+ float *data = pl_alloc(lut, sizeof(float[3]) * entries);
+ lut->data = data;
+
+ // Parse LUT body
+ pl_clock_t start = pl_clock_now();
+ for (int n = 0; n < entries; n++) {
+ for (int c = 0; c < 3; c++) {
+ static const char * const digits = "0123456789.-+e";
+
+ // Extract valid digit sequence
+ size_t len = pl_strspn(str, digits);
+ pl_str entry = (pl_str) { str.buf, len };
+ str.buf += len;
+ str.len -= len;
+
+ if (!entry.len) {
+ if (!str.len) {
+ pl_err(log, "Failed parsing LUT: Unexpected EOF, expected "
+ "%d entries, got %d", entries * 3, n * 3 + c + 1);
+ } else {
+ pl_err(log, "Failed parsing LUT: Unexpected '%c', expected "
+ "digit", str.buf[0]);
+ }
+ goto error;
+ }
+
+ float num;
+ if (!pl_str_parse_float(entry, &num)) {
+ pl_err(log, "Failed parsing float value '%.*s'", PL_STR_FMT(entry));
+ goto error;
+ }
+
+ // Rescale to range 0.0 - 1.0
+ *data++ = (num - min[c]) / (max[c] - min[c]);
+
+ // Skip whitespace between digits
+ str = pl_str_strip(str);
+ }
+ }
+
+ str = pl_str_strip(str);
+ if (str.len)
+ pl_warn(log, "Extra data after LUT?... ignoring '%c'", str.buf[0]);
+
+ pl_log_cpu_time(log, start, pl_clock_now(), "parsing .cube LUT");
+ return lut;
+
+error:
+ pl_free(lut);
+ return NULL;
+}
+
+static void fill_lut(void *datap, const struct sh_lut_params *params)
+{
+ const struct pl_custom_lut *lut = params->priv;
+
+ int dim_r = params->width;
+ int dim_g = PL_DEF(params->height, 1);
+ int dim_b = PL_DEF(params->depth, 1);
+
+ float *data = datap;
+ for (int b = 0; b < dim_b; b++) {
+ for (int g = 0; g < dim_g; g++) {
+ for (int r = 0; r < dim_r; r++) {
+ size_t offset = (b * dim_g + g) * dim_r + r;
+ const float *src = &lut->data[offset * 3];
+ float *dst = &data[offset * 4];
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = 0.0f;
+ }
+ }
+ }
+}
+
+void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut,
+ pl_shader_obj *lut_state)
+{
+ if (!lut)
+ return;
+
+ int dims;
+ if (lut->size[0] > 0 && lut->size[1] > 0 && lut->size[2] > 0) {
+ dims = 3;
+ } else if (lut->size[0] > 0 && !lut->size[1] && !lut->size[2]) {
+ dims = 1;
+ } else {
+ SH_FAIL(sh, "Invalid dimensions %dx%dx%d for pl_custom_lut, must be 1D "
+ "or 3D!", lut->size[0], lut->size[1], lut->size[2]);
+ return;
+ }
+
+ if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
+ return;
+
+ ident_t fun = sh_lut(sh, sh_lut_params(
+ .object = lut_state,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_TETRAHEDRAL,
+ .width = lut->size[0],
+ .height = lut->size[1],
+ .depth = lut->size[2],
+ .comps = 4, // for better texel alignment
+ .signature = lut->signature,
+ .fill = fill_lut,
+ .priv = (void *) lut,
+ ));
+
+ if (!fun) {
+ SH_FAIL(sh, "pl_shader_custom_lut: failed generating LUT object");
+ return;
+ }
+
+ GLSL("// pl_shader_custom_lut \n");
+
+ static const pl_matrix3x3 zero = {0};
+ if (memcmp(&lut->shaper_in, &zero, sizeof(zero)) != 0) {
+ GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("shaper_in"),
+ .data = PL_TRANSPOSE_3X3(lut->shaper_in.m),
+ }));
+ }
+
+ switch (dims) {
+ case 1:
+ sh_describe(sh, "custom 1DLUT");
+ GLSL("color.rgb = vec3("$"(color.r).r, \n"
+ " "$"(color.g).g, \n"
+ " "$"(color.b).b); \n",
+ fun, fun, fun);
+ break;
+ case 3:
+ sh_describe(sh, "custom 3DLUT");
+ GLSL("color.rgb = "$"(color.rgb).rgb; \n", fun);
+ break;
+ }
+
+ if (memcmp(&lut->shaper_out, &zero, sizeof(zero)) != 0) {
+ GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat3("shaper_out"),
+ .data = PL_TRANSPOSE_3X3(lut->shaper_out.m),
+ }));
+ }
+}
+
+// Defines a LUT position helper macro. This translates a position, given
+// either in absolute texels or normalized to [0,1], into the texture
+// coordinate of the corresponding sample in a texture of dimension `lut_size`.
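+//
+// For example, with lut_size = 64 and normalized = true this expands to
+// 0.984375 * (x) + 0.0078125, mapping x = 0.0 to the center of the first
+// texel and x = 1.0 to the center of the last one.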
+static ident_t texel_scale(pl_shader sh, int lut_size, bool normalized)
+{
+ const float base = 0.5f / lut_size;
+ const float end = 1.0f - 0.5f / lut_size;
+ const float scale = (end - base) / (normalized ? 1.0f : (lut_size - 1));
+
+ ident_t name = sh_fresh(sh, "LUT_SCALE");
+ GLSLH("#define "$"(x) ("$" * (x) + "$") \n",
+ name, SH_FLOAT(scale), SH_FLOAT(base));
+ return name;
+}
+
+struct sh_lut_obj {
+ enum sh_lut_type type;
+ enum sh_lut_method method;
+ enum pl_var_type vartype;
+ pl_fmt fmt;
+ int width, height, depth, comps;
+ uint64_t signature;
+ bool error; // reset if params change
+
+ // weights, depending on the lut type
+ pl_tex tex;
+ pl_str str;
+ void *data;
+};
+
+static void sh_lut_uninit(pl_gpu gpu, void *ptr)
+{
+ struct sh_lut_obj *lut = ptr;
+ pl_tex_destroy(gpu, &lut->tex);
+ pl_free(lut->str.buf);
+ pl_free(lut->data);
+
+ *lut = (struct sh_lut_obj) {0};
+}
+
+// Maximum number of floats to embed as a literal array (when using SH_LUT_AUTO)
+#define SH_LUT_MAX_LITERAL_SOFT 64
+#define SH_LUT_MAX_LITERAL_HARD 256
+
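+// Lazily creates (or reuses) a LUT shader object with the given parameters,
+// picking the most suitable representation out of a sampleable texture, a
+// uniform array or a literal array constant, and returns an identifier that
+// can be applied like a function to look up values, optionally with linear,
+// cubic or tetrahedral interpolation.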
+ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params)
+{
+ pl_gpu gpu = SH_GPU(sh);
+ pl_cache_obj obj = { .key = CACHE_KEY_SH_LUT ^ params->signature };
+
+ const enum pl_var_type vartype = params->var_type;
+ pl_assert(vartype != PL_VAR_INVALID);
+ pl_assert(params->method == SH_LUT_NONE || vartype == PL_VAR_FLOAT);
+ pl_assert(params->width > 0 && params->height >= 0 && params->depth >= 0);
+ pl_assert(params->comps > 0);
+ pl_assert(!params->cache || params->signature);
+
+ int sizes[] = { params->width, params->height, params->depth };
+ int size = params->width * PL_DEF(params->height, 1) * PL_DEF(params->depth, 1);
+ int dims = params->depth ? 3 : params->height ? 2 : 1;
+ enum sh_lut_method method = params->method;
+ if (method == SH_LUT_TETRAHEDRAL && dims != 3)
+ method = SH_LUT_LINEAR;
+ if (method == SH_LUT_CUBIC && dims != 3)
+ method = SH_LUT_LINEAR;
+
+ int texdim = 0;
+ uint32_t max_tex_dim[] = {
+ gpu ? gpu->limits.max_tex_1d_dim : 0,
+ gpu ? gpu->limits.max_tex_2d_dim : 0,
+ (gpu && gpu->glsl.version > 100) ? gpu->limits.max_tex_3d_dim : 0,
+ };
+
+ struct sh_lut_obj *lut = SH_OBJ(sh, params->object, PL_SHADER_OBJ_LUT,
+ struct sh_lut_obj, sh_lut_uninit);
+
+ if (!lut)
+ return NULL_IDENT;
+
+ bool update = params->update || lut->signature != params->signature ||
+ vartype != lut->vartype || params->fmt != lut->fmt ||
+ params->width != lut->width || params->height != lut->height ||
+ params->depth != lut->depth || params->comps != lut->comps;
+
+ if (lut->error && !update)
+ return NULL_IDENT; // suppress error spam until something changes
+
+ // Try picking the right number of dimensions for the texture LUT. This
+ // allows e.g. falling back to 2D textures if 1D textures are unsupported.
+ for (int d = dims; d <= PL_ARRAY_SIZE(max_tex_dim); d++) {
+ // For a given dimension to be compatible, all coordinates need to be
+ // within the maximum texture size for that dimension
+ for (int i = 0; i < d; i++) {
+ if (sizes[i] > max_tex_dim[d - 1])
+ goto next_dim;
+ }
+
+ // All dimensions are compatible, so pick this texture dimension
+ texdim = d;
+ break;
+
+next_dim: ; // `continue` out of the inner loop
+ }
+
+ static const enum pl_fmt_type fmt_type[PL_VAR_TYPE_COUNT] = {
+ [PL_VAR_SINT] = PL_FMT_SINT,
+ [PL_VAR_UINT] = PL_FMT_UINT,
+ [PL_VAR_FLOAT] = PL_FMT_FLOAT,
+ };
+
+ enum pl_fmt_caps texcaps = PL_FMT_CAP_SAMPLEABLE;
+ bool is_linear = method == SH_LUT_LINEAR || method == SH_LUT_CUBIC;
+ if (is_linear)
+ texcaps |= PL_FMT_CAP_LINEAR;
+
+ pl_fmt texfmt = params->fmt;
+ if (texfmt) {
+ bool ok;
+ switch (texfmt->type) {
+ case PL_FMT_SINT: ok = vartype == PL_VAR_SINT; break;
+ case PL_FMT_UINT: ok = vartype == PL_VAR_UINT; break;
+ default: ok = vartype == PL_VAR_FLOAT; break;
+ }
+
+ if (!ok) {
+ PL_ERR(sh, "Specified texture format '%s' does not match LUT "
+ "data type!", texfmt->name);
+ goto error;
+ }
+
+ if (~texfmt->caps & texcaps) {
+ PL_ERR(sh, "Specified texture format '%s' does not match "
+ "required capabilities 0x%x!\n", texfmt->name, texcaps);
+ goto error;
+ }
+ }
+
+ if (texdim && !texfmt) {
+ texfmt = pl_find_fmt(gpu, fmt_type[vartype], params->comps,
+ vartype == PL_VAR_FLOAT ? 16 : 32,
+ pl_var_type_size(vartype) * 8,
+ texcaps);
+ }
+
+ enum sh_lut_type type = params->lut_type;
+
+ // The linear sampling code currently only supports 1D linear interpolation
+ if (is_linear && dims > 1) {
+ if (texfmt) {
+ type = SH_LUT_TEXTURE;
+ } else {
+ PL_ERR(sh, "Can't emulate linear LUTs for 2D/3D LUTs and no "
+ "texture support available!");
+ goto error;
+ }
+ }
+
+ bool can_uniform = gpu && gpu->limits.max_variable_comps >= size * params->comps;
+ bool can_literal = sh_glsl(sh).version > 110; // needed for literal arrays
+ can_literal &= size <= SH_LUT_MAX_LITERAL_HARD && !params->dynamic;
+
+ // Deselect unsupported methods
+ if (type == SH_LUT_UNIFORM && !can_uniform)
+ type = SH_LUT_AUTO;
+ if (type == SH_LUT_LITERAL && !can_literal)
+ type = SH_LUT_AUTO;
+ if (type == SH_LUT_TEXTURE && !texfmt)
+ type = SH_LUT_AUTO;
+
+ // Sorted by priority
+ if (!type && can_literal && !method && size <= SH_LUT_MAX_LITERAL_SOFT)
+ type = SH_LUT_LITERAL;
+ if (!type && texfmt)
+ type = SH_LUT_TEXTURE;
+ if (!type && can_uniform)
+ type = SH_LUT_UNIFORM;
+ if (!type && can_literal)
+ type = SH_LUT_LITERAL;
+
+ if (!type) {
+ PL_ERR(sh, "Can't generate LUT: no compatible methods!");
+ goto error;
+ }
+
+ // Reinitialize the existing LUT if needed
+ update |= type != lut->type;
+ update |= method != lut->method;
+
+ if (update) {
+ if (params->dynamic)
+ pl_log_level_cap(sh->log, PL_LOG_TRACE);
+
+ size_t el_size = params->comps * pl_var_type_size(vartype);
+ if (type == SH_LUT_TEXTURE)
+ el_size = texfmt->texel_size;
+
+ size_t buf_size = size * el_size;
+ if (pl_cache_get(params->cache, &obj) && obj.size == buf_size) {
+ PL_DEBUG(sh, "Re-using cached LUT (0x%"PRIx64") with size %zu",
+ obj.key, obj.size);
+ } else {
+ PL_DEBUG(sh, "LUT invalidated, regenerating..");
+ pl_cache_obj_resize(NULL, &obj, buf_size);
+ pl_clock_t start = pl_clock_now();
+ params->fill(obj.data, params);
+ pl_log_cpu_time(sh->log, start, pl_clock_now(), "generating shader LUT");
+ }
+
+ pl_assert(obj.data && obj.size);
+ if (params->dynamic)
+ pl_log_level_cap(sh->log, PL_LOG_NONE);
+
+ switch (type) {
+ case SH_LUT_TEXTURE: {
+ if (!texdim) {
+ PL_ERR(sh, "Texture LUT exceeds texture dimensions!");
+ goto error;
+ }
+
+ if (!texfmt) {
+ PL_ERR(sh, "Found no compatible texture format for LUT!");
+ goto error;
+ }
+
+ struct pl_tex_params tex_params = {
+ .w = params->width,
+ .h = PL_DEF(params->height, texdim >= 2 ? 1 : 0),
+ .d = PL_DEF(params->depth, texdim >= 3 ? 1 : 0),
+ .format = texfmt,
+ .sampleable = true,
+ .host_writable = params->dynamic,
+ .initial_data = params->dynamic ? NULL : obj.data,
+ .debug_tag = params->debug_tag,
+ };
+
+ bool ok;
+ if (params->dynamic) {
+ ok = pl_tex_recreate(gpu, &lut->tex, &tex_params);
+ if (ok) {
+ ok = pl_tex_upload(gpu, pl_tex_transfer_params(
+ .tex = lut->tex,
+ .ptr = obj.data,
+ ));
+ }
+ } else {
+ // Can't use pl_tex_recreate because of `initial_data`
+ pl_tex_destroy(gpu, &lut->tex);
+ lut->tex = pl_tex_create(gpu, &tex_params);
+ ok = lut->tex;
+ }
+
+ if (!ok) {
+ PL_ERR(sh, "Failed creating LUT texture!");
+ goto error;
+ }
+ break;
+ }
+
+ case SH_LUT_UNIFORM:
+ pl_free(lut->data);
+ lut->data = pl_memdup(NULL, obj.data, obj.size);
+ break;
+
+ case SH_LUT_LITERAL: {
+ lut->str.len = 0;
+ static const char prefix[PL_VAR_TYPE_COUNT] = {
+ [PL_VAR_SINT] = 'i',
+ [PL_VAR_UINT] = 'u',
+ [PL_VAR_FLOAT] = ' ',
+ };
+
+ for (int i = 0; i < size * params->comps; i += params->comps) {
+ if (i > 0)
+ pl_str_append_asprintf_c(lut, &lut->str, ",");
+ if (params->comps > 1) {
+ pl_str_append_asprintf_c(lut, &lut->str, "%cvec%d(",
+ prefix[vartype], params->comps);
+ }
+ for (int c = 0; c < params->comps; c++) {
+ switch (vartype) {
+ case PL_VAR_FLOAT:
+ pl_str_append_asprintf_c(lut, &lut->str, "%s%f",
+ c > 0 ? "," : "",
+ ((float *) obj.data)[i+c]);
+ break;
+ case PL_VAR_UINT:
+ pl_str_append_asprintf_c(lut, &lut->str, "%s%u",
+ c > 0 ? "," : "",
+ ((unsigned int *) obj.data)[i+c]);
+ break;
+ case PL_VAR_SINT:
+ pl_str_append_asprintf_c(lut, &lut->str, "%s%d",
+ c > 0 ? "," : "",
+ ((int *) obj.data)[i+c]);
+ break;
+ case PL_VAR_INVALID:
+ case PL_VAR_TYPE_COUNT:
+ pl_unreachable();
+ }
+ }
+ if (params->comps > 1)
+ pl_str_append_asprintf_c(lut, &lut->str, ")");
+ }
+ break;
+ }
+
+ case SH_LUT_AUTO:
+ pl_unreachable();
+ }
+
+ lut->type = type;
+ lut->method = method;
+ lut->vartype = vartype;
+ lut->fmt = params->fmt;
+ lut->width = params->width;
+ lut->height = params->height;
+ lut->depth = params->depth;
+ lut->comps = params->comps;
+ lut->signature = params->signature;
+ pl_cache_set(params->cache, &obj);
+ }
+
+ // Done updating, generate the GLSL
+ ident_t name = sh_fresh(sh, "lut");
+ ident_t arr_name = NULL_IDENT;
+
+ static const char * const swizzles[] = {"x", "xy", "xyz", "xyzw"};
+ static const char * const vartypes[PL_VAR_TYPE_COUNT][4] = {
+ [PL_VAR_SINT] = { "int", "ivec2", "ivec3", "ivec4" },
+ [PL_VAR_UINT] = { "uint", "uvec2", "uvec3", "uvec4" },
+ [PL_VAR_FLOAT] = { "float", "vec2", "vec3", "vec4" },
+ };
+
+ switch (type) {
+ case SH_LUT_TEXTURE: {
+ assert(texdim);
+ ident_t tex = sh_desc(sh, (struct pl_shader_desc) {
+ .desc = {
+ .name = "weights",
+ .type = PL_DESC_SAMPLED_TEX,
+ },
+ .binding = {
+ .object = lut->tex,
+ .sample_mode = is_linear ? PL_TEX_SAMPLE_LINEAR
+ : PL_TEX_SAMPLE_NEAREST,
+ }
+ });
+
+ if (is_linear) {
+ ident_t pos_macros[PL_ARRAY_SIZE(sizes)] = {0};
+ for (int i = 0; i < dims; i++)
+ pos_macros[i] = texel_scale(sh, sizes[i], true);
+
+ GLSLH("#define "$"(pos) (textureLod("$", %s(\\\n",
+ name, tex, vartypes[PL_VAR_FLOAT][texdim - 1]);
+
+ for (int i = 0; i < texdim; i++) {
+ char sep = i == 0 ? ' ' : ',';
+ if (pos_macros[i]) {
+ if (dims > 1) {
+ GLSLH(" %c"$"(%s(pos).%c)\\\n", sep, pos_macros[i],
+ vartypes[PL_VAR_FLOAT][dims - 1], "xyzw"[i]);
+ } else {
+ GLSLH(" %c"$"(float(pos))\\\n", sep, pos_macros[i]);
+ }
+ } else {
+ GLSLH(" %c%f\\\n", sep, 0.5);
+ }
+ }
+ GLSLH(" ), 0.0).%s)\n", swizzles[params->comps - 1]);
+ } else {
+ GLSLH("#define "$"(pos) (texelFetch("$", %s(pos",
+ name, tex, vartypes[PL_VAR_SINT][texdim - 1]);
+
+ // Fill up extra components of the index
+ for (int i = dims; i < texdim; i++)
+ GLSLH(", 0");
+
+ GLSLH("), 0).%s)\n", swizzles[params->comps - 1]);
+ }
+ break;
+ }
+
+ case SH_LUT_UNIFORM:
+ arr_name = sh_var(sh, (struct pl_shader_var) {
+ .var = {
+ .name = "weights",
+ .type = vartype,
+ .dim_v = params->comps,
+ .dim_m = 1,
+ .dim_a = size,
+ },
+ .data = lut->data,
+ });
+ break;
+
+ case SH_LUT_LITERAL:
+ arr_name = sh_fresh(sh, "weights");
+ GLSLH("const %s "$"[%d] = %s[](\n ",
+ vartypes[vartype][params->comps - 1], arr_name, size,
+ vartypes[vartype][params->comps - 1]);
+ sh_append_str(sh, SH_BUF_HEADER, lut->str);
+ GLSLH(");\n");
+ break;
+
+ case SH_LUT_AUTO:
+ pl_unreachable();
+ }
+
+ if (arr_name) {
+ GLSLH("#define "$"(pos) ("$"[int((pos)%s)\\\n",
+ name, arr_name, dims > 1 ? "[0]" : "");
+ int shift = params->width;
+ for (int i = 1; i < dims; i++) {
+ GLSLH(" + %d * int((pos)[%d])\\\n", shift, i);
+ shift *= sizes[i];
+ }
+ GLSLH(" ])\n");
+
+ if (is_linear) {
+ pl_assert(dims == 1);
+ pl_assert(vartype == PL_VAR_FLOAT);
+ ident_t arr_lut = name;
+ name = sh_fresh(sh, "lut_lin");
+ GLSLH("%s "$"(float fpos) { \n"
+ " fpos = clamp(fpos, 0.0, 1.0) * %d.0; \n"
+ " float fbase = floor(fpos); \n"
+ " float fceil = ceil(fpos); \n"
+ " float fcoord = fpos - fbase; \n"
+ " return mix("$"(fbase), "$"(fceil), fcoord); \n"
+ "} \n",
+ vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+ size - 1,
+ arr_lut, arr_lut);
+ }
+ }
+
+ if (method == SH_LUT_CUBIC && dims == 3) {
+ ident_t lin_lut = name;
+ name = sh_fresh(sh, "lut_tricubic");
+ GLSLH("%s "$"(vec3 pos) { \n"
+ " vec3 scale = vec3(%d.0, %d.0, %d.0); \n"
+ " vec3 scale_inv = 1.0 / scale; \n"
+ " pos *= scale; \n"
+ " vec3 fpos = fract(pos); \n"
+ " vec3 base = pos - fpos; \n"
+ " vec3 fpos2 = fpos * fpos; \n"
+ " vec3 inv = 1.0 - fpos; \n"
+ " vec3 inv2 = inv * inv; \n"
+ " vec3 w0 = 1.0/6.0 * inv2 * inv; \n"
+ " vec3 w1 = 2.0/3.0 - 0.5 * fpos2 * (2.0 - fpos); \n"
+ " vec3 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n"
+ " vec3 w3 = 1.0/6.0 * fpos2 * fpos; \n"
+ " vec3 g0 = w0 + w1; \n"
+ " vec3 g1 = w2 + w3; \n"
+ " vec3 h0 = scale_inv * ((w1 / g0) - 1.0 + base); \n"
+ " vec3 h1 = scale_inv * ((w3 / g1) + 1.0 + base); \n"
+ " %s c000, c001, c010, c011, c100, c101, c110, c111; \n"
+ " c000 = "$"(h0); \n"
+ " c100 = "$"(vec3(h1.x, h0.y, h0.z)); \n"
+ " c000 = mix(c100, c000, g0.x); \n"
+ " c010 = "$"(vec3(h0.x, h1.y, h0.z)); \n"
+ " c110 = "$"(vec3(h1.x, h1.y, h0.z)); \n"
+ " c010 = mix(c110, c010, g0.x); \n"
+ " c000 = mix(c010, c000, g0.y); \n"
+ " c001 = "$"(vec3(h0.x, h0.y, h1.z)); \n"
+ " c101 = "$"(vec3(h1.x, h0.y, h1.z)); \n"
+ " c001 = mix(c101, c001, g0.x); \n"
+ " c011 = "$"(vec3(h0.x, h1.y, h1.z)); \n"
+ " c111 = "$"(h1); \n"
+ " c011 = mix(c111, c011, g0.x); \n"
+ " c001 = mix(c011, c001, g0.y); \n"
+ " return mix(c001, c000, g0.z); \n"
+ "} \n",
+ vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+ sizes[0] - 1, sizes[1] - 1, sizes[2] - 1,
+ vartypes[PL_VAR_FLOAT][params->comps - 1],
+ lin_lut, lin_lut, lin_lut, lin_lut,
+ lin_lut, lin_lut, lin_lut, lin_lut);
+ }
+
+ if (method == SH_LUT_TETRAHEDRAL) {
+ ident_t int_lut = name;
+ name = sh_fresh(sh, "lut_barycentric");
+ GLSLH("%s "$"(vec3 pos) { \n"
+ // Compute bounding vertices and fractional part
+ " pos = clamp(pos, 0.0, 1.0) * vec3(%d.0, %d.0, %d.0); \n"
+ " vec3 base = floor(pos); \n"
+ " vec3 fpart = pos - base; \n"
+ // v0 and v3 are always 'black' and 'white', respectively
+ // v1 and v2 are the closest RGB and CMY vertices, respectively
+ " ivec3 v0 = ivec3(base), v3 = ivec3(ceil(pos)); \n"
+ " ivec3 v1 = v0, v2 = v3; \n"
+ // Table of boolean checks to simplify following math
+ " bvec3 c = greaterThanEqual(fpart.xyz, fpart.yzx); \n"
+ " bool c_xy = c.x, c_yx = !c.x, \n"
+ " c_yz = c.y, c_zy = !c.y, \n"
+ " c_zx = c.z, c_xz = !c.z; \n"
+ " vec3 s = fpart.xyz; \n"
+ " bool cond; \n",
+ vartypes[PL_VAR_FLOAT][params->comps - 1], name,
+ sizes[0] - 1, sizes[1] - 1, sizes[2] - 1);
+
+    // Subdivision of the cube into six congruent tetrahedra
+ //
+ // For each tetrahedron, test if the point is inside, and if so, update
+ // the edge vertices. We test all six, even though only one case will
+ // ever be true, because this avoids branches.
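+    //
+    // For example, fpart = (0.7, 0.5, 0.2) satisfies R >= G >= B, so
+    // v1 = v0 + (1,0,0), v2 = v0 + (1,1,0), and the barycentric weights
+    // evaluate to (1-0.7, 0.7-0.5, 0.5-0.2, 0.2) = (0.3, 0.2, 0.3, 0.2).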
+ static const char *indices[] = { "xyz", "xzy", "zxy", "zyx", "yzx", "yxz"};
+ for (int i = 0; i < PL_ARRAY_SIZE(indices); i++) {
+ const char x = indices[i][0], y = indices[i][1], z = indices[i][2];
+ GLSLH("cond = c_%c%c && c_%c%c; \n"
+ "s = cond ? fpart.%c%c%c : s; \n"
+ "v1.%c = cond ? v3.%c : v1.%c; \n"
+ "v2.%c = cond ? v0.%c : v2.%c; \n",
+ x, y, y, z,
+ x, y, z,
+ x, x, x,
+ z, z, z);
+ }
+
+ // Interpolate in barycentric coordinates, with four texel fetches
+ GLSLH(" return (1.0 - s.x) * "$"(v0) + \n"
+ " (s.x - s.y) * "$"(v1) + \n"
+ " (s.y - s.z) * "$"(v2) + \n"
+ " (s.z) * "$"(v3); \n"
+ "} \n",
+ int_lut, int_lut, int_lut, int_lut);
+ }
+
+ lut->error = false;
+ pl_cache_obj_free(&obj);
+ pl_assert(name);
+ return name;
+
+error:
+ lut->error = true;
+ pl_cache_obj_free(&obj);
+ return NULL_IDENT;
+}
diff --git a/src/shaders/meson.build b/src/shaders/meson.build
new file mode 100644
index 0000000..746747c
--- /dev/null
+++ b/src/shaders/meson.build
@@ -0,0 +1,23 @@
+shader_sources = [
+ 'colorspace.c',
+ 'custom.c',
+ 'custom_mpv.c',
+ 'deinterlacing.c',
+ 'dithering.c',
+ 'film_grain.c',
+ 'film_grain_av1.c',
+ 'film_grain_h274.c',
+ 'icc.c',
+ 'lut.c',
+ 'sampling.c',
+]
+
+foreach s : shader_sources
+ sources += custom_target(s,
+ command: glsl_preproc,
+ depend_files: glsl_deps,
+ env: python_env,
+ input: s,
+ output: s,
+ )
+endforeach
diff --git a/src/shaders/sampling.c b/src/shaders/sampling.c
new file mode 100644
index 0000000..fc10f80
--- /dev/null
+++ b/src/shaders/sampling.c
@@ -0,0 +1,1198 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "shaders.h"
+
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders/sampling.h>
+
+const struct pl_deband_params pl_deband_default_params = { PL_DEBAND_DEFAULTS };
+
+static inline struct pl_tex_params src_params(const struct pl_sample_src *src)
+{
+ if (src->tex)
+ return src->tex->params;
+
+ return (struct pl_tex_params) {
+ .w = src->tex_w,
+ .h = src->tex_h,
+ };
+}
+
+enum filter {
+ NEAREST = PL_TEX_SAMPLE_NEAREST,
+ LINEAR = PL_TEX_SAMPLE_LINEAR,
+ BEST,
+ FASTEST,
+};
+
+// Helper function to compute the src/dst sizes and upscaling ratios
+static bool setup_src(pl_shader sh, const struct pl_sample_src *src,
+ ident_t *src_tex, ident_t *pos, ident_t *pt,
+ float *ratio_x, float *ratio_y, uint8_t *comp_mask,
+ float *scale, bool resizeable,
+ enum filter filter)
+{
+ enum pl_shader_sig sig;
+ float src_w, src_h;
+ enum pl_tex_sample_mode sample_mode;
+ if (src->tex) {
+ pl_fmt fmt = src->tex->params.format;
+ bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR;
+ pl_assert(pl_tex_params_dimension(src->tex->params) == 2);
+ sig = PL_SHADER_SIG_NONE;
+ src_w = pl_rect_w(src->rect);
+ src_h = pl_rect_h(src->rect);
+ switch (filter) {
+ case FASTEST:
+ case NEAREST:
+ sample_mode = PL_TEX_SAMPLE_NEAREST;
+ break;
+ case LINEAR:
+ if (!can_linear) {
+ SH_FAIL(sh, "Trying to use a shader that requires linear "
+ "sampling with a texture whose format (%s) does not "
+ "support PL_FMT_CAP_LINEAR", fmt->name);
+ return false;
+ }
+ sample_mode = PL_TEX_SAMPLE_LINEAR;
+ break;
+ case BEST:
+ sample_mode = can_linear ? PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST;
+ break;
+ }
+ } else {
+ pl_assert(src->tex_w && src->tex_h);
+ sig = PL_SHADER_SIG_SAMPLER;
+ src_w = src->sampled_w;
+ src_h = src->sampled_h;
+ if (filter == BEST || filter == FASTEST) {
+ sample_mode = src->mode;
+ } else {
+ sample_mode = (enum pl_tex_sample_mode) filter;
+ if (sample_mode != src->mode) {
+ SH_FAIL(sh, "Trying to use a shader that requires a different "
+ "filter mode than the external sampler.");
+ return false;
+ }
+ }
+ }
+
+ src_w = PL_DEF(src_w, src_params(src).w);
+ src_h = PL_DEF(src_h, src_params(src).h);
+ pl_assert(src_w && src_h);
+
+ int out_w = PL_DEF(src->new_w, roundf(fabs(src_w)));
+ int out_h = PL_DEF(src->new_h, roundf(fabs(src_h)));
+ pl_assert(out_w && out_h);
+
+ if (ratio_x)
+ *ratio_x = out_w / fabs(src_w);
+ if (ratio_y)
+ *ratio_y = out_h / fabs(src_h);
+ if (scale)
+ *scale = PL_DEF(src->scale, 1.0);
+
+ if (comp_mask) {
+ uint8_t tex_mask = 0x0Fu;
+ if (src->tex) {
+ // Mask containing only the number of components in the texture
+ tex_mask = (1 << src->tex->params.format->num_components) - 1;
+ }
+
+ uint8_t src_mask = src->component_mask;
+ if (!src_mask)
+ src_mask = (1 << PL_DEF(src->components, 4)) - 1;
+
+ // Only actually sample components that are both requested and
+ // available in the texture being sampled
+ *comp_mask = tex_mask & src_mask;
+ }
+
+ if (resizeable)
+ out_w = out_h = 0;
+ if (!sh_require(sh, sig, out_w, out_h))
+ return false;
+
+ if (src->tex) {
+ pl_rect2df rect = {
+ .x0 = src->rect.x0,
+ .y0 = src->rect.y0,
+ .x1 = src->rect.x0 + src_w,
+ .y1 = src->rect.y0 + src_h,
+ };
+
+ *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode,
+ "src_tex", &rect, pos, pt);
+ } else {
+ if (pt) {
+ float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h;
+ if (src->sampler == PL_SAMPLER_RECT)
+ sx = sy = 1.0;
+
+ *pt = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tex_pt"),
+ .data = &(float[2]) { sx, sy },
+ });
+ }
+
+ sh->sampler_type = src->sampler;
+
+ pl_assert(src->format);
+ switch (src->format) {
+ case PL_FMT_UNKNOWN:
+ case PL_FMT_FLOAT:
+ case PL_FMT_UNORM:
+ case PL_FMT_SNORM: sh->sampler_prefix = ' '; break;
+ case PL_FMT_UINT: sh->sampler_prefix = 'u'; break;
+ case PL_FMT_SINT: sh->sampler_prefix = 's'; break;
+ case PL_FMT_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ *src_tex = sh_fresh(sh, "src_tex");
+ *pos = sh_fresh(sh, "pos");
+
+ GLSLH("#define "$" src_tex \n"
+ "#define "$" pos \n",
+ *src_tex, *pos);
+ }
+
+ return true;
+}
+
+void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_deband_params *params)
+{
+ float scale;
+ ident_t tex, pos, pt;
+ uint8_t mask;
+ if (!setup_src(sh, src, &tex, &pos, &pt, NULL, NULL, &mask, &scale, false, LINEAR))
+ return;
+
+ params = PL_DEF(params, &pl_deband_default_params);
+ sh_describe(sh, "debanding");
+ GLSL("vec4 color; \n"
+ "// pl_shader_deband \n"
+ "{ \n"
+ "vec2 pos = "$", pt = "$"; \n"
+ "color = textureLod("$", pos, 0.0);\n",
+ pos, pt, tex);
+
+ mask &= ~0x8u; // ignore alpha channel
+ uint8_t num_comps = sh_num_comps(mask);
+ const char *swiz = sh_swizzle(mask);
+ pl_assert(num_comps <= 3);
+ if (!num_comps) {
+ GLSL("color *= "$"; \n"
+ "} \n",
+ SH_FLOAT(scale));
+ return;
+ }
+
+ GLSL("#define GET(X, Y) \\\n"
+ " (textureLod("$", pos + pt * vec2(X, Y), 0.0).%s) \n"
+ "#define T %s \n",
+ tex, swiz, sh_float_type(mask));
+
+ ident_t prng = sh_prng(sh, true, NULL);
+ GLSL("T avg, diff, bound; \n"
+ "T res = color.%s; \n"
+ "vec2 d; \n",
+ swiz);
+
+ if (params->iterations > 0) {
+ ident_t radius = sh_const_float(sh, "radius", params->radius);
+ ident_t threshold = sh_const_float(sh, "threshold",
+ params->threshold / (1000 * scale));
+
+ // For each iteration, compute the average at a given distance and
+ // pick it instead of the color if the difference is below the threshold.
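+        // The sampling distance scales with the iteration index (up to
+        // i * radius), while the angle can cover the full circle, so later
+        // iterations probe progressively larger neighbourhoods.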
+ for (int i = 1; i <= params->iterations; i++) {
+ GLSL(// Compute a random angle and distance
+ "d = "$".xy * vec2(%d.0 * "$", %f); \n"
+ "d = d.x * vec2(cos(d.y), sin(d.y)); \n"
+ // Sample at quarter-turn intervals around the source pixel
+ "avg = T(0.0); \n"
+ "avg += GET(+d.x, +d.y); \n"
+ "avg += GET(-d.x, +d.y); \n"
+ "avg += GET(-d.x, -d.y); \n"
+ "avg += GET(+d.x, -d.y); \n"
+ "avg *= 0.25; \n"
+ // Compare the (normalized) average against the pixel
+ "diff = abs(res - avg); \n"
+ "bound = T("$" / %d.0); \n",
+ prng, i, radius, M_PI * 2,
+ threshold, i);
+
+ if (num_comps > 1) {
+ GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n");
+ } else {
+ GLSL("res = mix(avg, res, diff > bound); \n");
+ }
+ }
+ }
+
+ // Add some random noise to smooth out residual differences
+ if (params->grain > 0) {
+ // Avoid adding grain near true black
+ GLSL("bound = T(\n");
+ for (int c = 0; c < num_comps; c++) {
+ GLSL("%c"$, c > 0 ? ',' : ' ',
+ SH_FLOAT(params->grain_neutral[c] / scale));
+ }
+ GLSL("); \n"
+ "T strength = min(abs(res - bound), "$"); \n"
+ "res += strength * (T("$") - T(0.5)); \n",
+ SH_FLOAT(params->grain / (1000.0 * scale)), prng);
+ }
+
+ GLSL("color.%s = res; \n"
+ "color *= "$"; \n"
+ "#undef T \n"
+ "#undef GET \n"
+ "} \n",
+ swiz, SH_FLOAT(scale));
+}
+
+bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src)
+{
+ float scale;
+ ident_t tex, pos;
+ if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, BEST))
+ return false;
+
+ GLSL("// pl_shader_sample_direct \n"
+ "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+ SH_FLOAT(scale), tex, pos);
+ return true;
+}
+
+bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src)
+{
+ float scale;
+ ident_t tex, pos;
+ if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, NEAREST))
+ return false;
+
+ sh_describe(sh, "nearest");
+ GLSL("// pl_shader_sample_nearest \n"
+ "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+ SH_FLOAT(scale), tex, pos);
+ return true;
+}
+
+bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src)
+{
+ float scale;
+ ident_t tex, pos;
+ if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, LINEAR))
+ return false;
+
+ sh_describe(sh, "bilinear");
+ GLSL("// pl_shader_sample_bilinear \n"
+ "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n",
+ SH_FLOAT(scale), tex, pos);
+ return true;
+}
+
+bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) {
+ PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ // Explanation of how bicubic scaling with only 4 texel fetches is done:
+ // http://www.mate.tue.nl/mate/pdfs/10318.pdf
+ // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
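+    //
+    // In short: the cubic B-spline weights are all non-negative, so each
+    // adjacent pair of the 4 taps per axis can be folded into a single
+    // bilinear fetch at a fractional offset, reducing the 4x4 footprint to
+    // 2x2 bilinearly filtered samples.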
+
+ sh_describe(sh, "bicubic");
+#pragma GLSL /* pl_shader_sample_bicubic */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
+ vec2 frac2 = frac * frac; \
+ vec2 inv = vec2(1.0) - frac; \
+ vec2 inv2 = inv * inv; \
+ /* compute filter weights directly */ \
+ vec2 w0 = 1.0/6.0 * inv2 * inv; \
+ vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \
+ vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \
+ vec2 w3 = 1.0/6.0 * frac2 * frac; \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g + inv.xyxy; \
+ h.xy -= vec2(2.0); \
+ /* sample four corners, then interpolate */ \
+ vec4 p = pos.xyxy + $pt.xyxy * h; \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = ${float:scale} * mix(c1, c0, g.x); \
+ }
+
+ return true;
+}
+
+bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) {
+ PL_TRACE(sh, "Using fast hermite sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ sh_describe(sh, "hermite");
+#pragma GLSL /* pl_shader_sample_hermite */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
+ pos += $pt * (smoothstep(0.0, 1.0, frac) - frac); \
+ color = ${float:scale} * textureLod($tex, pos, 0.0); \
+ }
+
+ return true;
+}
+
+bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ if (rx < 1 || ry < 1) {
+ PL_TRACE(sh, "Using fast gaussian sampling when downscaling. This "
+ "will most likely result in nasty aliasing!");
+ }
+
+ sh_describe(sh, "gaussian");
+#pragma GLSL /* pl_shader_sample_gaussian */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 off = -fract(pos * size + vec2(0.5)); \
+ vec2 off2 = -2.0 * off * off; \
+ /* compute gaussian weights */ \
+ vec2 w0 = exp(off2 + 4.0 * off - vec2(2.0)); \
+ vec2 w1 = exp(off2); \
+ vec2 w2 = exp(off2 - 4.0 * off - vec2(2.0)); \
+ vec2 w3 = exp(off2 - 8.0 * off - vec2(8.0)); \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g; \
+ h.xy -= vec2(1.0); \
+ h.zw += vec2(1.0); \
+ g.xy /= g.xy + g.zw; /* explicitly normalize */ \
+ /* sample four corners, then interpolate */ \
+ vec4 p = pos.xyxy + $pt.xyxy * (h + off.xyxy); \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = ${float:scale} * mix(c1, c0, g.x); \
+ }
+
+ return true;
+}
+
+bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
+ float threshold)
+{
+ ident_t tex, pos, pt;
+ float rx, ry, scale;
+ if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
+ return false;
+
+ threshold = PL_CLAMP(threshold, 0.0f, 0.5f);
+ sh_describe(sh, "oversample");
+ #pragma GLSL /* pl_shader_sample_oversample */ \
+ vec4 color; \
+ { \
+ vec2 pos = $pos; \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ /* Round the position to the nearest pixel */ \
+ vec2 fcoord = fract(pos * size - vec2(0.5)); \
+ float rx = ${dynamic float:rx}; \
+ float ry = ${dynamic float:ry}; \
+ vec2 coeff = (fcoord - vec2(0.5)) * vec2(rx, ry); \
+ coeff = clamp(coeff + vec2(0.5), 0.0, 1.0); \
+ @if (threshold > 0) { \
+ float thresh = ${float:threshold}; \
+ coeff = mix(coeff, vec2(0.0), \
+ lessThan(coeff, vec2(thresh))); \
+ coeff = mix(coeff, vec2(1.0), \
+ greaterThan(coeff, vec2(1.0 - thresh))); \
+ @} \
+ \
+ /* Compute the right output blend of colors */ \
+ pos += (coeff - fcoord) * $pt; \
+ color = ${float:scale} * textureLod($tex, pos, 0.0); \
+ }
+
+ return true;
+}
+
+static void describe_filter(pl_shader sh, const struct pl_filter_config *cfg,
+ const char *stage, float rx, float ry)
+{
+ const char *dir;
+ if (rx > 1 && ry > 1) {
+ dir = "up";
+ } else if (rx < 1 && ry < 1) {
+ dir = "down";
+ } else if (rx == 1 && ry == 1) {
+ dir = "noop";
+ } else {
+ dir = "ana";
+ }
+
+ if (cfg->name) {
+ sh_describef(sh, "%s %sscaling (%s)", stage, dir, cfg->name);
+ } else if (cfg->window) {
+ sh_describef(sh, "%s %sscaling (%s+%s)", stage, dir,
+ PL_DEF(cfg->kernel->name, "unknown"),
+ PL_DEF(cfg->window->name, "unknown"));
+ } else {
+ sh_describef(sh, "%s %sscaling (%s)", stage, dir,
+ PL_DEF(cfg->kernel->name, "unknown"));
+ }
+}
+
+// Subroutine for computing and adding an individual texel contribution
+// If `in` is NULL, samples directly
+// If `in` is set, takes the pixel from in_X[idx], where X is the component index,
+// `in` is the given identifier, and `idx` must be defined by the caller
+static void polar_sample(pl_shader sh, pl_filter filter,
+ ident_t tex, ident_t lut, ident_t radius,
+ int x, int y, uint8_t comp_mask, ident_t in,
+ bool use_ar, ident_t scale)
+{
+ // Since we can't know the subpixel position in advance, assume a
+ // worst case scenario
+ int yy = y > 0 ? y-1 : y;
+ int xx = x > 0 ? x-1 : x;
+ float dmin = sqrt(xx*xx + yy*yy);
+ // Skip samples definitely outside the radius
+ if (dmin >= filter->radius)
+ return;
+
+ // Check for samples that might be skippable
+ bool maybe_skippable = dmin >= filter->radius - M_SQRT2;
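+    // (the true distance can exceed the conservative dmin by at most sqrt(2),
+    // so samples with dmin < radius - sqrt(2) always land inside the radius
+    // and need no runtime check)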
+
+ // Check for samples that definitely won't contribute to anti-ringing
+ const float ar_radius = filter->radius_zero;
+ use_ar &= dmin < ar_radius;
+
+#pragma GLSL \
+ offset = ivec2(${const int: x}, ${const int: y}); \
+ d = length(vec2(offset) - fcoord); \
+ @if (maybe_skippable) \
+ if (d < $radius) { \
+ w = $lut(d * 1.0 / $radius); \
+ wsum += w; \
+ @if (in != NULL_IDENT) { \
+ @for (c : comp_mask) \
+ c[@c] = ${in}_@c[idx]; \
+ @} else { \
+ c = textureLod($tex, base + pt * vec2(offset), 0.0); \
+ @} \
+ @for (c : comp_mask) \
+ color[@c] += w * c[@c]; \
+ @if (use_ar) { \
+ if (d <= ${const float: ar_radius}) { \
+ @for (c : comp_mask) { \
+ cc = vec2($scale * c[@c]); \
+ cc.x = 1.0 - cc.x; \
+ ww = cc + vec2(0.10); \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = ww * ww; \
+ ww = w * ww; \
+ ar@c += ww * cc; \
+ wwsum@c += ww; \
+ @} \
+ } \
+ @} \
+ @if (maybe_skippable) \
+ }
+}
+
+struct sh_sampler_obj {
+ pl_filter filter;
+ pl_shader_obj lut;
+ pl_shader_obj pass2; // for pl_shader_sample_ortho
+};
+
+#define SCALER_LUT_SIZE 256
+#define SCALER_LUT_CUTOFF 1e-3f
+
+static void sh_sampler_uninit(pl_gpu gpu, void *ptr)
+{
+ struct sh_sampler_obj *obj = ptr;
+ pl_shader_obj_destroy(&obj->lut);
+ pl_shader_obj_destroy(&obj->pass2);
+ pl_filter_free(&obj->filter);
+ *obj = (struct sh_sampler_obj) {0};
+}
+
+static void fill_polar_lut(void *data, const struct sh_lut_params *params)
+{
+ const struct sh_sampler_obj *obj = params->priv;
+ pl_filter filt = obj->filter;
+
+ pl_assert(params->width == filt->params.lut_entries && params->comps == 1);
+ memcpy(data, filt->weights, params->width * sizeof(float));
+}
+
+bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_sample_filter_params *params)
+{
+ pl_assert(params);
+ if (!params->filter.polar) {
+ SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?");
+ return false;
+ }
+
+ uint8_t cmask;
+ float rx, ry, scalef;
+ ident_t src_tex, pos, pt, scale;
+ if (!setup_src(sh, src, &src_tex, &pos, &pt, &rx, &ry, &cmask, &scalef, false, FASTEST))
+ return false;
+
+ struct sh_sampler_obj *obj;
+ obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj,
+ sh_sampler_uninit);
+ if (!obj)
+ return false;
+
+ float inv_scale = 1.0 / PL_MIN(rx, ry);
+ inv_scale = PL_MAX(inv_scale, 1.0);
+ if (params->no_widening)
+ inv_scale = 1.0;
+ scale = sh_const_float(sh, "scale", scalef);
+
+ struct pl_filter_config cfg = params->filter;
+ cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
+ cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
+ bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg);
+ if (update) {
+ pl_filter_free(&obj->filter);
+ obj->filter = pl_filter_generate(sh->log, pl_filter_params(
+ .config = cfg,
+ .lut_entries = SCALER_LUT_SIZE,
+ .cutoff = SCALER_LUT_CUTOFF,
+ ));
+
+ if (!obj->filter) {
+ // This should never happen, but just in case ..
+ SH_FAIL(sh, "Failed initializing polar filter!");
+ return false;
+ }
+ }
+
+ describe_filter(sh, &cfg, "polar", rx, ry);
+ GLSL("// pl_shader_sample_polar \n"
+ "vec4 color = vec4(0.0); \n"
+ "{ \n"
+ "vec2 pos = "$", pt = "$"; \n"
+ "vec2 size = vec2(textureSize("$", 0)); \n"
+ "vec2 fcoord = fract(pos * size - vec2(0.5)); \n"
+ "vec2 base = pos - pt * fcoord; \n"
+ "vec2 center = base + pt * vec2(0.5); \n"
+ "ivec2 offset; \n"
+ "float w, d, wsum = 0.0; \n"
+ "int idx; \n"
+ "vec4 c; \n",
+ pos, pt, src_tex);
+
+ bool use_ar = cfg.antiring > 0;
+ if (use_ar) {
+#pragma GLSL \
+ vec2 ww, cc; \
+ @for (c : cmask) \
+ vec2 ar@c = vec2(0.0), wwsum@c = vec2(0.0);
+ }
+
+ int bound = ceil(obj->filter->radius);
+ int offset = bound - 1; // padding top/left
+ int padding = offset + bound; // total padding
+
+ // Determined experimentally on modern AMD and Nvidia hardware. 32 is a
+ // good tradeoff for the horizontal work group size. Apart from that,
+ // just use as many threads as possible.
+ const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw;
+
+ // We need to sample everything from base_min to base_max, so make sure we
+ // have enough room in shmem. The extra margin on the ceilf guards against
+ // floating point inaccuracy on near-integer scaling ratios.
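+    //
+    // For example, a filter with radius 2.0 gives bound = 2 and padding = 3;
+    // with bw = 32 and rx = 2.0 this works out to iw = 16 + 3 + 1 = 20 texels
+    // per work group row.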
+ const float margin = 1e-5;
+ int iw = (int) ceilf(bw / rx - margin) + padding + 1,
+ ih = (int) ceilf(bh / ry - margin) + padding + 1;
+ int sizew = iw, sizeh = ih;
+
+ pl_gpu gpu = SH_GPU(sh);
+ bool dynamic_size = SH_PARAMS(sh).dynamic_constants ||
+ !gpu || !gpu->limits.array_size_constants;
+ if (dynamic_size) {
+ // Overallocate the array slightly to reduce recompilation overhead
+ sizew = PL_ALIGN2(sizew, 8);
+ sizeh = PL_ALIGN2(sizeh, 8);
+ }
+
+ int num_comps = __builtin_popcount(cmask);
+ int shmem_req = (sizew * sizeh * num_comps + 2) * sizeof(float);
+ bool is_compute = !params->no_compute && sh_glsl(sh).compute &&
+ sh_try_compute(sh, bw, bh, false, shmem_req);
+
+ // Note: SH_LUT_LITERAL might be faster in some specific cases, but not by
+ // much, and it's catastrophically slow on other platforms.
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut,
+ .lut_type = SH_LUT_TEXTURE,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_LINEAR,
+ .width = SCALER_LUT_SIZE,
+ .comps = 1,
+ .update = update,
+ .fill = fill_polar_lut,
+ .priv = obj,
+ ));
+
+ if (!lut) {
+ SH_FAIL(sh, "Failed initializing polar LUT!");
+ return false;
+ }
+
+ ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius);
+ ident_t in = sh_fresh(sh, "in");
+
+ if (is_compute) {
+
+ // Compute shader kernel
+ GLSL("uvec2 base_id = uvec2(0u); \n");
+ if (src->rect.x0 > src->rect.x1)
+ GLSL("base_id.x = gl_WorkGroupSize.x - 1u; \n");
+ if (src->rect.y0 > src->rect.y1)
+ GLSL("base_id.y = gl_WorkGroupSize.y - 1u; \n");
+
+ GLSLH("shared vec2 "$"_base; \n", in);
+ GLSL("if (gl_LocalInvocationID.xy == base_id) \n"
+ " "$"_base = base; \n"
+ "barrier(); \n"
+ "ivec2 rel = ivec2(round((base - "$"_base) * size)); \n",
+ in, in);
+
+ ident_t sizew_c = sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_SINT,
+ .compile_time = true,
+ .name = "sizew",
+ .data = &sizew,
+ });
+
+ ident_t sizeh_c = sh_const(sh, (struct pl_shader_const) {
+ .type = PL_VAR_SINT,
+ .compile_time = true,
+ .name = "sizeh",
+ .data = &sizeh,
+ });
+
+ ident_t iw_c = sizew_c, ih_c = sizeh_c;
+ if (dynamic_size) {
+ iw_c = sh_const_int(sh, "iw", iw);
+ ih_c = sh_const_int(sh, "ih", ih);
+ }
+
+ // Load all relevant texels into shmem
+ GLSL("for (int y = int(gl_LocalInvocationID.y); y < "$"; y += %d) { \n"
+ "for (int x = int(gl_LocalInvocationID.x); x < "$"; x += %d) { \n"
+ "c = textureLod("$", "$"_base + pt * vec2(x - %d, y - %d), 0.0); \n",
+ ih_c, bh, iw_c, bw, src_tex, in, offset, offset);
+
+ for (uint8_t comps = cmask; comps;) {
+ uint8_t c = __builtin_ctz(comps);
+ GLSLH("shared float "$"_%d["$" * "$"]; \n", in, c, sizeh_c, sizew_c);
+ GLSL(""$"_%d["$" * y + x] = c[%d]; \n", in, c, sizew_c, c);
+ comps &= ~(1 << c);
+ }
+
+ GLSL("}} \n"
+ "barrier(); \n");
+
+ // Dispatch the actual samples
+ for (int y = 1 - bound; y <= bound; y++) {
+ for (int x = 1 - bound; x <= bound; x++) {
+ GLSL("idx = "$" * rel.y + rel.x + "$" * %d + %d; \n",
+ sizew_c, sizew_c, y + offset, x + offset);
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x, y, cmask, in, use_ar, scale);
+ }
+ }
+ } else {
+ // Fragment shader sampling
+ for (uint8_t comps = cmask; comps;) {
+ uint8_t c = __builtin_ctz(comps);
+ GLSL("vec4 "$"_%d; \n", in, c);
+ comps &= ~(1 << c);
+ }
+
+ // For maximum efficiency, we want to use textureGather() if
+ // possible, rather than direct sampling. Since this is not
+ // always possible/sensible, we need to possibly intermix gathering
+ // with regular sampling. This requires keeping track of which
+ // pixels in the next row were already gathered by the previous
+ // row.
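+        //
+        // Each gather at offset (x, y) covers the 2x2 quad spanning
+        // (x, y) .. (x+1, y+1), so in addition to skipping the adjacent texel
+        // in the current row, the two covered bits of the next row get
+        // recorded in `gathered_next`.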
+ uint32_t gathered_cur = 0x0, gathered_next = 0x0;
+ const float radius2 = PL_SQUARE(obj->filter->radius);
+ const int base = bound - 1;
+
+ if (base + bound >= 8 * sizeof(gathered_cur)) {
+ SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!",
+ obj->filter->radius);
+ return false;
+ }
+
+ for (int y = 1 - bound; y <= bound; y++) {
+ for (int x = 1 - bound; x <= bound; x++) {
+ // Skip already gathered texels
+ uint32_t bit = 1llu << (base + x);
+ if (gathered_cur & bit)
+ continue;
+
+ // Using texture gathering is only more efficient than direct
+ // sampling in the case where we expect to be able to use all
+ // four gathered texels, without having to discard any. So
+ // only do it if we suspect it will be a win rather than a
+ // loss.
+ int xx = x*x, xx1 = (x+1)*(x+1);
+ int yy = y*y, yy1 = (y+1)*(y+1);
+ bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2;
+ use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset;
+ use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset;
+ use_gather &= !src->tex || src->tex->params.format->gatherable;
+
+ // Gathering from components other than the R channel requires
+ // support for GLSL 400, which introduces the overload of
+ // textureGather* that allows specifying the component.
+ //
+ // This is also the minimum requirement if we don't know the
+ // texture format capabilities, for the sampler2D interface
+ if (cmask != 0x1 || !src->tex)
+ use_gather &= sh_glsl(sh).version >= 400;
+
+ if (!use_gather) {
+ // Switch to direct sampling instead
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x, y, cmask, NULL_IDENT, use_ar, scale);
+ continue;
+ }
+
+ // Gather the four surrounding texels simultaneously
+ for (uint8_t comps = cmask; comps;) {
+ uint8_t c = __builtin_ctz(comps);
+ if (x || y) {
+ if (c) {
+ GLSL($"_%d = textureGatherOffset("$", "
+ "center, ivec2(%d, %d), %d); \n",
+ in, c, src_tex, x, y, c);
+ } else {
+ GLSL($"_0 = textureGatherOffset("$", "
+ "center, ivec2(%d, %d)); \n",
+ in, src_tex, x, y);
+ }
+ } else {
+ if (c) {
+ GLSL($"_%d = textureGather("$", center, %d); \n",
+ in, c, src_tex, c);
+ } else {
+ GLSL($"_0 = textureGather("$", center); \n",
+ in, src_tex);
+ }
+ }
+ comps &= ~(1 << c);
+ }
+
+ // Mix in all of the points with their weights
+ for (int p = 0; p < 4; p++) {
+ // The four texels are gathered counterclockwise starting
+ // from the bottom left
+ static const int xo[4] = {0, 1, 1, 0};
+ static const int yo[4] = {1, 1, 0, 0};
+ if (x+xo[p] > bound || y+yo[p] > bound)
+ continue; // next subpixel
+
+ GLSL("idx = %d;\n", p);
+ polar_sample(sh, obj->filter, src_tex, lut, radius_c,
+ x+xo[p], y+yo[p], cmask, in, use_ar, scale);
+ }
+
+                // Mark the next row's covered pixels as already gathered
+ gathered_next |= bit | (bit << 1);
+ x++; // skip adjacent pixel
+ }
+
+ // Prepare for new row
+ gathered_cur = gathered_next;
+ gathered_next = 0;
+ }
+ }
+
+#pragma GLSL \
+ color = $scale / wsum * color; \
+ @if (use_ar) { \
+ @for (c : cmask) { \
+ ww = ar@c / wwsum@c; \
+ ww.x = 1.0 - ww.x; \
+ w = clamp(color[@c], ww.x, ww.y); \
+ w = mix(w, dot(ww, vec2(0.5)), ww.x > ww.y); \
+ color[@c] = mix(color[@c], w, ${float:cfg.antiring}); \
+ @} \
+ @} \
+ @if (!(cmask & (1 << PL_CHANNEL_A))) \
+ color.a = 1.0; \
+ }
+
+ return true;
+}
+
+static void fill_ortho_lut(void *data, const struct sh_lut_params *params)
+{
+ const struct sh_sampler_obj *obj = params->priv;
+ pl_filter filt = obj->filter;
+
+ if (filt->radius == filt->radius_zero) {
+ // Main lobe covers entire radius, so all weights are positive, meaning
+ // we can use the linear resampling trick
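+        //
+        // Each adjacent pair (w0, w1) is stored as (w0 + w1, w1 / (w0 + w1)):
+        // the combined weight plus the fractional offset at which a single
+        // linearly filtered fetch reproduces both taps at once.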
+ for (int n = 0; n < SCALER_LUT_SIZE; n++) {
+ const float *weights = filt->weights + n * filt->row_stride;
+ float *row = (float *) data + n * filt->row_stride;
+ pl_assert(filt->row_size % 2 == 0);
+ for (int i = 0; i < filt->row_size; i += 2) {
+ const float w0 = weights[i], w1 = weights[i+1];
+ assert(w0 + w1 >= 0.0f);
+ row[i] = w0 + w1;
+ row[i+1] = w1 / (w0 + w1);
+ }
+ }
+ } else {
+ size_t entries = SCALER_LUT_SIZE * filt->row_stride;
+ pl_assert(params->width * params->height * params->comps == entries);
+ memcpy(data, filt->weights, entries * sizeof(float));
+ }
+}
+
+enum {
+ SEP_VERT = 0,
+ SEP_HORIZ,
+ SEP_PASSES
+};
+
+bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src,
+ const struct pl_sample_filter_params *params)
+{
+ pl_assert(params);
+ if (params->filter.polar) {
+ SH_FAIL(sh, "Trying to use separated sampling with a polar filter?");
+ return false;
+ }
+
+ pl_gpu gpu = SH_GPU(sh);
+ pl_assert(gpu);
+
+ uint8_t comps;
+ float ratio[SEP_PASSES], scale;
+ ident_t src_tex, pos, pt;
+ if (!setup_src(sh, src, &src_tex, &pos, &pt,
+ &ratio[SEP_HORIZ], &ratio[SEP_VERT],
+ &comps, &scale, false, LINEAR))
+ return false;
+
+
+ int pass;
+ if (fabs(ratio[SEP_HORIZ] - 1.0f) < 1e-6f) {
+ pass = SEP_VERT;
+ } else if (fabs(ratio[SEP_VERT] - 1.0f) < 1e-6f) {
+ pass = SEP_HORIZ;
+ } else {
+ SH_FAIL(sh, "Trying to use pl_shader_sample_ortho with a "
+ "pl_sample_src that requires scaling in multiple directions "
+ "(rx=%f, ry=%f), this is not possible!",
+ ratio[SEP_HORIZ], ratio[SEP_VERT]);
+ return false;
+ }
+
+ // We can store a separate sampler object per dimension, so dispatch the
+ // right one. This is needed for two reasons:
+ // 1. Anamorphic content can have a different scaling ratio for each
+ // dimension. In particular, you could be upscaling in one and
+ // downscaling in the other.
+ // 2. After fixing the source for `setup_src`, we lose information about
+ // the scaling ratio of the other component. (Although this is only a
+ // minor reason and could easily be changed with some boilerplate)
+ struct sh_sampler_obj *obj;
+ obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER,
+ struct sh_sampler_obj, sh_sampler_uninit);
+ if (!obj)
+ return false;
+
+ if (pass != 0) {
+ obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER,
+ struct sh_sampler_obj, sh_sampler_uninit);
+ assert(obj);
+ }
+
+ float inv_scale = 1.0 / ratio[pass];
+ inv_scale = PL_MAX(inv_scale, 1.0);
+ if (params->no_widening)
+ inv_scale = 1.0;
+
+ struct pl_filter_config cfg = params->filter;
+ cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
+ cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
+ bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg);
+
+ if (update) {
+ pl_filter_free(&obj->filter);
+ obj->filter = pl_filter_generate(sh->log, pl_filter_params(
+ .config = cfg,
+ .lut_entries = SCALER_LUT_SIZE,
+ .max_row_size = gpu->limits.max_tex_2d_dim / 4,
+ .row_stride_align = 4,
+ ));
+
+ if (!obj->filter) {
+ // This should never happen, but just in case ..
+ SH_FAIL(sh, "Failed initializing separated filter!");
+ return false;
+ }
+ }
+
+ int N = obj->filter->row_size; // number of samples to convolve
+ int width = obj->filter->row_stride / 4; // width of the LUT texture
+ ident_t lut = sh_lut(sh, sh_lut_params(
+ .object = &obj->lut,
+ .var_type = PL_VAR_FLOAT,
+ .method = SH_LUT_LINEAR,
+ .width = width,
+ .height = SCALER_LUT_SIZE,
+ .comps = 4,
+ .update = update,
+ .fill = fill_ortho_lut,
+ .priv = obj,
+ ));
+ if (!lut) {
+ SH_FAIL(sh, "Failed initializing separated LUT!");
+ return false;
+ }
+
+ const int dir[SEP_PASSES][2] = {
+ [SEP_HORIZ] = {1, 0},
+ [SEP_VERT] = {0, 1},
+ };
+
+ static const char *names[SEP_PASSES] = {
+ [SEP_HORIZ] = "ortho (horiz)",
+ [SEP_VERT] = "ortho (vert)",
+ };
+
+ describe_filter(sh, &cfg, names[pass], ratio[pass], ratio[pass]);
+
+ float denom = PL_MAX(1, width - 1); // avoid division by zero
+ bool use_ar = cfg.antiring > 0 && ratio[pass] > 1.0;
+ bool use_linear = obj->filter->radius == obj->filter->radius_zero;
+ use_ar &= !use_linear; // filter has no negative weights
+
+#pragma GLSL /* pl_shader_sample_ortho */ \
+ vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \
+ { \
+ vec2 pos = $pos, pt = $pt; \
+ vec2 size = vec2(textureSize($src_tex, 0)); \
+ vec2 dir = vec2(${const float:dir[pass][0]}, ${const float: dir[pass][1]}); \
+ pt *= dir; \
+ vec2 fcoord2 = fract(pos * size - vec2(0.5)); \
+ float fcoord = dot(fcoord2, dir); \
+ vec2 base = pos - fcoord * pt - pt * vec2(${const float: N / 2 - 1}); \
+ vec4 ws; \
+ float off; \
+ ${vecType: comps} c, ca = ${vecType: comps}(0.0); \
+ @if (use_ar) { \
+ ${vecType: comps} hi = ${vecType: comps}(0.0); \
+ ${vecType: comps} lo = ${vecType: comps}(1e9); \
+ @} \
+ @for (n < N) { \
+ @if @(n % 4 == 0) \
+ ws = $lut(vec2(float(@n / 4) / ${const float: denom}, fcoord)); \
+ @if @(vars.use_ar && (n == vars.n / 2 - 1 || n == vars.n / 2)) { \
+ c = textureLod($src_tex, base + pt * @n.0, 0.0).${swizzle: comps}; \
+ ca += ws[@n % 4] * c; \
+ lo = min(lo, c); \
+ hi = max(hi, c); \
+ @} else { \
+ @if (use_linear) { \
+ @if @(n % 2 == 0) { \
+ off = @n.0 + ws[@n % 4 + 1]; \
+ ca += ws[@n % 4] * textureLod($src_tex, base + pt * off, \
+ 0.0).${swizzle: comps}; \
+ @} \
+ @} else { \
+ ca += ws[@n % 4] * textureLod($src_tex, base + pt * @n.0, \
+ 0.0).${swizzle: comps}; \
+ @} \
+ @} \
+ @} \
+ @if (use_ar) \
+ ca = mix(ca, clamp(ca, lo, hi), ${float: cfg.antiring}); \
+ color.${swizzle: comps} = ${float: scale} * ca; \
+ }
+
+ return true;
+}
+
+const struct pl_distort_params pl_distort_default_params = { PL_DISTORT_DEFAULTS };
+
+void pl_shader_distort(pl_shader sh, pl_tex src_tex, int out_w, int out_h,
+ const struct pl_distort_params *params)
+{
+ pl_assert(params);
+ if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h))
+ return;
+
+ const int src_w = src_tex->params.w, src_h = src_tex->params.h;
+ float rx = 1.0f, ry = 1.0f;
+ if (src_w > src_h) {
+ ry = (float) src_h / src_w;
+ } else {
+ rx = (float) src_w / src_h;
+ }
+
+ // Map from texel coordinates [0,1]² to aspect-normalized representation
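+    // (e.g. a 1920x1080 texture maps to [-1,1] x [-0.5625,0.5625], with the
+    // y axis flipped so that it points upwards)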
+ const pl_transform2x2 tex2norm = {
+ .mat.m = {
+ { 2 * rx, 0 },
+ { 0, -2 * ry },
+ },
+ .c = { -rx, ry },
+ };
+
+ // Map from aspect-normalized representation to canvas coords [-1,1]²
+ const float sx = params->unscaled ? (float) src_w / out_w : 1.0f;
+ const float sy = params->unscaled ? (float) src_h / out_h : 1.0f;
+ const pl_transform2x2 norm2canvas = {
+ .mat.m = {
+ { sx / rx, 0 },
+ { 0, sy / ry },
+ },
+ };
+
+ struct pl_transform2x2 transform = params->transform;
+ pl_transform2x2_mul(&transform, &tex2norm);
+ pl_transform2x2_rmul(&norm2canvas, &transform);
+
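+ // Optionally scale the transform down so that the bounding box of the
+ // transformed image still fits inside the [-1,1]² canvas (no-op if it
+ // already fits, since k is clamped to at least 2)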
+ if (params->constrain) {
+ pl_rect2df bb = pl_transform2x2_bounds(&transform, &(pl_rect2df) {
+ .x1 = 1, .y1 = 1,
+ });
+ const float k = fmaxf(fmaxf(pl_rect_w(bb), pl_rect_h(bb)), 2.0f);
+ pl_transform2x2_scale(&transform, 2.0f / k);
+ }
+
+ // Bind the canvas coordinates as [-1,1]², flipped vertically to correspond
+ // to normal mathematical axis conventions
+ static const pl_rect2df canvas = {
+ .x0 = -1.0f, .x1 = 1.0f,
+ .y0 = 1.0f, .y1 = -1.0f,
+ };
+
+ ident_t pos = sh_attr_vec2(sh, "pos", &canvas);
+ ident_t pt, tex = sh_bind(sh, src_tex, params->address_mode,
+ PL_TEX_SAMPLE_LINEAR, "tex", NULL, NULL, &pt);
+
+ // Bind the inverse of the tex2canvas transform (i.e. canvas2tex)
+ pl_transform2x2_invert(&transform);
+ ident_t tf = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_mat2("tf"),
+ .data = PL_TRANSPOSE_2X2(transform.mat.m),
+ });
+
+ ident_t tf_c = sh_var(sh, (struct pl_shader_var) {
+ .var = pl_var_vec2("tf_c"),
+ .data = transform.c,
+ });
+
+ // See pl_shader_sample_bicubic
+ sh_describe(sh, "distortion");
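+ // The bicubic path below folds the four per-axis cubic weights into two
+ // linearly-filtered fetches per axis, so the whole kernel needs only four
+ // texture taps (same trick as pl_shader_sample_bicubic)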
+#pragma GLSL /* pl_shader_sample_distort */ \
+ vec4 color; \
+ { \
+ vec2 pos = $tf * $pos + $tf_c; \
+ vec2 pt = $pt; \
+ @if (params->bicubic) { \
+ vec2 size = vec2(textureSize($tex, 0)); \
+ vec2 frac = fract(pos * size + vec2(0.5)); \
+ vec2 frac2 = frac * frac; \
+ vec2 inv = vec2(1.0) - frac; \
+ vec2 inv2 = inv * inv; \
+ vec2 w0 = 1.0/6.0 * inv2 * inv; \
+ vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \
+ vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \
+ vec2 w3 = 1.0/6.0 * frac2 * frac; \
+ vec4 g = vec4(w0 + w1, w2 + w3); \
+ vec4 h = vec4(w1, w3) / g + inv.xyxy; \
+ h.xy -= vec2(2.0); \
+ vec4 p = pos.xyxy + pt.xyxy * h; \
+ vec4 c00 = textureLod($tex, p.xy, 0.0); \
+ vec4 c01 = textureLod($tex, p.xw, 0.0); \
+ vec4 c0 = mix(c01, c00, g.y); \
+ vec4 c10 = textureLod($tex, p.zy, 0.0); \
+ vec4 c11 = textureLod($tex, p.zw, 0.0); \
+ vec4 c1 = mix(c11, c10, g.y); \
+ color = mix(c1, c0, g.x); \
+ @} else { \
+ color = texture($tex, pos); \
+ @} \
+ @if (params->alpha_mode) { \
+ vec2 border = min(pos, vec2(1.0) - pos); \
+ border = smoothstep(vec2(0.0), pt, border); \
+ @if (params->alpha_mode == PL_ALPHA_PREMULTIPLIED) \
+ color.rgba *= border.x * border.y; \
+ @else \
+ color.a *= border.x * border.y; \
+ @} \
+ }
+
+}
diff --git a/src/swapchain.c b/src/swapchain.c
new file mode 100644
index 0000000..2b9ed90
--- /dev/null
+++ b/src/swapchain.c
@@ -0,0 +1,92 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "log.h"
+#include "swapchain.h"
+
+void pl_swapchain_destroy(pl_swapchain *ptr)
+{
+ pl_swapchain sw = *ptr;
+ if (!sw)
+ return;
+
+ const struct pl_sw_fns *impl = PL_PRIV(sw);
+ impl->destroy(sw);
+ *ptr = NULL;
+}
+
+int pl_swapchain_latency(pl_swapchain sw)
+{
+ const struct pl_sw_fns *impl = PL_PRIV(sw);
+ if (!impl->latency)
+ return 0;
+
+ return impl->latency(sw);
+}
+
+bool pl_swapchain_resize(pl_swapchain sw, int *width, int *height)
+{
+ int dummy[2] = {0};
+ width = PL_DEF(width, &dummy[0]);
+ height = PL_DEF(height, &dummy[1]);
+
+ const struct pl_sw_fns *impl = PL_PRIV(sw);
+ if (!impl->resize) {
+ *width = *height = 0;
+ return true;
+ }
+
+ return impl->resize(sw, width, height);
+}
+
+void pl_swapchain_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp)
+{
+ const struct pl_sw_fns *impl = PL_PRIV(sw);
+ if (!impl->colorspace_hint)
+ return;
+
+ struct pl_color_space fix = {0};
+ if (csp) {
+ fix = *csp;
+ // Ensure we have valid values set for all the fields
+ pl_color_space_infer(&fix);
+ }
+
+ impl->colorspace_hint(sw, &fix);
+}
+
+bool pl_swapchain_start_frame(pl_swapchain sw,
+ struct pl_swapchain_frame *out_frame)
+{
+ *out_frame = (struct pl_swapchain_frame) {0}; // sanity
+
+ const struct pl_sw_fns *impl = PL_PRIV(sw);
+ return impl->start_frame(sw, out_frame);
+}
+
+bool pl_swapchain_submit_frame(pl_swapchain sw)
+{
+ const struct pl_sw_fns *impl = PL_PRIV(sw);
+ return impl->submit_frame(sw);
+}
+
+void pl_swapchain_swap_buffers(pl_swapchain sw)
+{
+ const struct pl_sw_fns *impl = PL_PRIV(sw);
+ impl->swap_buffers(sw);
+}
diff --git a/src/swapchain.h b/src/swapchain.h
new file mode 100644
index 0000000..934a2b9
--- /dev/null
+++ b/src/swapchain.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#include <libplacebo/swapchain.h>
+
+// This struct must be the first member of the swapchain's priv struct. The
+// `pl_swapchain` helpers will cast the priv struct to this struct!
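+//
+// A backend would typically embed it along these lines (illustrative sketch
+// only, names are hypothetical):
+//
+//     struct priv {
+//         struct pl_sw_fns impl; // must be the first member
+//         // ... backend-specific state ...
+//     };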
+
+#define SW_PFN(name) __typeof__(pl_swapchain_##name) *name
+struct pl_sw_fns {
+ // This destructor follows the same rules as `pl_gpu_fns`
+ void (*destroy)(pl_swapchain sw);
+
+ SW_PFN(latency); // optional
+ SW_PFN(resize); // optional
+ SW_PFN(colorspace_hint); // optional
+ SW_PFN(start_frame);
+ SW_PFN(submit_frame);
+ SW_PFN(swap_buffers);
+};
+#undef SW_PFN
diff --git a/src/tests/bench.c b/src/tests/bench.c
new file mode 100644
index 0000000..22638d8
--- /dev/null
+++ b/src/tests/bench.c
@@ -0,0 +1,550 @@
+#include "tests.h"
+
+#include <libplacebo/dispatch.h>
+#include <libplacebo/vulkan.h>
+#include <libplacebo/shaders/colorspace.h>
+#include <libplacebo/shaders/deinterlacing.h>
+#include <libplacebo/shaders/sampling.h>
+
+enum {
+ // Image configuration
+ NUM_TEX = 16,
+ WIDTH = 2048,
+ HEIGHT = 2048,
+ DEPTH = 16,
+ COMPS = 4,
+
+ // Queue configuration
+ NUM_QUEUES = NUM_TEX,
+ ASYNC_TX = 1,
+ ASYNC_COMP = 1,
+
+ // Test configuration
+ TEST_MS = 1000,
+ WARMUP_MS = 500,
+};
+
+static pl_tex create_test_img(pl_gpu gpu)
+{
+ pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, DEPTH, 32, PL_FMT_CAP_LINEAR);
+ REQUIRE(fmt);
+
+ const float xc = (WIDTH - 1) / 2.0f;
+ const float yc = (HEIGHT - 1) / 2.0f;
+ const float kf = 0.5f / sqrtf(xc * xc + yc * yc);
+ const float invphi = 0.61803398874989;
+ const float freqR = kf * M_PI * 0.2f;
+ const float freqG = freqR * invphi;
+ const float freqB = freqG * invphi;
+ float *data = malloc(WIDTH * HEIGHT * COMPS * sizeof(float));
+ for (int y = 0; y < HEIGHT; y++) {
+ for (int x = 0; x < WIDTH; x++) {
+ float *color = &data[(y * WIDTH + x) * COMPS];
+ float xx = x - xc, yy = y - yc;
+ float r2 = xx * xx + yy * yy;
+ switch (COMPS) {
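+ // Note: intentional fallthrough, fills every component from COMPS down to 1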
+ case 4: color[3] = 1.0;
+ case 3: color[2] = 0.5f * sinf(freqB * r2) + 0.5f;
+ case 2: color[1] = 0.5f * sinf(freqG * r2) + 0.5f;
+ case 1: color[0] = 0.5f * sinf(freqR * r2) + 0.5f;
+ }
+ }
+ }
+
+ pl_tex tex = pl_tex_create(gpu, pl_tex_params(
+ .format = fmt,
+ .w = WIDTH,
+ .h = HEIGHT,
+ .sampleable = true,
+ .initial_data = data,
+ ));
+
+ free(data);
+ REQUIRE(tex);
+ return tex;
+}
+
+struct bench {
+ void (*run_sh)(pl_shader sh, pl_shader_obj *state,
+ pl_tex src);
+
+ void (*run_tex)(pl_gpu gpu, pl_tex tex);
+};
+
+static void run_bench(pl_gpu gpu, pl_dispatch dp,
+ pl_shader_obj *state, pl_tex src,
+ pl_tex fbo, pl_timer timer,
+ const struct bench *bench)
+{
+ REQUIRE(bench);
+ REQUIRE(bench->run_sh || bench->run_tex);
+ if (bench->run_sh) {
+ pl_shader sh = pl_dispatch_begin(dp);
+ bench->run_sh(sh, state, src);
+
+ pl_dispatch_finish(dp, pl_dispatch_params(
+ .shader = &sh,
+ .target = fbo,
+ .timer = timer,
+ ));
+ } else {
+ bench->run_tex(gpu, fbo);
+ }
+}
+
+static void benchmark(pl_gpu gpu, const char *name,
+ const struct bench *bench)
+{
+ pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
+ REQUIRE(dp);
+ pl_shader_obj state = NULL;
+ pl_tex src = create_test_img(gpu);
+
+ // Create the FBOs
+ pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, DEPTH, 32,
+ PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE);
+ REQUIRE(fmt);
+
+ pl_tex fbos[NUM_TEX] = {0};
+ for (int i = 0; i < NUM_TEX; i++) {
+ fbos[i] = pl_tex_create(gpu, pl_tex_params(
+ .format = fmt,
+ .w = WIDTH,
+ .h = HEIGHT,
+ .renderable = true,
+ .blit_dst = true,
+ .host_writable = true,
+ .host_readable = true,
+ .storable = !!(fmt->caps & PL_FMT_CAP_STORABLE),
+ ));
+ REQUIRE(fbos[i]);
+
+ pl_tex_clear(gpu, fbos[i], (float[4]){ 0.0 });
+ }
+
+ // Run the benchmark and flush+block once to force shader compilation etc.
+ run_bench(gpu, dp, &state, src, fbos[0], NULL, bench);
+ pl_gpu_finish(gpu);
+
+ // Perform the actual benchmark
+ pl_clock_t start_warmup = 0, start_test = 0;
+ unsigned long frames = 0, frames_warmup = 0;
+
+ pl_timer timer = pl_timer_create(gpu);
+ uint64_t gputime_total = 0;
+ unsigned long gputime_count = 0;
+ uint64_t gputime;
+
+ start_warmup = pl_clock_now();
+ do {
+ const int idx = frames % NUM_TEX;
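+ // Wait until this FBO is no longer in flight, so at most NUM_TEX frames
+ // are ever queued at once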
+ while (pl_tex_poll(gpu, fbos[idx], UINT64_MAX))
+ ; // do nothing
+ run_bench(gpu, dp, &state, src, fbos[idx], start_test ? timer : NULL, bench);
+ pl_gpu_flush(gpu);
+ frames++;
+
+ if (start_test) {
+ while ((gputime = pl_timer_query(gpu, timer))) {
+ gputime_total += gputime;
+ gputime_count++;
+ }
+ }
+
+ pl_clock_t now = pl_clock_now();
+ if (start_test) {
+ if (pl_clock_diff(now, start_test) > TEST_MS * 1e-3)
+ break;
+ } else if (pl_clock_diff(now, start_warmup) > WARMUP_MS * 1e-3) {
+ start_test = now;
+ frames_warmup = frames;
+ }
+ } while (true);
+
+ // Force the GPU to finish execution and re-measure the final stop time
+ pl_gpu_finish(gpu);
+
+ pl_clock_t stop = pl_clock_now();
+ while ((gputime = pl_timer_query(gpu, timer))) {
+ gputime_total += gputime;
+ gputime_count++;
+ }
+
+ frames -= frames_warmup;
+ double secs = pl_clock_diff(stop, start_test);
+ printf("'%s':\t%4lu frames in %1.6f seconds => %2.6f ms/frame (%5.2f FPS)",
+ name, frames, secs, 1000 * secs / frames, frames / secs);
+ if (gputime_count)
+ printf(", gpu time: %2.6f ms", 1e-6 * gputime_total / gputime_count);
+ printf("\n");
+
+ pl_timer_destroy(gpu, &timer);
+ pl_shader_obj_destroy(&state);
+ pl_dispatch_destroy(&dp);
+ pl_tex_destroy(gpu, &src);
+ for (int i = 0; i < NUM_TEX; i++)
+ pl_tex_destroy(gpu, &fbos[i]);
+}
+
+// List of benchmarks
+static void bench_deband(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ pl_shader_deband(sh, pl_sample_src( .tex = src ), NULL);
+}
+
+static void bench_deband_heavy(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params(
+ .iterations = 4,
+ .threshold = 4.0,
+ .radius = 4.0,
+ .grain = 16.0,
+ ));
+}
+
+static void bench_bilinear(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_bilinear(sh, pl_sample_src( .tex = src )));
+}
+
+static void bench_bicubic(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_bicubic(sh, pl_sample_src( .tex = src )));
+}
+
+static void bench_hermite(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_hermite(sh, pl_sample_src( .tex = src )));
+}
+
+static void bench_gaussian(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_gaussian(sh, pl_sample_src( .tex = src )));
+}
+
+static void bench_dither_blue(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ pl_shader_dither(sh, 8, state, pl_dither_params(
+ .method = PL_DITHER_BLUE_NOISE,
+ ));
+}
+
+static void bench_dither_white(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ pl_shader_dither(sh, 8, state, pl_dither_params(
+ .method = PL_DITHER_WHITE_NOISE,
+ ));
+}
+
+static void bench_dither_ordered_fix(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ pl_shader_dither(sh, 8, state, pl_dither_params(
+ .method = PL_DITHER_ORDERED_FIXED,
+ ));
+}
+
+static void bench_polar(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_sample_filter_params params = {
+ .filter = pl_filter_ewa_lanczos,
+ .lut = state,
+ };
+
+ REQUIRE(pl_shader_sample_polar(sh, pl_sample_src( .tex = src ), &params));
+}
+
+static void bench_polar_nocompute(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_sample_filter_params params = {
+ .filter = pl_filter_ewa_lanczos,
+ .no_compute = true,
+ .lut = state,
+ };
+
+ REQUIRE(pl_shader_sample_polar(sh, pl_sample_src( .tex = src ), &params));
+}
+
+static void bench_hdr_peak(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ REQUIRE(pl_shader_detect_peak(sh, pl_color_space_hdr10, state, &pl_peak_detect_default_params));
+}
+
+static void bench_hdr_peak_hq(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ REQUIRE(pl_shader_detect_peak(sh, pl_color_space_hdr10, state, &pl_peak_detect_high_quality_params));
+}
+
+static void bench_hdr_lut(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_color_map_params params = {
+ PL_COLOR_MAP_DEFAULTS
+ .tone_mapping_function = &pl_tone_map_bt2390,
+ .tone_mapping_mode = PL_TONE_MAP_RGB,
+ };
+
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ pl_shader_color_map_ex(sh, &params, pl_color_map_args(
+ .src = pl_color_space_hdr10,
+ .dst = pl_color_space_monitor,
+ .state = state,
+ ));
+}
+
+static void bench_hdr_clip(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_color_map_params params = {
+ PL_COLOR_MAP_DEFAULTS
+ .tone_mapping_function = &pl_tone_map_clip,
+ .tone_mapping_mode = PL_TONE_MAP_RGB,
+ };
+
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ pl_shader_color_map_ex(sh, &params, pl_color_map_args(
+ .src = pl_color_space_hdr10,
+ .dst = pl_color_space_monitor,
+ .state = state,
+ ));
+}
+
+static void bench_weave(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_deinterlace_source dsrc = {
+ .cur = pl_field_pair(src),
+ .field = PL_FIELD_TOP,
+ };
+
+ pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params(
+ .algo = PL_DEINTERLACE_WEAVE,
+ ));
+}
+
+static void bench_bob(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_deinterlace_source dsrc = {
+ .cur = pl_field_pair(src),
+ .field = PL_FIELD_TOP,
+ };
+
+ pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params(
+ .algo = PL_DEINTERLACE_BOB,
+ ));
+}
+
+static void bench_yadif(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_deinterlace_source dsrc = {
+ .prev = pl_field_pair(src),
+ .cur = pl_field_pair(src),
+ .next = pl_field_pair(src),
+ .field = PL_FIELD_TOP,
+ };
+
+ pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params(
+ .algo = PL_DEINTERLACE_YADIF,
+ ));
+}
+
+static void bench_av1_grain(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_film_grain_params params = {
+ .data = {
+ .type = PL_FILM_GRAIN_AV1,
+ .params.av1 = av1_grain_data,
+ .seed = rand(),
+ },
+ .tex = src,
+ .components = 3,
+ .component_mapping = {0, 1, 2},
+ .repr = &(struct pl_color_repr) {0},
+ };
+
+ REQUIRE(pl_shader_film_grain(sh, state, &params));
+}
+
+static void bench_av1_grain_lap(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_film_grain_params params = {
+ .data = {
+ .type = PL_FILM_GRAIN_AV1,
+ .params.av1 = av1_grain_data,
+ .seed = rand(),
+ },
+ .tex = src,
+ .components = 3,
+ .component_mapping = {0, 1, 2},
+ .repr = &(struct pl_color_repr) {0},
+ };
+
+ params.data.params.av1.overlap = true;
+ REQUIRE(pl_shader_film_grain(sh, state, &params));
+}
+
+static void bench_h274_grain(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ struct pl_film_grain_params params = {
+ .data = {
+ .type = PL_FILM_GRAIN_H274,
+ .params.h274 = h274_grain_data,
+ .seed = rand(),
+ },
+ .tex = src,
+ .components = 3,
+ .component_mapping = {0, 1, 2},
+ .repr = &(struct pl_color_repr) {0},
+ };
+
+ REQUIRE(pl_shader_film_grain(sh, state, &params));
+}
+
+static void bench_reshape_poly(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ pl_shader_dovi_reshape(sh, &(struct pl_dovi_metadata) { .comp = {
+ {
+ .num_pivots = 8,
+ .pivots = {0.0, 0.00488758553, 0.0420332365, 0.177908108,
+ 0.428152502, 0.678396881, 0.92864126, 1.0},
+ .method = {0, 0, 0, 0, 0, 0, 0},
+ .poly_coeffs = {
+ {0.00290930271, 2.30019712, 50.1446037},
+ {0.00725257397, 1.88119054, -4.49443769},
+ {0.0150123835, 1.61106598, -1.64833081},
+ {0.0498571396, 1.2059114, -0.430627108},
+ {0.0878019333, 1.01845241, -0.19669354},
+ {0.120447636, 0.920134187, -0.122338772},
+ {2.12430835, -3.30913281, 2.10893941},
+ },
+ }, {
+ .num_pivots = 2,
+ .pivots = {0.0, 1.0},
+ .method = {0},
+ .poly_coeffs = {{-0.397901177, 1.85908031, 0}},
+ }, {
+ .num_pivots = 2,
+ .pivots = {0.0, 1.0},
+ .method = {0},
+ .poly_coeffs = {{-0.399355531, 1.85591626, 0}},
+ },
+ }});
+}
+
+static void bench_reshape_mmr(pl_shader sh, pl_shader_obj *state, pl_tex src)
+{
+ REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src )));
+ pl_shader_dovi_reshape(sh, &dovi_meta); // this includes MMR
+}
+
+static float data[WIDTH * HEIGHT * COMPS + 8192];
+
+static void bench_download(pl_gpu gpu, pl_tex tex)
+{
+ REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
+ .tex = tex,
+ .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
+ )));
+}
+
+static void bench_upload(pl_gpu gpu, pl_tex tex)
+{
+ REQUIRE(pl_tex_upload(gpu, pl_tex_transfer_params(
+ .tex = tex,
+ .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
+ )));
+}
+
+static void dummy_cb(void *arg) {}
+
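+// Registering a (no-op) completion callback requests an asynchronous
+// transfer, so these variants exercise the async upload/download path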
+static void bench_download_async(pl_gpu gpu, pl_tex tex)
+{
+ REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
+ .tex = tex,
+ .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
+ .callback = dummy_cb,
+ )));
+}
+
+static void bench_upload_async(pl_gpu gpu, pl_tex tex)
+{
+ REQUIRE(pl_tex_upload(gpu, pl_tex_transfer_params(
+ .tex = tex,
+ .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096),
+ .callback = dummy_cb,
+ )));
+}
+
+int main()
+{
+ setbuf(stdout, NULL);
+ setbuf(stderr, NULL);
+
+ pl_log log = pl_log_create(PL_API_VER, pl_log_params(
+ .log_cb = isatty(fileno(stdout)) ? pl_log_color : pl_log_simple,
+ .log_level = PL_LOG_WARN,
+ ));
+
+ pl_vulkan vk = pl_vulkan_create(log, pl_vulkan_params(
+ .allow_software = true,
+ .async_transfer = ASYNC_TX,
+ .async_compute = ASYNC_COMP,
+ .queue_count = NUM_QUEUES,
+ ));
+
+ if (!vk)
+ return SKIP;
+
+#define BENCH_SH(fn) &(struct bench) { .run_sh = fn }
+#define BENCH_TEX(fn) &(struct bench) { .run_tex = fn }
+
+ printf("= Running benchmarks =\n");
+ benchmark(vk->gpu, "tex_download ptr", BENCH_TEX(bench_download));
+ benchmark(vk->gpu, "tex_download ptr async", BENCH_TEX(bench_download_async));
+ benchmark(vk->gpu, "tex_upload ptr", BENCH_TEX(bench_upload));
+ benchmark(vk->gpu, "tex_upload ptr async", BENCH_TEX(bench_upload_async));
+ benchmark(vk->gpu, "bilinear", BENCH_SH(bench_bilinear));
+ benchmark(vk->gpu, "bicubic", BENCH_SH(bench_bicubic));
+ benchmark(vk->gpu, "hermite", BENCH_SH(bench_hermite));
+ benchmark(vk->gpu, "gaussian", BENCH_SH(bench_gaussian));
+ benchmark(vk->gpu, "deband", BENCH_SH(bench_deband));
+ benchmark(vk->gpu, "deband_heavy", BENCH_SH(bench_deband_heavy));
+
+ // Deinterlacing
+ benchmark(vk->gpu, "weave", BENCH_SH(bench_weave));
+ benchmark(vk->gpu, "bob", BENCH_SH(bench_bob));
+ benchmark(vk->gpu, "yadif", BENCH_SH(bench_yadif));
+
+ // Polar sampling
+ benchmark(vk->gpu, "polar", BENCH_SH(bench_polar));
+ if (vk->gpu->glsl.compute)
+ benchmark(vk->gpu, "polar_nocompute", BENCH_SH(bench_polar_nocompute));
+
+ // Dithering algorithms
+ benchmark(vk->gpu, "dither_blue", BENCH_SH(bench_dither_blue));
+ benchmark(vk->gpu, "dither_white", BENCH_SH(bench_dither_white));
+ benchmark(vk->gpu, "dither_ordered_fixed", BENCH_SH(bench_dither_ordered_fix));
+
+ // HDR peak detection
+ if (vk->gpu->glsl.compute) {
+ benchmark(vk->gpu, "hdr_peakdetect", BENCH_SH(bench_hdr_peak));
+ benchmark(vk->gpu, "hdr_peakdetect_hq", BENCH_SH(bench_hdr_peak_hq));
+ }
+
+ // Tone mapping
+ benchmark(vk->gpu, "hdr_lut", BENCH_SH(bench_hdr_lut));
+ benchmark(vk->gpu, "hdr_clip", BENCH_SH(bench_hdr_clip));
+
+ // Misc stuff
+ benchmark(vk->gpu, "av1_grain", BENCH_SH(bench_av1_grain));
+ benchmark(vk->gpu, "av1_grain_lap", BENCH_SH(bench_av1_grain_lap));
+ benchmark(vk->gpu, "h274_grain", BENCH_SH(bench_h274_grain));
+ benchmark(vk->gpu, "reshape_poly", BENCH_SH(bench_reshape_poly));
+ benchmark(vk->gpu, "reshape_mmr", BENCH_SH(bench_reshape_mmr));
+
+ pl_vulkan_destroy(&vk);
+ pl_log_destroy(&log);
+ return 0;
+}
diff --git a/src/tests/cache.c b/src/tests/cache.c
new file mode 100644
index 0000000..667435d
--- /dev/null
+++ b/src/tests/cache.c
@@ -0,0 +1,215 @@
+#include "tests.h"
+
+#include <libplacebo/cache.h>
+
+// Returns "foo" for even keys, "bar" for odd
+static pl_cache_obj lookup_foobar(void *priv, uint64_t key)
+{
+ return (pl_cache_obj) {
+ .key = 0xFFFF, // test key sanity
+ .data = (key & 1) ? "bar" : "foo",
+ .size = 3,
+ };
+}
+
+static void update_count(void *priv, pl_cache_obj obj)
+{
+ int *count = priv;
+ *count += obj.size ? 1 : -1;
+}
+
+enum {
+ KEY1 = 0x9c65575f419288f5,
+ KEY2 = 0x92da969be9b88086,
+ KEY3 = 0x7fcb62540b00bc8b,
+ KEY4 = 0x46c60ec11af9dde3,
+ KEY5 = 0xcb6760b98ece2477,
+ KEY6 = 0xf37dc72b7f9e5c88,
+ KEY7 = 0x30c18c962d82e5f5,
+};
+
+int main()
+{
+ pl_log log = pl_test_logger();
+ pl_cache test = pl_cache_create(pl_cache_params(
+ .log = log,
+ .max_object_size = 16,
+ .max_total_size = 32,
+ ));
+
+ pl_cache_obj obj1 = { .key = KEY1, .data = "abc", .size = 3 };
+ pl_cache_obj obj2 = { .key = KEY2, .data = "de", .size = 2 };
+ pl_cache_obj obj3 = { .key = KEY3, .data = "xyzw", .size = 4 };
+
+ REQUIRE(pl_cache_try_set(test, &obj1));
+ REQUIRE(pl_cache_try_set(test, &obj2));
+ REQUIRE(pl_cache_try_set(test, &obj3));
+ REQUIRE_CMP(pl_cache_size(test), ==, 9, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d");
+ REQUIRE(pl_cache_try_set(test, &obj2)); // delete KEY2
+ REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d");
+
+ REQUIRE(pl_cache_get(test, &obj1));
+ REQUIRE(!pl_cache_get(test, &obj2));
+ REQUIRE(pl_cache_get(test, &obj3));
+ REQUIRE_CMP(pl_cache_size(test), ==, 0, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 0, "d");
+ REQUIRE_MEMEQ(obj1.data, "abc", 3);
+ REQUIRE_MEMEQ(obj3.data, "xyzw", 4);
+
+ // Re-insert removed objects (in reversed order)
+ REQUIRE(pl_cache_try_set(test, &obj3));
+ REQUIRE(pl_cache_try_set(test, &obj1));
+ REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d");
+
+ uint8_t ref[72];
+ memset(ref, 0xbe, sizeof(ref));
+ uint8_t *refp = ref;
+
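+ // Hand-construct the expected serialized cache contents: every field is
+ // written and then zero-padded to 4-byte alignment, mirroring the layout
+ // produced by pl_cache_save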
+#define PAD_ALIGN(x) PL_ALIGN2(x, sizeof(uint32_t))
+#define W(type, ...) \
+ do { \
+ size_t sz = sizeof((type){__VA_ARGS__}); \
+ pl_assert(ref + sizeof(ref) - refp >= sz); \
+ memcpy(refp, &(type){__VA_ARGS__}, sz); \
+ refp += sz; \
+ size_t pad_sz = PAD_ALIGN(sz) - sz; \
+ pl_assert(ref + sizeof(ref) - refp >= pad_sz); \
+ memcpy(refp, &(char[PAD_ALIGN(1)]){0}, pad_sz); \
+ refp += pad_sz; \
+ } while (0)
+
+ W(char[], 'p', 'l', '_', 'c', 'a', 'c', 'h', 'e'); // cache magic
+ W(uint32_t, 1); // cache version
+ W(uint32_t, 2); // number of objects
+
+ // object 3
+ W(uint64_t, KEY3); // key
+ W(uint64_t, 4); // size
+#ifdef PL_HAVE_XXHASH
+ W(uint64_t, 0xd43612ef3fbee8be); // hash
+#else
+ W(uint64_t, 0xec18884e5e471117); // hash
+#endif
+ W(char[], 'x', 'y', 'z', 'w'); // data
+
+ // object 1
+ W(uint64_t, KEY1); // key
+ W(uint64_t, 3); // size
+#ifdef PL_HAVE_XXHASH
+ W(uint64_t, 0x78af5f94892f3950); // hash
+#else
+ W(uint64_t, 0x3a204d408a2e2d77); // hash
+#endif
+ W(char[], 'a', 'b', 'c'); // data
+
+#undef W
+#undef PAD_ALIGN
+
+ uint8_t data[100];
+ pl_static_assert(sizeof(data) >= sizeof(ref));
+ REQUIRE_CMP(pl_cache_save(test, data, sizeof(data)), ==, sizeof(ref), "zu");
+ REQUIRE_MEMEQ(data, ref, sizeof(ref));
+
+ pl_cache test2 = pl_cache_create(pl_cache_params( .log = log ));
+ REQUIRE_CMP(pl_cache_load(test2, data, sizeof(data)), ==, 2, "d");
+ REQUIRE_CMP(pl_cache_size(test2), ==, 7, "zu");
+ REQUIRE_CMP(pl_cache_save(test2, NULL, 0), ==, sizeof(ref), "zu");
+ REQUIRE_CMP(pl_cache_save(test2, data, sizeof(data)), ==, sizeof(ref), "zu");
+ REQUIRE_MEMEQ(data, ref, sizeof(ref));
+
+ // Test loading invalid data
+ REQUIRE_CMP(pl_cache_load(test2, ref, 0), <, 0, "d"); // empty file
+ REQUIRE_CMP(pl_cache_load(test2, ref, 5), <, 0, "d"); // truncated header
+ REQUIRE_CMP(pl_cache_load(test2, ref, 64), ==, 1, "d"); // truncated object data
+ data[sizeof(ref) - 2] = 'X'; // corrupt data
+ REQUIRE_CMP(pl_cache_load(test2, data, sizeof(ref)), ==, 1, "d"); // bad checksum
+ pl_cache_destroy(&test2);
+
+ // Inserting too large object should fail
+ uint8_t zero[32] = {0};
+ pl_cache_obj obj4 = { .key = KEY4, .data = zero, .size = 32 };
+ REQUIRE(!pl_cache_try_set(test, &obj4));
+ REQUIRE(!pl_cache_get(test, &obj4));
+ REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d");
+
+ // Inserting 16-byte object should succeed, and not purge old entries
+ obj4 = (pl_cache_obj) { .key = KEY4, .data = zero, .size = 16 };
+ REQUIRE(pl_cache_try_set(test, &obj4));
+ REQUIRE_CMP(pl_cache_size(test), ==, 23, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d");
+ REQUIRE(pl_cache_get(test, &obj1));
+ REQUIRE(pl_cache_get(test, &obj3));
+ REQUIRE(pl_cache_get(test, &obj4));
+ pl_cache_set(test, &obj1);
+ pl_cache_set(test, &obj3);
+ pl_cache_set(test, &obj4);
+ REQUIRE_CMP(pl_cache_size(test), ==, 23, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d");
+
+ // Inserting another 10-byte object should purge entry KEY1
+ pl_cache_obj obj5 = { .key = KEY5, .data = zero, .size = 10 };
+ REQUIRE(pl_cache_try_set(test, &obj5));
+ REQUIRE_CMP(pl_cache_size(test), ==, 30, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d");
+ REQUIRE(!pl_cache_get(test, &obj1));
+ REQUIRE(pl_cache_get(test, &obj3));
+ REQUIRE(pl_cache_get(test, &obj4));
+ REQUIRE(pl_cache_get(test, &obj5));
+ pl_cache_set(test, &obj3);
+ pl_cache_set(test, &obj4);
+ pl_cache_set(test, &obj5);
+ REQUIRE_CMP(pl_cache_size(test), ==, 30, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d");
+
+ // Inserting final 6-byte object should purge entry KEY3
+ pl_cache_obj obj6 = { .key = KEY6, .data = zero, .size = 6 };
+ REQUIRE(pl_cache_try_set(test, &obj6));
+ REQUIRE_CMP(pl_cache_size(test), ==, 32, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d");
+ REQUIRE(!pl_cache_get(test, &obj3));
+ REQUIRE(pl_cache_get(test, &obj4));
+ REQUIRE(pl_cache_get(test, &obj5));
+ REQUIRE(pl_cache_get(test, &obj6));
+ REQUIRE_CMP(pl_cache_size(test), ==, 0, "zu");
+ REQUIRE_CMP(pl_cache_objects(test), ==, 0, "d");
+ pl_cache_obj_free(&obj4);
+ pl_cache_obj_free(&obj5);
+ pl_cache_obj_free(&obj6);
+
+ // Test callback API
+ int num_objects = 0;
+ test2 = pl_cache_create(pl_cache_params(
+ .get = lookup_foobar,
+ .set = update_count,
+ .priv = &num_objects,
+ ));
+
+ REQUIRE(pl_cache_get(test2, &obj1));
+ REQUIRE_CMP(obj1.key, ==, KEY1, PRIu64);
+ REQUIRE_CMP(obj1.size, ==, 3, "zu");
+ REQUIRE_MEMEQ(obj1.data, "bar", 3);
+ REQUIRE(pl_cache_get(test2, &obj2));
+ REQUIRE_CMP(obj2.key, ==, KEY2, PRIu64);
+ REQUIRE_CMP(obj2.size, ==, 3, "zu");
+ REQUIRE_MEMEQ(obj2.data, "foo", 3);
+ REQUIRE_CMP(pl_cache_objects(test2), ==, 0, "d");
+ REQUIRE_CMP(num_objects, ==, 0, "d");
+ REQUIRE(pl_cache_try_set(test2, &obj1));
+ REQUIRE(pl_cache_try_set(test2, &obj2));
+ REQUIRE(pl_cache_try_set(test2, &(pl_cache_obj) { .key = KEY7, .data = "abcde", .size = 5 }));
+ REQUIRE_CMP(pl_cache_objects(test2), ==, 3, "d");
+ REQUIRE_CMP(num_objects, ==, 3, "d");
+ REQUIRE(pl_cache_try_set(test2, &obj1));
+ REQUIRE(pl_cache_try_set(test2, &obj2));
+ REQUIRE_CMP(pl_cache_objects(test2), ==, 1, "d");
+ REQUIRE_CMP(num_objects, ==, 1, "d");
+ pl_cache_destroy(&test2);
+
+ pl_cache_destroy(&test);
+ pl_log_destroy(&log);
+ return 0;
+}
diff --git a/src/tests/colorspace.c b/src/tests/colorspace.c
new file mode 100644
index 0000000..4b0662b
--- /dev/null
+++ b/src/tests/colorspace.c
@@ -0,0 +1,488 @@
+#include "tests.h"
+
+int main()
+{
+ for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) {
+ bool ycbcr = sys >= PL_COLOR_SYSTEM_BT_601 && sys <= PL_COLOR_SYSTEM_YCGCO;
+ REQUIRE_CMP(ycbcr, ==, pl_color_system_is_ycbcr_like(sys), "d");
+ }
+
+ for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) {
+ bool hdr = trc >= PL_COLOR_TRC_PQ && trc <= PL_COLOR_TRC_S_LOG2;
+ REQUIRE_CMP(hdr, ==, pl_color_transfer_is_hdr(trc), "d");
+ REQUIRE_CMP(pl_color_transfer_nominal_peak(trc), >=, 1.0, "f");
+ }
+
+ float pq_peak = pl_color_transfer_nominal_peak(PL_COLOR_TRC_PQ);
+ REQUIRE_FEQ(PL_COLOR_SDR_WHITE * pq_peak, 10000, 1e-7);
+
+ struct pl_color_repr tv_repr = {
+ .sys = PL_COLOR_SYSTEM_BT_709,
+ .levels = PL_COLOR_LEVELS_LIMITED,
+ };
+
+ struct pl_color_repr pc_repr = {
+ .sys = PL_COLOR_SYSTEM_RGB,
+ .levels = PL_COLOR_LEVELS_FULL,
+ };
+
+ // Ensure this is a no-op for bits == bits
+ for (int bits = 1; bits <= 16; bits++) {
+ tv_repr.bits.color_depth = tv_repr.bits.sample_depth = bits;
+ pc_repr.bits.color_depth = pc_repr.bits.sample_depth = bits;
+ REQUIRE_FEQ(pl_color_repr_normalize(&tv_repr), 1.0, 1e-7);
+ REQUIRE_FEQ(pl_color_repr_normalize(&pc_repr), 1.0, 1e-7);
+ }
+
+ tv_repr.bits.color_depth = 8;
+ tv_repr.bits.sample_depth = 10;
+ float tv8to10 = pl_color_repr_normalize(&tv_repr);
+
+ tv_repr.bits.color_depth = 8;
+ tv_repr.bits.sample_depth = 12;
+ float tv8to12 = pl_color_repr_normalize(&tv_repr);
+
+ // Simulate the effect of GPU texture sampling on UNORM texture
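+ // (the asserts below imply tv8to10 == 4 and tv8to12 == 16, i.e. simply
+ // 2^(sample_depth - color_depth))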
+ REQUIRE_FEQ(tv8to10 * 16 /1023., 64/1023., 1e-7); // black
+ REQUIRE_FEQ(tv8to10 * 235/1023., 940/1023., 1e-7); // nominal white
+ REQUIRE_FEQ(tv8to10 * 128/1023., 512/1023., 1e-7); // achromatic
+ REQUIRE_FEQ(tv8to10 * 240/1023., 960/1023., 1e-7); // nominal chroma peak
+
+ REQUIRE_FEQ(tv8to12 * 16 /4095., 256 /4095., 1e-7); // black
+ REQUIRE_FEQ(tv8to12 * 235/4095., 3760/4095., 1e-7); // nominal white
+ REQUIRE_FEQ(tv8to12 * 128/4095., 2048/4095., 1e-7); // achromatic
+ REQUIRE_FEQ(tv8to12 * 240/4095., 3840/4095., 1e-7); // nominal chroma peak
+
+ // Ensure lavc's xyz12 is handled correctly
+ struct pl_color_repr xyz12 = {
+ .sys = PL_COLOR_SYSTEM_XYZ,
+ .levels = PL_COLOR_LEVELS_UNKNOWN,
+ .bits = {
+ .sample_depth = 16,
+ .color_depth = 12,
+ .bit_shift = 4,
+ },
+ };
+
+ float xyz = pl_color_repr_normalize(&xyz12);
+ REQUIRE_FEQ(xyz * (4095 << 4), 65535, 1e-7);
+
+ // Assume we uploaded a 10-bit source directly (unshifted) as a 16-bit
+ // texture. This texture multiplication factor should make it behave as if
+ // it had been uploaded as a 10-bit texture instead.
+ pc_repr.bits.color_depth = 10;
+ pc_repr.bits.sample_depth = 16;
+ float pc10to16 = pl_color_repr_normalize(&pc_repr);
+ REQUIRE_FEQ(pc10to16 * 1000/65535., 1000/1023., 1e-7);
+
+ const struct pl_raw_primaries *bt709, *bt2020, *dcip3;
+ bt709 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_709);
+ bt2020 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020);
+ dcip3 = pl_raw_primaries_get(PL_COLOR_PRIM_DCI_P3);
+ REQUIRE(pl_primaries_superset(bt2020, bt709));
+ REQUIRE(!pl_primaries_superset(bt2020, dcip3)); // small region doesn't overlap
+ REQUIRE(pl_primaries_superset(dcip3, bt709));
+ REQUIRE(!pl_primaries_superset(bt709, bt2020));
+ REQUIRE(pl_primaries_compatible(bt2020, bt2020));
+ REQUIRE(pl_primaries_compatible(bt2020, bt709));
+ REQUIRE(pl_primaries_compatible(bt709, bt2020));
+ REQUIRE(pl_primaries_compatible(bt2020, dcip3));
+ REQUIRE(pl_primaries_compatible(bt709, dcip3));
+
+ struct pl_raw_primaries bt709_2020 = pl_primaries_clip(bt709, bt2020);
+ struct pl_raw_primaries bt2020_709 = pl_primaries_clip(bt2020, bt709);
+ REQUIRE(pl_raw_primaries_similar(&bt709_2020, bt709));
+ REQUIRE(pl_raw_primaries_similar(&bt2020_709, bt709));
+
+ struct pl_raw_primaries dcip3_bt2020 = pl_primaries_clip(dcip3, bt2020);
+ struct pl_raw_primaries dcip3_bt709 = pl_primaries_clip(dcip3, bt709);
+ REQUIRE(pl_primaries_superset(dcip3, &dcip3_bt2020));
+ REQUIRE(pl_primaries_superset(dcip3, &dcip3_bt709));
+ REQUIRE(pl_primaries_superset(bt2020, &dcip3_bt2020));
+ REQUIRE(pl_primaries_superset(bt709, &dcip3_bt709));
+
+ pl_matrix3x3 rgb2xyz, rgb2xyz_;
+ rgb2xyz = rgb2xyz_ = pl_get_rgb2xyz_matrix(bt709);
+ pl_matrix3x3_invert(&rgb2xyz_);
+ pl_matrix3x3_invert(&rgb2xyz_);
+
+ // Make sure the double-inversion round trips
+ for (int y = 0; y < 3; y++) {
+ for (int x = 0; x < 3; x++)
+ REQUIRE_FEQ(rgb2xyz.m[y][x], rgb2xyz_.m[y][x], 1e-6);
+ }
+
+ // Make sure mapping the spectral RGB colors (i.e. the matrix rows) matches
+ // our original primaries
+ float Y = rgb2xyz.m[1][0];
+ REQUIRE_FEQ(rgb2xyz.m[0][0], pl_cie_X(bt709->red) * Y, 1e-7);
+ REQUIRE_FEQ(rgb2xyz.m[2][0], pl_cie_Z(bt709->red) * Y, 1e-7);
+ Y = rgb2xyz.m[1][1];
+ REQUIRE_FEQ(rgb2xyz.m[0][1], pl_cie_X(bt709->green) * Y, 1e-7);
+ REQUIRE_FEQ(rgb2xyz.m[2][1], pl_cie_Z(bt709->green) * Y, 1e-7);
+ Y = rgb2xyz.m[1][2];
+ REQUIRE_FEQ(rgb2xyz.m[0][2], pl_cie_X(bt709->blue) * Y, 1e-7);
+ REQUIRE_FEQ(rgb2xyz.m[2][2], pl_cie_Z(bt709->blue) * Y, 1e-7);
+
+ // Make sure the gamut mapping round-trips
+ pl_matrix3x3 bt709_bt2020, bt2020_bt709;
+ bt709_bt2020 = pl_get_color_mapping_matrix(bt709, bt2020, PL_INTENT_RELATIVE_COLORIMETRIC);
+ bt2020_bt709 = pl_get_color_mapping_matrix(bt2020, bt709, PL_INTENT_RELATIVE_COLORIMETRIC);
+ for (int n = 0; n < 10; n++) {
+ float vec[3] = { RANDOM, RANDOM, RANDOM };
+ float dst[3] = { vec[0], vec[1], vec[2] };
+ pl_matrix3x3_apply(&bt709_bt2020, dst);
+ pl_matrix3x3_apply(&bt2020_bt709, dst);
+ for (int i = 0; i < 3; i++)
+ REQUIRE_FEQ(dst[i], vec[i], 1e-6);
+ }
+
+ // Ensure the decoding matrix round-trips to white/black
+ for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) {
+ if (!pl_color_system_is_linear(sys))
+ continue;
+
+ printf("testing color system %u\n", (unsigned) sys);
+ struct pl_color_repr repr = {
+ .levels = PL_COLOR_LEVELS_LIMITED,
+ .sys = sys,
+ .bits = {
+ // synthetic test
+ .color_depth = 8,
+ .sample_depth = 10,
+ },
+ };
+
+ float scale = pl_color_repr_normalize(&repr);
+ pl_transform3x3 yuv2rgb = pl_color_repr_decode(&repr, NULL);
+ pl_matrix3x3_scale(&yuv2rgb.mat, scale);
+
+ static const float white_ycbcr[3] = { 235/1023., 128/1023., 128/1023. };
+ static const float black_ycbcr[3] = { 16/1023., 128/1023., 128/1023. };
+ static const float white_other[3] = { 235/1023., 235/1023., 235/1023. };
+ static const float black_other[3] = { 16/1023., 16/1023., 16/1023. };
+
+ float white[3], black[3];
+ for (int i = 0; i < 3; i++) {
+ if (pl_color_system_is_ycbcr_like(sys)) {
+ white[i] = white_ycbcr[i];
+ black[i] = black_ycbcr[i];
+ } else {
+ white[i] = white_other[i];
+ black[i] = black_other[i];
+ }
+ }
+
+ pl_transform3x3_apply(&yuv2rgb, white);
+ REQUIRE_FEQ(white[0], 1.0, 1e-6);
+ REQUIRE_FEQ(white[1], 1.0, 1e-6);
+ REQUIRE_FEQ(white[2], 1.0, 1e-6);
+
+ pl_transform3x3_apply(&yuv2rgb, black);
+ REQUIRE_FEQ(black[0], 0.0, 1e-6);
+ REQUIRE_FEQ(black[1], 0.0, 1e-6);
+ REQUIRE_FEQ(black[2], 0.0, 1e-6);
+ }
+
+ // Make sure chromatic adaptation works
+ struct pl_raw_primaries bt709_d50;
+ bt709_d50 = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709);
+ bt709_d50.white = (struct pl_cie_xy) { 0.34567, 0.35850 };
+
+ pl_matrix3x3 d50_d65;
+ d50_d65 = pl_get_color_mapping_matrix(&bt709_d50, bt709, PL_INTENT_RELATIVE_COLORIMETRIC);
+
+ float white[3] = { 1.0, 1.0, 1.0 };
+ pl_matrix3x3_apply(&d50_d65, white);
+ REQUIRE_FEQ(white[0], 1.0, 1e-6);
+ REQUIRE_FEQ(white[1], 1.0, 1e-6);
+ REQUIRE_FEQ(white[2], 1.0, 1e-6);
+
+ // Simulate a typical 10-bit YCbCr -> 16 bit texture conversion
+ tv_repr.bits.color_depth = 10;
+ tv_repr.bits.sample_depth = 16;
+ pl_transform3x3 yuv2rgb;
+ yuv2rgb = pl_color_repr_decode(&tv_repr, NULL);
+ float test[3] = { 575/65535., 336/65535., 640/65535. };
+ pl_transform3x3_apply(&yuv2rgb, test);
+ REQUIRE_FEQ(test[0], 0.808305, 1e-6);
+ REQUIRE_FEQ(test[1], 0.553254, 1e-6);
+ REQUIRE_FEQ(test[2], 0.218841, 1e-6);
+
+ // DVD
+ REQUIRE_CMP(pl_color_system_guess_ycbcr(720, 480), ==, PL_COLOR_SYSTEM_BT_601, "u");
+ REQUIRE_CMP(pl_color_system_guess_ycbcr(720, 576), ==, PL_COLOR_SYSTEM_BT_601, "u");
+ REQUIRE_CMP(pl_color_primaries_guess(720, 576), ==, PL_COLOR_PRIM_BT_601_625, "u");
+ REQUIRE_CMP(pl_color_primaries_guess(720, 480), ==, PL_COLOR_PRIM_BT_601_525, "u");
+ // PAL 16:9
+ REQUIRE_CMP(pl_color_system_guess_ycbcr(1024, 576), ==, PL_COLOR_SYSTEM_BT_601, "u");
+ REQUIRE_CMP(pl_color_primaries_guess(1024, 576), ==, PL_COLOR_PRIM_BT_601_625, "u");
+ // HD
+ REQUIRE_CMP(pl_color_system_guess_ycbcr(1280, 720), ==, PL_COLOR_SYSTEM_BT_709, "u");
+ REQUIRE_CMP(pl_color_system_guess_ycbcr(1920, 1080), ==, PL_COLOR_SYSTEM_BT_709, "u");
+ REQUIRE_CMP(pl_color_primaries_guess(1280, 720), ==, PL_COLOR_PRIM_BT_709, "u");
+ REQUIRE_CMP(pl_color_primaries_guess(1920, 1080), ==, PL_COLOR_PRIM_BT_709, "u");
+
+ // Odd/weird videos
+ REQUIRE_CMP(pl_color_primaries_guess(2000, 576), ==, PL_COLOR_PRIM_BT_709, "u");
+ REQUIRE_CMP(pl_color_primaries_guess(200, 200), ==, PL_COLOR_PRIM_BT_709, "u");
+
+ REQUIRE(pl_color_repr_equal(&pl_color_repr_sdtv, &pl_color_repr_sdtv));
+ REQUIRE(!pl_color_repr_equal(&pl_color_repr_sdtv, &pl_color_repr_hdtv));
+
+ struct pl_color_repr repr = pl_color_repr_unknown;
+ pl_color_repr_merge(&repr, &pl_color_repr_uhdtv);
+ REQUIRE(pl_color_repr_equal(&repr, &pl_color_repr_uhdtv));
+
+ REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_UNKNOWN));
+ REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_601_525));
+ REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_601_625));
+ REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_709));
+ REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_470M));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_2020));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_APPLE));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_ADOBE));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_PRO_PHOTO));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_CIE_1931));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_DCI_P3));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_DISPLAY_P3));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_V_GAMUT));
+ REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_S_GAMUT));
+
+ struct pl_color_space space = pl_color_space_unknown;
+ pl_color_space_merge(&space, &pl_color_space_bt709);
+ REQUIRE(pl_color_space_equal(&space, &pl_color_space_bt709));
+
+ // Infer some color spaces
+ struct pl_color_space hlg = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_HLG,
+ };
+
+ pl_color_space_infer(&hlg);
+ REQUIRE_CMP(hlg.hdr.max_luma, ==, PL_COLOR_HLG_PEAK, "f");
+
+ struct pl_color_space unknown = {0};
+ struct pl_color_space display = {
+ .primaries = PL_COLOR_PRIM_BT_709,
+ .transfer = PL_COLOR_TRC_BT_1886,
+ };
+
+ pl_color_space_infer(&unknown);
+ pl_color_space_infer(&display);
+ REQUIRE(pl_color_space_equal(&unknown, &display));
+
+ float x, y;
+ pl_chroma_location_offset(PL_CHROMA_LEFT, &x, &y);
+ REQUIRE_CMP(x, ==, -0.5f, "f");
+ REQUIRE_CMP(y, ==, 0.0f, "f");
+ pl_chroma_location_offset(PL_CHROMA_TOP_LEFT, &x, &y);
+ REQUIRE_CMP(x, ==, -0.5f, "f");
+ REQUIRE_CMP(y, ==, -0.5f, "f");
+ pl_chroma_location_offset(PL_CHROMA_CENTER, &x, &y);
+ REQUIRE_CMP(x, ==, 0.0f, "f");
+ REQUIRE_CMP(y, ==, 0.0f, "f");
+ pl_chroma_location_offset(PL_CHROMA_BOTTOM_CENTER, &x, &y);
+ REQUIRE_CMP(x, ==, 0.0f, "f");
+ REQUIRE_CMP(y, ==, 0.5f, "f");
+
+ REQUIRE_CMP(pl_raw_primaries_get(PL_COLOR_PRIM_UNKNOWN), ==,
+ pl_raw_primaries_get(PL_COLOR_PRIM_BT_709), "p");
+
+ // Color blindness tests
+ float red[3] = { 1.0, 0.0, 0.0 };
+ float green[3] = { 0.0, 1.0, 0.0 };
+ float blue[3] = { 0.0, 0.0, 1.0 };
+
+#define TEST_CONE(model, color) \
+ do { \
+ float tmp[3] = { (color)[0], (color)[1], (color)[2] }; \
+ pl_matrix3x3 mat = pl_get_cone_matrix(&(model), bt709); \
+ pl_matrix3x3_apply(&mat, tmp); \
+ printf("%s + %s = %f %f %f\n", #model, #color, tmp[0], tmp[1], tmp[2]); \
+ for (int i = 0; i < 3; i++) \
+ REQUIRE_FEQ((color)[i], tmp[i], 1e-5f); \
+ } while(0)
+
+ struct pl_cone_params red_only = { .cones = PL_CONE_MS };
+ struct pl_cone_params green_only = { .cones = PL_CONE_LS };
+ struct pl_cone_params blue_only = pl_vision_monochromacy;
+
+ // These models should all round-trip white
+ TEST_CONE(pl_vision_normal, white);
+ TEST_CONE(pl_vision_protanopia, white);
+ TEST_CONE(pl_vision_protanomaly, white);
+ TEST_CONE(pl_vision_deuteranomaly, white);
+ TEST_CONE(pl_vision_tritanomaly, white);
+ TEST_CONE(pl_vision_achromatopsia, white);
+ TEST_CONE(red_only, white);
+ TEST_CONE(green_only, white);
+ TEST_CONE(blue_only, white);
+
+ // These models should round-trip blue
+ TEST_CONE(pl_vision_normal, blue);
+ TEST_CONE(pl_vision_protanomaly, blue);
+ TEST_CONE(pl_vision_deuteranomaly, blue);
+
+ // These models should round-trip red
+ TEST_CONE(pl_vision_normal, red);
+ TEST_CONE(pl_vision_tritanomaly, red);
+ TEST_CONE(pl_vision_tritanopia, red);
+
+ // These models should round-trip green
+ TEST_CONE(pl_vision_normal, green);
+
+ // Color adaptation tests
+ struct pl_cie_xy d65 = pl_white_from_temp(6504);
+ REQUIRE_FEQ(d65.x, 0.31271, 1e-3);
+ REQUIRE_FEQ(d65.y, 0.32902, 1e-3);
+ struct pl_cie_xy d55 = pl_white_from_temp(5503);
+ REQUIRE_FEQ(d55.x, 0.33242, 1e-3);
+ REQUIRE_FEQ(d55.y, 0.34743, 1e-3);
+
+ // Make sure we infer the correct set of metadata parameters
+#define TEST_METADATA(CSP, TYPE, MIN, MAX, AVG) \
+ do { \
+ float _min, _max, _avg; \
+ pl_color_space_nominal_luma_ex(pl_nominal_luma_params( \
+ .color = &(CSP), \
+ .metadata = TYPE, \
+ .scaling = PL_HDR_PQ, \
+ .out_min = &_min, \
+ .out_max = &_max, \
+ .out_avg = &_avg, \
+ )); \
+ const float _min_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, MIN); \
+ const float _max_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, MAX); \
+ const float _avg_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, AVG); \
+ REQUIRE_FEQ(_min, _min_ref, 1e-5); \
+ REQUIRE_FEQ(_max, _max_ref, 1e-5); \
+ REQUIRE_FEQ(_avg, _avg_ref, 1e-5); \
+ } while (0)
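+ // (the reference MIN/MAX/AVG are given in nits and converted to PQ before
+ // comparison against the inferred values)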
+
+ const struct pl_color_space hdr10plus = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_PQ,
+ .hdr = {
+ .min_luma = 0.005,
+ .max_luma = 4000,
+ .scene_max = {596.69, 1200, 500},
+ .scene_avg = 300,
+ },
+ };
+
+ REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_ANY));
+ REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_NONE));
+ REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_HDR10));
+ REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_HDR10PLUS));
+ REQUIRE(!pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_CIE_Y));
+
+ TEST_METADATA(hdr10plus, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, 10000, 0);
+ TEST_METADATA(hdr10plus, PL_HDR_METADATA_CIE_Y, PL_COLOR_HDR_BLACK, 4000, 0);
+ TEST_METADATA(hdr10plus, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 4000, 0);
+ TEST_METADATA(hdr10plus, PL_HDR_METADATA_HDR10PLUS, PL_COLOR_HDR_BLACK, 1000, 250);
+ TEST_METADATA(hdr10plus, PL_HDR_METADATA_ANY, PL_COLOR_HDR_BLACK, 1000, 250);
+
+ const struct pl_color_space dovi = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_PQ,
+ .hdr = {
+ .min_luma = 0.005,
+ .max_luma = 4000,
+ .max_pq_y = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 1000),
+ .avg_pq_y = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 250),
+ },
+ };
+
+ REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_ANY));
+ REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_NONE));
+ REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_HDR10));
+ REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_CIE_Y));
+ REQUIRE(!pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_HDR10PLUS));
+
+ TEST_METADATA(dovi, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, 10000, 0);
+ TEST_METADATA(dovi, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 4000, 0);
+ TEST_METADATA(dovi, PL_HDR_METADATA_HDR10PLUS, PL_COLOR_HDR_BLACK, 4000, 0);
+ TEST_METADATA(dovi, PL_HDR_METADATA_CIE_Y, PL_COLOR_HDR_BLACK, 1000, 250);
+ TEST_METADATA(dovi, PL_HDR_METADATA_ANY, PL_COLOR_HDR_BLACK, 1000, 250);
+
+ const struct pl_color_space hlg4000 = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_HLG,
+ .hdr.max_luma = 4000,
+ .hdr.min_luma = 0.005,
+ };
+
+ TEST_METADATA(hlg4000, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, PL_COLOR_HLG_PEAK, 0);
+ TEST_METADATA(hlg4000, PL_HDR_METADATA_HDR10, 0.005, 4000, 0);
+ TEST_METADATA(hlg4000, PL_HDR_METADATA_ANY, 0.005, 4000, 0);
+
+ const struct pl_color_space untagged = {
+ .primaries = PL_COLOR_PRIM_BT_709,
+ .transfer = PL_COLOR_TRC_BT_1886,
+ };
+
+ REQUIRE(pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_NONE));
+ REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_ANY));
+ REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_HDR10));
+ REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_CIE_Y));
+ REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_HDR10PLUS));
+
+ const float sdr_black = PL_COLOR_SDR_WHITE / PL_COLOR_SDR_CONTRAST;
+ TEST_METADATA(untagged, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0);
+ TEST_METADATA(untagged, PL_HDR_METADATA_ANY, sdr_black, PL_COLOR_SDR_WHITE, 0);
+
+ const struct pl_color_space sdr50 = {
+ .primaries = PL_COLOR_PRIM_BT_709,
+ .transfer = PL_COLOR_TRC_BT_1886,
+ .hdr.max_luma = 50,
+ };
+
+ REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_NONE));
+ REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_ANY));
+ REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_HDR10));
+ REQUIRE(!pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_CIE_Y));
+ REQUIRE(!pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_HDR10PLUS));
+
+ TEST_METADATA(sdr50, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0);
+ TEST_METADATA(sdr50, PL_HDR_METADATA_HDR10, 50 / PL_COLOR_SDR_CONTRAST, 50, 0);
+ TEST_METADATA(sdr50, PL_HDR_METADATA_ANY, 50 / PL_COLOR_SDR_CONTRAST, 50, 0);
+
+ const struct pl_color_space sdr10k = {
+ .primaries = PL_COLOR_PRIM_BT_709,
+ .transfer = PL_COLOR_TRC_BT_1886,
+ .hdr.min_luma = PL_COLOR_SDR_WHITE / 10000,
+ };
+
+ REQUIRE(pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_NONE));
+ REQUIRE(!pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_ANY));
+ REQUIRE(!pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_HDR10));
+ TEST_METADATA(sdr10k, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0);
+ TEST_METADATA(sdr10k, PL_HDR_METADATA_HDR10, PL_COLOR_SDR_WHITE / 10000, PL_COLOR_SDR_WHITE, 0);
+ TEST_METADATA(sdr10k, PL_HDR_METADATA_ANY, PL_COLOR_SDR_WHITE / 10000, PL_COLOR_SDR_WHITE, 0);
+
+ const struct pl_color_space bogus_vals = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_HLG,
+ .hdr.min_luma = 1e-9,
+ .hdr.max_luma = 1000000,
+ };
+
+ const struct pl_color_space bogus_flip = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_PQ,
+ .hdr.min_luma = 4000,
+ .hdr.max_luma = 0.05,
+ };
+
+ const struct pl_color_space bogus_sign = {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_HLG,
+ .hdr.min_luma = -0.5,
+ .hdr.max_luma = -4000,
+ };
+
+ TEST_METADATA(bogus_vals, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 10000, 0);
+ TEST_METADATA(bogus_flip, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 10000, 0);
+ TEST_METADATA(bogus_sign, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, PL_COLOR_HLG_PEAK, 0);
+}
diff --git a/src/tests/common.c b/src/tests/common.c
new file mode 100644
index 0000000..849971e
--- /dev/null
+++ b/src/tests/common.c
@@ -0,0 +1,136 @@
+#include "tests.h"
+
+static int irand()
+{
+ return rand() - RAND_MAX / 2;
+}
+
+int main()
+{
+ pl_log log = pl_test_logger();
+ pl_log_update(log, NULL);
+ pl_log_destroy(&log);
+
+ // Test some misc helper functions
+ pl_rect2d rc2 = {
+ irand(), irand(),
+ irand(), irand(),
+ };
+
+ pl_rect3d rc3 = {
+ irand(), irand(), irand(),
+ irand(), irand(), irand(),
+ };
+
+ pl_rect2d_normalize(&rc2);
+ REQUIRE_CMP(rc2.x1, >=, rc2.x0, "d");
+ REQUIRE_CMP(rc2.y1, >=, rc2.y0, "d");
+
+ pl_rect3d_normalize(&rc3);
+ REQUIRE_CMP(rc3.x1, >=, rc3.x0, "d");
+ REQUIRE_CMP(rc3.y1, >=, rc3.y0, "d");
+ REQUIRE_CMP(rc3.z1, >=, rc3.z0, "d");
+
+ pl_rect2df rc2f = {
+ RANDOM, RANDOM,
+ RANDOM, RANDOM,
+ };
+
+ pl_rect3df rc3f = {
+ RANDOM, RANDOM, RANDOM,
+ RANDOM, RANDOM, RANDOM,
+ };
+
+ pl_rect2df_normalize(&rc2f);
+ REQUIRE_CMP(rc2f.x1, >=, rc2f.x0, "f");
+ REQUIRE_CMP(rc2f.y1, >=, rc2f.y0, "f");
+
+ pl_rect3df_normalize(&rc3f);
+ REQUIRE_CMP(rc3f.x1, >=, rc3f.x0, "f");
+ REQUIRE_CMP(rc3f.y1, >=, rc3f.y0, "f");
+ REQUIRE_CMP(rc3f.z1, >=, rc3f.z0, "f");
+
+ pl_rect2d rc2r = pl_rect2df_round(&rc2f);
+ pl_rect3d rc3r = pl_rect3df_round(&rc3f);
+
+ REQUIRE_CMP(fabs(rc2r.x0 - rc2f.x0), <=, 0.5, "f");
+ REQUIRE_CMP(fabs(rc2r.x1 - rc2f.x1), <=, 0.5, "f");
+ REQUIRE_CMP(fabs(rc2r.y0 - rc2f.y0), <=, 0.5, "f");
+ REQUIRE_CMP(fabs(rc2r.y1 - rc2f.y1), <=, 0.5, "f");
+
+ REQUIRE_CMP(fabs(rc3r.x0 - rc3f.x0), <=, 0.5, "f");
+ REQUIRE_CMP(fabs(rc3r.x1 - rc3f.x1), <=, 0.5, "f");
+ REQUIRE_CMP(fabs(rc3r.y0 - rc3f.y0), <=, 0.5, "f");
+ REQUIRE_CMP(fabs(rc3r.y1 - rc3f.y1), <=, 0.5, "f");
+ REQUIRE_CMP(fabs(rc3r.z0 - rc3f.z0), <=, 0.5, "f");
+ REQUIRE_CMP(fabs(rc3r.z1 - rc3f.z1), <=, 0.5, "f");
+
+ pl_transform3x3 tr = {
+ .mat = {{
+ { RANDOM, RANDOM, RANDOM },
+ { RANDOM, RANDOM, RANDOM },
+ { RANDOM, RANDOM, RANDOM },
+ }},
+ .c = { RANDOM, RANDOM, RANDOM },
+ };
+
+ pl_transform3x3 tr2 = tr;
+ float scale = 1.0 + RANDOM;
+ pl_transform3x3_scale(&tr2, scale);
+ pl_transform3x3_invert(&tr2);
+ pl_transform3x3_invert(&tr2);
+ pl_transform3x3_scale(&tr2, 1.0 / scale);
+
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ printf("%f %f\n", tr.mat.m[i][j], tr2.mat.m[i][j]);
+ REQUIRE_FEQ(tr.mat.m[i][j], tr2.mat.m[i][j], 1e-4);
+ }
+ REQUIRE_FEQ(tr.c[i], tr2.c[i], 1e-4);
+ }
+
+ // Test aspect ratio code
+ const pl_rect2df rc1080p = {0, 0, 1920, 1080};
+ const pl_rect2df rc43 = {0, 0, 1024, 768};
+ pl_rect2df rc;
+
+ REQUIRE_FEQ(pl_rect2df_aspect(&rc1080p), 16.0/9.0, 1e-8);
+ REQUIRE_FEQ(pl_rect2df_aspect(&rc43), 4.0/3.0, 1e-8);
+
+#define pl_rect2df_midx(rc) (((rc).x0 + (rc).x1) / 2.0)
+#define pl_rect2df_midy(rc) (((rc).y0 + (rc).y1) / 2.0)
+
+ for (float aspect = 0.2; aspect < 3.0; aspect += 0.4) {
+ for (float scan = 0.0; scan <= 1.0; scan += 0.5) {
+ rc = rc1080p;
+ pl_rect2df_aspect_set(&rc, aspect, scan);
+ printf("aspect %.2f, panscan %.1f: {%f %f} -> {%f %f}\n",
+ aspect, scan, rc.x0, rc.y0, rc.x1, rc.y1);
+ REQUIRE_FEQ(pl_rect2df_aspect(&rc), aspect, 1e-6);
+ REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc1080p), 1e-6);
+ REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc1080p), 1e-6);
+ }
+ }
+
+ rc = rc1080p;
+ pl_rect2df_aspect_fit(&rc, &rc43, 0.0);
+ REQUIRE_FEQ(pl_rect2df_aspect(&rc), pl_rect2df_aspect(&rc43), 1e-6);
+ REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc1080p), 1e-6);
+ REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc1080p), 1e-6);
+ REQUIRE_FEQ(pl_rect_w(rc), pl_rect_w(rc43), 1e-6);
+ REQUIRE_FEQ(pl_rect_h(rc), pl_rect_h(rc43), 1e-6);
+
+ rc = rc43;
+ pl_rect2df_aspect_fit(&rc, &rc1080p, 0.0);
+ REQUIRE_FEQ(pl_rect2df_aspect(&rc), pl_rect2df_aspect(&rc1080p), 1e-6);
+ REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc43), 1e-6);
+ REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc43), 1e-6);
+ REQUIRE_FEQ(pl_rect_w(rc), pl_rect_w(rc43), 1e-6);
+
+ rc = (pl_rect2df) { 1920, 1080, 0, 0 };
+ pl_rect2df_offset(&rc, 50, 100);
+ REQUIRE_FEQ(rc.x0, 1870, 1e-6);
+ REQUIRE_FEQ(rc.x1, -50, 1e-6);
+ REQUIRE_FEQ(rc.y0, 980, 1e-6);
+ REQUIRE_FEQ(rc.y1, -100, 1e-6);
+}
diff --git a/src/tests/d3d11.c b/src/tests/d3d11.c
new file mode 100644
index 0000000..256af35
--- /dev/null
+++ b/src/tests/d3d11.c
@@ -0,0 +1,59 @@
+#include "gpu_tests.h"
+#include "d3d11/gpu.h"
+#include <dxgi1_2.h>
+
+#include <libplacebo/d3d11.h>
+
+int main()
+{
+ pl_log log = pl_test_logger();
+ IDXGIFactory1 *factory = NULL;
+ IDXGIAdapter1 *adapter1 = NULL;
+ HRESULT hr;
+
+ HMODULE dxgi = LoadLibraryW(L"dxgi.dll");
+ if (!dxgi)
+ return SKIP;
+
+ __typeof__(&CreateDXGIFactory1) pCreateDXGIFactory1 =
+ (void *) GetProcAddress(dxgi, "CreateDXGIFactory1");
+ if (!pCreateDXGIFactory1)
+ return SKIP;
+
+ hr = pCreateDXGIFactory1(&IID_IDXGIFactory1, (void **) &factory);
+ if (FAILED(hr)) {
+ printf("Failed to create DXGI factory\n");
+ return SKIP;
+ }
+
+ // Test all attached devices
+ for (int i = 0;; i++) {
+ hr = IDXGIFactory1_EnumAdapters1(factory, i, &adapter1);
+ if (hr == DXGI_ERROR_NOT_FOUND)
+ break;
+ if (FAILED(hr)) {
+ printf("Failed to enumerate adapters\n");
+ return SKIP;
+ }
+
+ DXGI_ADAPTER_DESC1 desc;
+ hr = IDXGIAdapter1_GetDesc1(adapter1, &desc);
+ if (FAILED(hr)) {
+ printf("Failed to enumerate adapters\n");
+ return SKIP;
+ }
+ SAFE_RELEASE(adapter1);
+
+ const struct pl_d3d11_t *d3d11 = pl_d3d11_create(log, pl_d3d11_params(
+ .debug = true,
+ .adapter_luid = desc.AdapterLuid,
+ ));
+ REQUIRE(d3d11);
+
+ gpu_shader_tests(d3d11->gpu);
+
+ pl_d3d11_destroy(&d3d11);
+ }
+
+ SAFE_RELEASE(factory);
+}
diff --git a/src/tests/dav1d.c b/src/tests/dav1d.c
new file mode 100644
index 0000000..7e2439f
--- /dev/null
+++ b/src/tests/dav1d.c
@@ -0,0 +1,45 @@
+#include "tests.h"
+#include "libplacebo/utils/dav1d.h"
+
+int main()
+{
+ // Test enum functions
+ for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) {
+ // Exceptions to the rule, due to different handling in dav1d
+ if (sys == PL_COLOR_SYSTEM_BT_2100_HLG || sys == PL_COLOR_SYSTEM_XYZ)
+ continue;
+
+ enum Dav1dMatrixCoefficients mc = pl_system_to_dav1d(sys);
+ enum pl_color_system sys2 = pl_system_from_dav1d(mc);
+ if (sys2)
+ REQUIRE_CMP(sys, ==, sys2, "u");
+ }
+
+ for (enum pl_color_levels lev = 0; lev < PL_COLOR_LEVELS_COUNT; lev++) {
+ int range = pl_levels_to_dav1d(lev);
+ enum pl_color_levels lev2 = pl_levels_from_dav1d(range);
+ if (lev != PL_COLOR_LEVELS_UNKNOWN)
+ REQUIRE_CMP(lev, ==, lev2, "u");
+ }
+
+ for (enum pl_color_primaries prim = 0; prim < PL_COLOR_PRIM_COUNT; prim++) {
+ enum Dav1dColorPrimaries dpri = pl_primaries_to_dav1d(prim);
+ enum pl_color_primaries prim2 = pl_primaries_from_dav1d(dpri);
+ if (prim2)
+ REQUIRE_CMP(prim, ==, prim2, "u");
+ }
+
+ for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) {
+ enum Dav1dTransferCharacteristics dtrc = pl_transfer_to_dav1d(trc);
+ enum pl_color_transfer trc2 = pl_transfer_from_dav1d(dtrc);
+ if (trc2)
+ REQUIRE_CMP(trc, ==, trc2, "u");
+ }
+
+ for (enum pl_chroma_location loc = 0; loc < PL_CHROMA_COUNT; loc++) {
+ enum Dav1dChromaSamplePosition dloc = pl_chroma_to_dav1d(loc);
+ enum pl_chroma_location loc2 = pl_chroma_from_dav1d(dloc);
+ if (loc2)
+ REQUIRE_CMP(loc, ==, loc2, "u");
+ }
+}
diff --git a/src/tests/dither.c b/src/tests/dither.c
new file mode 100644
index 0000000..c9f639c
--- /dev/null
+++ b/src/tests/dither.c
@@ -0,0 +1,41 @@
+#include "tests.h"
+
+#include <libplacebo/dither.h>
+#include <libplacebo/shaders/dithering.h>
+
+#define SHIFT 4
+#define SIZE (1 << SHIFT)
+float data[SIZE][SIZE];
+
+int main()
+{
+ printf("Ordered dither matrix:\n");
+ pl_generate_bayer_matrix(&data[0][0], SIZE);
+ for (int y = 0; y < SIZE; y++) {
+ for (int x = 0; x < SIZE; x++)
+ printf(" %3d", (int)(data[y][x] * SIZE * SIZE));
+ printf("\n");
+ }
+
+ printf("Blue noise dither matrix:\n");
+ pl_generate_blue_noise(&data[0][0], SHIFT);
+ for (int y = 0; y < SIZE; y++) {
+ for (int x = 0; x < SIZE; x++)
+ printf(" %3d", (int)(data[y][x] * SIZE * SIZE));
+ printf("\n");
+ }
+
+ // Generate an example of a dither shader
+ pl_log log = pl_test_logger();
+ pl_shader sh = pl_shader_alloc(log, NULL);
+ pl_shader_obj obj = NULL;
+
+ pl_shader_dither(sh, 8, &obj, NULL);
+ const struct pl_shader_res *res = pl_shader_finalize(sh);
+ REQUIRE(res);
+ printf("Generated dither shader:\n%s\n", res->glsl);
+
+ pl_shader_obj_destroy(&obj);
+ pl_shader_free(&sh);
+ pl_log_destroy(&log);
+}
diff --git a/src/tests/dummy.c b/src/tests/dummy.c
new file mode 100644
index 0000000..0e87a2c
--- /dev/null
+++ b/src/tests/dummy.c
@@ -0,0 +1,70 @@
+#include "gpu_tests.h"
+
+#include <libplacebo/dummy.h>
+
+int main()
+{
+ pl_log log = pl_test_logger();
+ pl_gpu gpu = pl_gpu_dummy_create(log, NULL);
+ pl_buffer_tests(gpu);
+ pl_texture_tests(gpu);
+
+ // Attempt creating a shader and accessing the resulting LUT
+ pl_tex dummy = pl_tex_dummy_create(gpu, pl_tex_dummy_params(
+ .w = 100,
+ .h = 100,
+ .format = pl_find_named_fmt(gpu, "rgba8"),
+ ));
+
+ struct pl_sample_src src = {
+ .tex = dummy,
+ .new_w = 1000,
+ .new_h = 1000,
+ };
+
+ pl_shader_obj lut = NULL;
+ struct pl_sample_filter_params filter_params = {
+ .filter = pl_filter_ewa_lanczos,
+ .lut = &lut,
+ };
+
+ pl_shader sh = pl_shader_alloc(log, pl_shader_params( .gpu = gpu ));
+ REQUIRE(pl_shader_sample_polar(sh, &src, &filter_params));
+ const struct pl_shader_res *res = pl_shader_finalize(sh);
+ REQUIRE(res);
+
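+    // Walk the finalized shader's descriptors looking for the LUT generated
+    // by pl_shader_sample_polar(); pl_tex_dummy_data() exposes the CPU-side
+    // backing of dummy textures, so its contents can be inspected below.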
+ for (int n = 0; n < res->num_descriptors; n++) {
+ const struct pl_shader_desc *sd = &res->descriptors[n];
+ if (sd->desc.type != PL_DESC_SAMPLED_TEX)
+ continue;
+
+ pl_tex tex = sd->binding.object;
+ const float *data = (float *) pl_tex_dummy_data(tex);
+ if (!data)
+ continue; // means this was the `dummy` texture
+
+#ifdef PRINT_LUTS
+ for (int i = 0; i < tex->params.w; i++)
+ printf("lut[%d] = %f\n", i, data[i]);
+#endif
+ }
+
+ // Try out generation of the sampler2D interface
+ src.tex = NULL;
+ src.tex_w = 100;
+ src.tex_h = 100;
+ src.format = PL_FMT_UNORM;
+ src.sampler = PL_SAMPLER_NORMAL;
+ src.mode = PL_TEX_SAMPLE_LINEAR;
+
+ pl_shader_reset(sh, pl_shader_params( .gpu = gpu ));
+ REQUIRE(pl_shader_sample_polar(sh, &src, &filter_params));
+ REQUIRE((res = pl_shader_finalize(sh)));
+ REQUIRE_CMP(res->input, ==, PL_SHADER_SIG_SAMPLER, "u");
+
+ pl_shader_free(&sh);
+ pl_shader_obj_destroy(&lut);
+ pl_tex_destroy(gpu, &dummy);
+ pl_gpu_dummy_destroy(&gpu);
+ pl_log_destroy(&log);
+}
diff --git a/src/tests/filters.c b/src/tests/filters.c
new file mode 100644
index 0000000..b6b323c
--- /dev/null
+++ b/src/tests/filters.c
@@ -0,0 +1,81 @@
+#include "tests.h"
+
+#include <libplacebo/filters.h>
+
+int main()
+{
+ pl_log log = pl_test_logger();
+
+ for (int i = 0; i < pl_num_filter_functions; i++) {
+ const struct pl_filter_function *fun = pl_filter_functions[i];
+ if (fun->opaque)
+ continue;
+
+ printf("Testing filter function '%s'\n", fun->name);
+
+ struct pl_filter_ctx ctx = { .radius = fun->radius };
+ memcpy(ctx.params, fun->params, sizeof(ctx.params));
+
+ // Ensure the kernel is correctly scaled
+ REQUIRE_FEQ(fun->weight(&ctx, 0.0), 1.0, 1e-7);
+
+        // Only box filters have a radius of 1; these are unwindowed by design.
+ // Gaussian technically never reaches 0 even at its preconfigured radius.
+ if (fun->radius > 1.0 && fun != &pl_filter_function_gaussian)
+ REQUIRE_FEQ(fun->weight(&ctx, fun->radius), 0.0, 1e-7);
+ }
+
+ for (int c = 0; c < pl_num_filter_configs; c++) {
+ const struct pl_filter_config *conf = pl_filter_configs[c];
+ if (conf->kernel->opaque)
+ continue;
+
+ printf("Testing filter config '%s'\n", conf->name);
+ pl_filter flt = pl_filter_generate(log, pl_filter_params(
+ .config = *conf,
+ .lut_entries = 256,
+ .cutoff = 1e-3,
+ ));
+ REQUIRE(flt);
+ const float radius = PL_DEF(conf->radius, conf->kernel->radius);
+ REQUIRE_CMP(flt->radius, <=, radius, "f");
+ REQUIRE_CMP(flt->radius_zero, >, 0.0, "f");
+ REQUIRE_CMP(flt->radius_zero, <=, flt->radius, "f");
+
+ if (conf->polar) {
+
+ // Test LUT accuracy
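+            // Emulate the GPU's linear interpolation between adjacent LUT
+            // entries and compare against direct evaluation of the filter via
+            // pl_filter_sample(), rescaled so the center tap matches.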
+ const int range = flt->params.lut_entries - 1;
+ double scale = flt->weights[0] / pl_filter_sample(conf, 0.0);
+ double err = 0.0;
+ for (float k = 0.0; k <= 1.0; k += 1e-3f) {
+ double ref = scale * pl_filter_sample(conf, k * flt->radius);
+ double idx = k * range;
+ int base = floorf(idx);
+ double fpart = idx - base;
+ int next = PL_MIN(base + 1, range);
+ double interp = PL_MIX(flt->weights[base], flt->weights[next], fpart);
+ err = fmaxf(err, fabs(interp - ref));
+ }
+ REQUIRE_CMP(err, <=, 1e-4, "g");
+
+ } else {
+
+ // Ensure the weights for each row add up to unity
+ for (int i = 0; i < flt->params.lut_entries; i++) {
+ const float *row = flt->weights + i * flt->row_stride;
+ float sum = 0.0;
+ REQUIRE(flt->row_size);
+ REQUIRE_CMP(flt->row_stride, >=, flt->row_size, "d");
+ for (int n = 0; n < flt->row_size; n++)
+ sum += row[n];
+ REQUIRE_FEQ(sum, 1.0, 1e-6);
+ }
+
+ }
+
+ pl_filter_free(&flt);
+ }
+
+ pl_log_destroy(&log);
+}
diff --git a/src/tests/fuzz/lut.c b/src/tests/fuzz/lut.c
new file mode 100644
index 0000000..24e5f89
--- /dev/null
+++ b/src/tests/fuzz/lut.c
@@ -0,0 +1,24 @@
+#include "../tests.h"
+
+#include <libplacebo/shaders/lut.h>
+
+__AFL_FUZZ_INIT();
+
+#pragma clang optimize off
+
+int main()
+{
+ struct pl_custom_lut *lut;
+
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+ __AFL_INIT();
+#endif
+
+ unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;
+
+ while (__AFL_LOOP(100000)) {
+ size_t len = __AFL_FUZZ_TESTCASE_LEN;
+ lut = pl_lut_parse_cube(NULL, (char *) buf, len);
+ pl_lut_free(&lut);
+ }
+}
diff --git a/src/tests/fuzz/options.c b/src/tests/fuzz/options.c
new file mode 100644
index 0000000..c88e462
--- /dev/null
+++ b/src/tests/fuzz/options.c
@@ -0,0 +1,26 @@
+#include "../tests.h"
+
+#include <libplacebo/options.h>
+
+__AFL_FUZZ_INIT();
+
+#pragma clang optimize off
+
+int main()
+{
+ pl_options opts = pl_options_alloc(NULL);
+
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+ __AFL_INIT();
+#endif
+
+ unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;
+
+ while (__AFL_LOOP(100000)) {
+ size_t len = __AFL_FUZZ_TESTCASE_LEN;
+ buf[len - 1] = '\0'; // ensure proper null termination
+ pl_options_load(opts, (const char *) buf);
+ pl_options_save(opts);
+ pl_options_reset(opts, NULL);
+ }
+}
diff --git a/src/tests/fuzz/shaders.c b/src/tests/fuzz/shaders.c
new file mode 100644
index 0000000..2e3e92c
--- /dev/null
+++ b/src/tests/fuzz/shaders.c
@@ -0,0 +1,166 @@
+#include "../tests.h"
+#include "shaders.h"
+
+#include <libplacebo/dummy.h>
+#include <libplacebo/shaders/colorspace.h>
+#include <libplacebo/shaders/custom.h>
+#include <libplacebo/shaders/sampling.h>
+
+__AFL_FUZZ_INIT();
+
+#pragma clang optimize off
+
+int main()
+{
+ pl_gpu gpu = pl_gpu_dummy_create(NULL, NULL);
+
+#define WIDTH 64
+#define HEIGHT 64
+#define COMPS 4
+
+ static const float empty[HEIGHT][WIDTH][COMPS] = {0};
+
+ struct pl_sample_src src = {
+ .tex = pl_tex_create(gpu, pl_tex_params(
+ .format = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, 0, 32, PL_FMT_CAP_SAMPLEABLE),
+ .initial_data = empty,
+ .sampleable = true,
+ .w = WIDTH,
+ .h = HEIGHT,
+ )),
+ .new_w = WIDTH * 2,
+ .new_h = HEIGHT * 2,
+ };
+
+ if (!src.tex)
+ return 1;
+
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+ __AFL_INIT();
+#endif
+
+ unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;
+ while (__AFL_LOOP(10000)) {
+
+#define STACK_SIZE 16
+ pl_shader stack[STACK_SIZE] = {0};
+ int idx = 0;
+
+ stack[0] = pl_shader_alloc(NULL, pl_shader_params(
+ .gpu = gpu,
+ ));
+
+ pl_shader sh = stack[idx];
+ pl_shader_obj polar = NULL, ortho = NULL, peak = NULL, dither = NULL;
+
+ size_t len = __AFL_FUZZ_TESTCASE_LEN;
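+        // Interpret each input byte as an opcode: sampling steps, colorspace
+        // steps, or '(' / ')' to push and pop subshaders, so the fuzzer can
+        // explore arbitrary sequences of shader merges.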
+ for (size_t pos = 0; pos < len; pos++) {
+ switch (buf[pos]) {
+ // Sampling steps
+ case 'S':
+ pl_shader_sample_direct(sh, &src);
+ break;
+ case 'D':
+ pl_shader_deband(sh, &src, NULL);
+ break;
+ case 'P':
+ pl_shader_sample_polar(sh, &src, pl_sample_filter_params(
+ .filter = pl_filter_ewa_lanczos,
+ .lut = &polar,
+ ));
+ break;
+ case 'O': ;
+ struct pl_sample_src srcfix = src;
+ srcfix.new_w = WIDTH;
+ pl_shader_sample_ortho2(sh, &srcfix, pl_sample_filter_params(
+ .filter = pl_filter_spline36,
+ .lut = &ortho,
+ ));
+ break;
+ case 'X':
+ pl_shader_custom(sh, &(struct pl_custom_shader) {
+ .input = PL_SHADER_SIG_NONE,
+ .output = PL_SHADER_SIG_COLOR,
+ .body = "// merge subpasses",
+ });
+ break;
+
+ // Colorspace transformation steps
+ case 'y': {
+ struct pl_color_repr repr = pl_color_repr_jpeg;
+ pl_shader_decode_color(sh, &repr, NULL);
+ break;
+ }
+ case 'p':
+ pl_shader_detect_peak(sh, pl_color_space_hdr10, &peak, NULL);
+ break;
+ case 'm':
+ pl_shader_color_map(sh, NULL, pl_color_space_bt709,
+ pl_color_space_monitor, NULL, false);
+ break;
+ case 't':
+ pl_shader_color_map(sh, NULL, pl_color_space_hdr10,
+ pl_color_space_monitor, &peak, false);
+ break;
+ case 'd':
+ pl_shader_dither(sh, 8, &dither, pl_dither_params(
+ // Picked to speed up calculation
+ .method = PL_DITHER_ORDERED_LUT,
+ .lut_size = 2,
+ ));
+ break;
+
+ // Push and pop subshader commands
+ case '(':
+ if (idx+1 == STACK_SIZE)
+ goto invalid;
+
+ idx++;
+ if (!stack[idx]) {
+ stack[idx] = pl_shader_alloc(NULL, pl_shader_params(
+ .gpu = gpu,
+ .id = idx,
+ ));
+ }
+ sh = stack[idx];
+ break;
+
+ case ')':
+ if (idx == 0)
+ goto invalid;
+
+ idx--;
+ sh_subpass(stack[idx], stack[idx + 1]);
+ pl_shader_reset(stack[idx + 1], pl_shader_params(
+ .gpu = gpu,
+ .id = idx + 1,
+ ));
+ sh = stack[idx];
+ break;
+
+ default:
+ goto invalid;
+ }
+ }
+
+ // Merge remaining shaders
+ while (idx > 0) {
+ sh_subpass(stack[idx - 1], stack[idx]);
+ idx--;
+ }
+
+ pl_shader_finalize(stack[0]);
+
+invalid:
+ for (int i = 0; i < STACK_SIZE; i++)
+ pl_shader_free(&stack[i]);
+
+ pl_shader_obj_destroy(&polar);
+ pl_shader_obj_destroy(&ortho);
+ pl_shader_obj_destroy(&peak);
+ pl_shader_obj_destroy(&dither);
+ }
+
+ pl_tex_destroy(gpu, &src.tex);
+ pl_gpu_dummy_destroy(&gpu);
+}
diff --git a/src/tests/fuzz/user_shaders.c b/src/tests/fuzz/user_shaders.c
new file mode 100644
index 0000000..bbb98c8
--- /dev/null
+++ b/src/tests/fuzz/user_shaders.c
@@ -0,0 +1,28 @@
+#include "../tests.h"
+
+#include <libplacebo/dummy.h>
+#include <libplacebo/shaders/custom.h>
+
+__AFL_FUZZ_INIT();
+
+#pragma clang optimize off
+
+int main()
+{
+ pl_gpu gpu = pl_gpu_dummy_create(NULL, NULL);
+ const struct pl_hook *hook;
+
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+ __AFL_INIT();
+#endif
+
+ unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;
+
+ while (__AFL_LOOP(100000)) {
+ size_t len = __AFL_FUZZ_TESTCASE_LEN;
+ hook = pl_mpv_user_shader_parse(gpu, (char *) buf, len);
+ pl_mpv_user_shader_destroy(&hook);
+ }
+
+ pl_gpu_dummy_destroy(&gpu);
+}
diff --git a/src/tests/gpu_tests.h b/src/tests/gpu_tests.h
new file mode 100644
index 0000000..f14f260
--- /dev/null
+++ b/src/tests/gpu_tests.h
@@ -0,0 +1,1741 @@
+#include "tests.h"
+#include "shaders.h"
+
+#include <libplacebo/renderer.h>
+#include <libplacebo/utils/frame_queue.h>
+#include <libplacebo/utils/upload.h>
+
+//#define PRINT_OUTPUT
+
+static void pl_buffer_tests(pl_gpu gpu)
+{
+ const size_t buf_size = 1024;
+ if (buf_size > gpu->limits.max_buf_size)
+ return;
+
+ uint8_t *test_src = malloc(buf_size * 2);
+ uint8_t *test_dst = test_src + buf_size;
+ assert(test_src && test_dst);
+ memset(test_dst, 0, buf_size);
+ for (int i = 0; i < buf_size; i++)
+ test_src[i] = RANDOM_U8;
+
+ pl_buf buf = NULL, tbuf = NULL;
+
+ printf("test buffer static creation and readback\n");
+ buf = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .host_readable = true,
+ .initial_data = test_src,
+ ));
+
+ REQUIRE(buf);
+ REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size));
+ REQUIRE_MEMEQ(test_src, test_dst, buf_size);
+ pl_buf_destroy(gpu, &buf);
+
+ printf("test buffer empty creation, update and readback\n");
+ memset(test_dst, 0, buf_size);
+ buf = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .host_writable = true,
+ .host_readable = true,
+ ));
+
+ REQUIRE(buf);
+ pl_buf_write(gpu, buf, 0, test_src, buf_size);
+ REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size));
+ REQUIRE_MEMEQ(test_src, test_dst, buf_size);
+ pl_buf_destroy(gpu, &buf);
+
+ printf("test buffer-buffer copy and readback\n");
+ memset(test_dst, 0, buf_size);
+ buf = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .initial_data = test_src,
+ ));
+
+ tbuf = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .host_readable = true,
+ ));
+
+ REQUIRE(buf && tbuf);
+ pl_buf_copy(gpu, tbuf, 0, buf, 0, buf_size);
+ REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
+ REQUIRE_MEMEQ(test_src, test_dst, buf_size);
+ pl_buf_destroy(gpu, &buf);
+ pl_buf_destroy(gpu, &tbuf);
+
+ if (buf_size <= gpu->limits.max_mapped_size) {
+ printf("test host mapped buffer readback\n");
+ buf = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .host_mapped = true,
+ .initial_data = test_src,
+ ));
+
+ REQUIRE(buf);
+ REQUIRE(!pl_buf_poll(gpu, buf, 0));
+ REQUIRE_MEMEQ(test_src, buf->data, buf_size);
+ pl_buf_destroy(gpu, &buf);
+ }
+
+ // `compute_queues` check is to exclude dummy GPUs here
+ if (buf_size <= gpu->limits.max_ssbo_size && gpu->limits.compute_queues)
+ {
+ printf("test endian swapping\n");
+ buf = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .storable = true,
+ .initial_data = test_src,
+ ));
+
+ tbuf = pl_buf_create(gpu, pl_buf_params(
+ .size = buf_size,
+ .storable = true,
+ .host_readable = true,
+ ));
+
+ REQUIRE(buf && tbuf);
+ REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) {
+ .src = buf,
+ .dst = tbuf,
+ .size = buf_size,
+ .wordsize = 2,
+ }));
+ REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
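+        // With wordsize 2, every 16-bit word should have its two bytes
+        // exchanged: source byte 2i must land at destination byte 2i+1 and
+        // vice versa.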
+ for (int i = 0; i < buf_size / 2; i++) {
+ REQUIRE_CMP(test_src[2 * i + 0], ==, test_dst[2 * i + 1], PRIu8);
+ REQUIRE_CMP(test_src[2 * i + 1], ==, test_dst[2 * i + 0], PRIu8);
+ }
+ // test endian swap in-place
+ REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) {
+ .src = tbuf,
+ .dst = tbuf,
+ .size = buf_size,
+ .wordsize = 4,
+ }));
+ REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size));
+ for (int i = 0; i < buf_size / 4; i++) {
+ REQUIRE_CMP(test_src[4 * i + 0], ==, test_dst[4 * i + 2], PRIu8);
+ REQUIRE_CMP(test_src[4 * i + 1], ==, test_dst[4 * i + 3], PRIu8);
+ REQUIRE_CMP(test_src[4 * i + 2], ==, test_dst[4 * i + 0], PRIu8);
+ REQUIRE_CMP(test_src[4 * i + 3], ==, test_dst[4 * i + 1], PRIu8);
+ }
+ pl_buf_destroy(gpu, &buf);
+ pl_buf_destroy(gpu, &tbuf);
+ }
+
+ free(test_src);
+}
+
+static void test_cb(void *priv)
+{
+ bool *flag = priv;
+ *flag = true;
+}
+
+static void pl_test_roundtrip(pl_gpu gpu, pl_tex tex[2],
+ uint8_t *src, uint8_t *dst)
+{
+ if (!tex[0] || !tex[1]) {
+ printf("failed creating test textures... skipping this test\n");
+ return;
+ }
+
+ int texels = tex[0]->params.w;
+ texels *= tex[0]->params.h ? tex[0]->params.h : 1;
+ texels *= tex[0]->params.d ? tex[0]->params.d : 1;
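+    // (unused dimensions report 0, so collapse them to 1 for the texel count)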
+
+ pl_fmt fmt = tex[0]->params.format;
+ size_t bytes = texels * fmt->texel_size;
+ memset(src, 0, bytes);
+ memset(dst, 0, bytes);
+
+ for (size_t i = 0; i < bytes; i++)
+ src[i] = RANDOM_U8;
+
+ pl_timer ul, dl;
+ ul = pl_timer_create(gpu);
+ dl = pl_timer_create(gpu);
+
+ bool ran_ul = false, ran_dl = false;
+
+ REQUIRE(pl_tex_upload(gpu, &(struct pl_tex_transfer_params){
+ .tex = tex[0],
+ .ptr = src,
+ .timer = ul,
+ .callback = gpu->limits.callbacks ? test_cb : NULL,
+ .priv = &ran_ul,
+ }));
+
+ // Test blitting, if possible for this format
+ pl_tex dst_tex = tex[0];
+ if (tex[0]->params.blit_src && tex[1]->params.blit_dst) {
+ pl_tex_clear_ex(gpu, tex[1], (union pl_clear_color){0}); // for testing
+ pl_tex_blit(gpu, &(struct pl_tex_blit_params) {
+ .src = tex[0],
+ .dst = tex[1],
+ });
+ dst_tex = tex[1];
+ }
+
+ REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params){
+ .tex = dst_tex,
+ .ptr = dst,
+ .timer = dl,
+ .callback = gpu->limits.callbacks ? test_cb : NULL,
+ .priv = &ran_dl,
+ }));
+
+ pl_gpu_finish(gpu);
+ if (gpu->limits.callbacks)
+ REQUIRE(ran_ul && ran_dl);
+
+ if (fmt->emulated && fmt->type == PL_FMT_FLOAT) {
+ // TODO: can't memcmp here because bits might be lost due to the
+ // emulated 16/32 bit upload paths, figure out a better way to
+ // generate data and verify the roundtrip!
+ } else {
+ REQUIRE_MEMEQ(src, dst, bytes);
+ }
+
+ // Report timer results
+ printf("upload time: %"PRIu64", download time: %"PRIu64"\n",
+ pl_timer_query(gpu, ul), pl_timer_query(gpu, dl));
+
+ pl_timer_destroy(gpu, &ul);
+ pl_timer_destroy(gpu, &dl);
+}
+
+static void pl_texture_tests(pl_gpu gpu)
+{
+    const size_t max_size = 16*16*16 * 4 * sizeof(double);
+ uint8_t *test_src = malloc(max_size * 2);
+ uint8_t *test_dst = test_src + max_size;
+
+ for (int f = 0; f < gpu->num_formats; f++) {
+ pl_fmt fmt = gpu->formats[f];
+ if (fmt->opaque || !(fmt->caps & PL_FMT_CAP_HOST_READABLE))
+ continue;
+
+ printf("testing texture roundtrip for format %s\n", fmt->name);
+ assert(fmt->texel_size <= 4 * sizeof(double));
+
+ struct pl_tex_params ref_params = {
+ .format = fmt,
+ .blit_src = (fmt->caps & PL_FMT_CAP_BLITTABLE),
+ .blit_dst = (fmt->caps & PL_FMT_CAP_BLITTABLE),
+ .host_writable = true,
+ .host_readable = true,
+ .debug_tag = PL_DEBUG_TAG,
+ };
+
+ pl_tex tex[2];
+
+ if (gpu->limits.max_tex_1d_dim >= 16) {
+ printf("... 1D\n");
+ struct pl_tex_params params = ref_params;
+ params.w = 16;
+ if (!gpu->limits.blittable_1d_3d)
+ params.blit_src = params.blit_dst = false;
+ for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+ tex[i] = pl_tex_create(gpu, &params);
+ pl_test_roundtrip(gpu, tex, test_src, test_dst);
+ for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+ pl_tex_destroy(gpu, &tex[i]);
+ }
+
+ if (gpu->limits.max_tex_2d_dim >= 16) {
+ printf("... 2D\n");
+ struct pl_tex_params params = ref_params;
+ params.w = params.h = 16;
+ for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+ tex[i] = pl_tex_create(gpu, &params);
+ pl_test_roundtrip(gpu, tex, test_src, test_dst);
+ for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+ pl_tex_destroy(gpu, &tex[i]);
+ }
+
+ if (gpu->limits.max_tex_3d_dim >= 16) {
+ printf("... 3D\n");
+ struct pl_tex_params params = ref_params;
+ params.w = params.h = params.d = 16;
+ if (!gpu->limits.blittable_1d_3d)
+ params.blit_src = params.blit_dst = false;
+ for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+ tex[i] = pl_tex_create(gpu, &params);
+ pl_test_roundtrip(gpu, tex, test_src, test_dst);
+ for (int i = 0; i < PL_ARRAY_SIZE(tex); i++)
+ pl_tex_destroy(gpu, &tex[i]);
+ }
+ }
+
+ free(test_src);
+}
+
+static void pl_planar_tests(pl_gpu gpu)
+{
+ pl_fmt fmt = pl_find_named_fmt(gpu, "g8_b8_r8_420");
+ if (!fmt)
+ return;
+ REQUIRE_CMP(fmt->num_planes, ==, 3, "d");
+
+ const int width = 64, height = 32;
+ pl_tex tex = pl_tex_create(gpu, pl_tex_params(
+ .w = width,
+ .h = height,
+ .format = fmt,
+ .blit_dst = true,
+ .host_readable = true,
+ ));
+ if (!tex)
+ return;
+ for (int i = 0; i < fmt->num_planes; i++)
+ REQUIRE(tex->planes[i]);
+
+ pl_tex plane = tex->planes[1];
+ uint8_t data[(width * height) >> 2];
+ REQUIRE_CMP(plane->params.w * plane->params.h, ==, PL_ARRAY_SIZE(data), "d");
+
+ pl_tex_clear(gpu, plane, (float[]){ (float) 0x80 / 0xFF, 0.0, 0.0, 1.0 });
+ REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params(
+ .tex = plane,
+ .ptr = data,
+ )));
+
+ uint8_t ref[PL_ARRAY_SIZE(data)];
+ memset(ref, 0x80, sizeof(ref));
+ REQUIRE_MEMEQ(data, ref, PL_ARRAY_SIZE(data));
+
+ pl_tex_destroy(gpu, &tex);
+}
+
+static void pl_shader_tests(pl_gpu gpu)
+{
+ if (gpu->glsl.version < 410)
+ return;
+
+ const char *vert_shader =
+ "#version 410 \n"
+ "layout(location=0) in vec2 vertex_pos; \n"
+ "layout(location=1) in vec3 vertex_color; \n"
+ "layout(location=0) out vec3 frag_color; \n"
+ "void main() { \n"
+ " gl_Position = vec4(vertex_pos, 0, 1); \n"
+ " frag_color = vertex_color; \n"
+ "}";
+
+ const char *frag_shader =
+ "#version 410 \n"
+ "layout(location=0) in vec3 frag_color; \n"
+ "layout(location=0) out vec4 out_color; \n"
+ "void main() { \n"
+ " out_color = vec4(frag_color, 1.0); \n"
+ "}";
+
+ pl_fmt fbo_fmt;
+ enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE |
+ PL_FMT_CAP_LINEAR;
+
+ fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 4, 16, 32, caps);
+ if (!fbo_fmt)
+ return;
+
+#define FBO_W 16
+#define FBO_H 16
+
+ pl_tex fbo;
+ fbo = pl_tex_create(gpu, &(struct pl_tex_params) {
+ .format = fbo_fmt,
+ .w = FBO_W,
+ .h = FBO_H,
+ .renderable = true,
+ .storable = !!(fbo_fmt->caps & PL_FMT_CAP_STORABLE),
+ .host_readable = true,
+ .blit_dst = true,
+ });
+ REQUIRE(fbo);
+
+ pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0});
+
+ pl_fmt vert_fmt;
+ vert_fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3);
+ REQUIRE(vert_fmt);
+
+ static const struct vertex { float pos[2]; float color[3]; } vertices[] = {
+ {{-1.0, -1.0}, {0, 0, 0}},
+ {{ 1.0, -1.0}, {1, 0, 0}},
+ {{-1.0, 1.0}, {0, 1, 0}},
+ {{ 1.0, 1.0}, {1, 1, 0}},
+ };
+
+ pl_pass pass;
+ pass = pl_pass_create(gpu, &(struct pl_pass_params) {
+ .type = PL_PASS_RASTER,
+ .target_format = fbo_fmt,
+ .vertex_shader = vert_shader,
+ .glsl_shader = frag_shader,
+
+ .vertex_type = PL_PRIM_TRIANGLE_STRIP,
+ .vertex_stride = sizeof(struct vertex),
+ .num_vertex_attribs = 2,
+ .vertex_attribs = (struct pl_vertex_attrib[]) {{
+ .name = "vertex_pos",
+ .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
+ .location = 0,
+ .offset = offsetof(struct vertex, pos),
+ }, {
+ .name = "vertex_color",
+ .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3),
+ .location = 1,
+ .offset = offsetof(struct vertex, color),
+ }},
+ });
+ REQUIRE(pass);
+ if (pass->params.cached_program || pass->params.cached_program_len) {
+ // Ensure both are set if either one is set
+ REQUIRE(pass->params.cached_program);
+ REQUIRE(pass->params.cached_program_len);
+ }
+
+ pl_timer timer = pl_timer_create(gpu);
+ pl_pass_run(gpu, &(struct pl_pass_run_params) {
+ .pass = pass,
+ .target = fbo,
+ .vertex_count = PL_ARRAY_SIZE(vertices),
+ .vertex_data = vertices,
+ .timer = timer,
+ });
+
+ // Wait until this pass is complete and report the timer result
+ pl_gpu_finish(gpu);
+ printf("timer query result: %"PRIu64"\n", pl_timer_query(gpu, timer));
+ pl_timer_destroy(gpu, &timer);
+
+ static float test_data[FBO_H * FBO_W * 4] = {0};
+
+ // Test against the known pattern of `src`, only useful for roundtrip tests
+#define TEST_FBO_PATTERN(eps, fmt, ...) \
+ do { \
+ printf("testing pattern of " fmt "\n", __VA_ARGS__); \
+ REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) { \
+ .tex = fbo, \
+ .ptr = test_data, \
+ })); \
+ \
+ for (int y = 0; y < FBO_H; y++) { \
+ for (int x = 0; x < FBO_W; x++) { \
+ float *color = &test_data[(y * FBO_W + x) * 4]; \
+ REQUIRE_FEQ(color[0], (x + 0.5) / FBO_W, eps); \
+ REQUIRE_FEQ(color[1], (y + 0.5) / FBO_H, eps); \
+ REQUIRE_FEQ(color[2], 0.0, eps); \
+ REQUIRE_FEQ(color[3], 1.0, eps); \
+ } \
+ } \
+ } while (0)
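+    // The quad's interpolated vertex colors form a linear gradient: red tracks
+    // the normalized x position and green the normalized y position, so each
+    // texel should read ((x + 0.5)/W, (y + 0.5)/H, 0, 1), which is what
+    // TEST_FBO_PATTERN verifies (within `eps`).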
+
+ TEST_FBO_PATTERN(1e-6, "%s", "initial rendering");
+
+ if (sizeof(vertices) <= gpu->limits.max_vbo_size) {
+ // Test the use of an explicit vertex buffer
+ pl_buf vert = pl_buf_create(gpu, &(struct pl_buf_params) {
+ .size = sizeof(vertices),
+ .initial_data = vertices,
+ .drawable = true,
+ });
+
+ REQUIRE(vert);
+ pl_pass_run(gpu, &(struct pl_pass_run_params) {
+ .pass = pass,
+ .target = fbo,
+ .vertex_count = sizeof(vertices) / sizeof(struct vertex),
+ .vertex_buf = vert,
+ .buf_offset = 0,
+ });
+
+ pl_buf_destroy(gpu, &vert);
+ TEST_FBO_PATTERN(1e-6, "%s", "using vertex buffer");
+ }
+
+ // Test the use of index buffers
+ static const uint16_t indices[] = { 3, 2, 1, 0 };
+ pl_pass_run(gpu, &(struct pl_pass_run_params) {
+ .pass = pass,
+ .target = fbo,
+ .vertex_count = PL_ARRAY_SIZE(indices),
+ .vertex_data = vertices,
+ .index_data = indices,
+ });
+
+ pl_pass_destroy(gpu, &pass);
+ TEST_FBO_PATTERN(1e-6, "%s", "using indexed rendering");
+
+ // Test the use of pl_dispatch
+ pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
+ pl_shader sh = pl_dispatch_begin(dp);
+ REQUIRE(pl_shader_custom(sh, &(struct pl_custom_shader) {
+ .body = "color = vec4(col, 1.0);",
+ .input = PL_SHADER_SIG_NONE,
+ .output = PL_SHADER_SIG_COLOR,
+ }));
+
+ REQUIRE(pl_dispatch_vertex(dp, &(struct pl_dispatch_vertex_params) {
+ .shader = &sh,
+ .target = fbo,
+ .vertex_stride = sizeof(struct vertex),
+ .vertex_position_idx = 0,
+ .num_vertex_attribs = 2,
+ .vertex_attribs = (struct pl_vertex_attrib[]) {{
+ .name = "pos",
+ .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2),
+ .offset = offsetof(struct vertex, pos),
+ }, {
+ .name = "col",
+ .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3),
+ .offset = offsetof(struct vertex, color),
+ }},
+
+ .vertex_type = PL_PRIM_TRIANGLE_STRIP,
+ .vertex_coords = PL_COORDS_NORMALIZED,
+ .vertex_count = PL_ARRAY_SIZE(vertices),
+ .vertex_data = vertices,
+ }));
+
+ TEST_FBO_PATTERN(1e-6, "%s", "using custom vertices");
+
+ static float src_data[FBO_H * FBO_W * 4] = {0};
+ memcpy(src_data, test_data, sizeof(src_data));
+
+ pl_tex src;
+ src = pl_tex_create(gpu, &(struct pl_tex_params) {
+ .format = fbo_fmt,
+ .w = FBO_W,
+ .h = FBO_H,
+ .storable = fbo->params.storable,
+ .sampleable = true,
+ .initial_data = src_data,
+ });
+
+ if (fbo->params.storable) {
+ // Test 1x1 blit, to make sure the scaling code runs
+ REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) {
+ .src = src,
+ .dst = fbo,
+ .src_rc = {0, 0, 0, 1, 1, 1},
+ .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1},
+ .sample_mode = PL_TEX_SAMPLE_NEAREST,
+ }));
+
+ // Test non-resizing blit, which uses the efficient imageLoad path
+ REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) {
+ .src = src,
+ .dst = fbo,
+ .src_rc = {0, 0, 0, FBO_W, FBO_H, 1},
+ .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1},
+ .sample_mode = PL_TEX_SAMPLE_NEAREST,
+ }));
+
+ TEST_FBO_PATTERN(1e-6, "%s", "pl_tex_blit_compute");
+ }
+
+ // Test encoding/decoding of all gamma functions, color spaces, etc.
+ for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) {
+ struct pl_color_space test_csp = {
+ .transfer = trc,
+ .hdr.min_luma = PL_COLOR_HDR_BLACK,
+ };
+ sh = pl_dispatch_begin(dp);
+ pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));
+ pl_shader_delinearize(sh, &test_csp);
+ pl_shader_linearize(sh, &test_csp);
+ REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params(
+ .shader = &sh,
+ .target = fbo,
+ )));
+
+ float epsilon = pl_color_transfer_is_hdr(trc) ? 1e-4 : 1e-6;
+ TEST_FBO_PATTERN(epsilon, "transfer function %d", (int) trc);
+ }
+
+ for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) {
+ if (sys == PL_COLOR_SYSTEM_DOLBYVISION)
+ continue; // requires metadata
+ sh = pl_dispatch_begin(dp);
+ pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));
+ pl_shader_encode_color(sh, &(struct pl_color_repr) { .sys = sys });
+ pl_shader_decode_color(sh, &(struct pl_color_repr) { .sys = sys }, NULL);
+ REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+ .shader = &sh,
+ .target = fbo,
+ }));
+
+ float epsilon;
+ switch (sys) {
+ case PL_COLOR_SYSTEM_BT_2020_C:
+ case PL_COLOR_SYSTEM_XYZ:
+ epsilon = 1e-5;
+ break;
+
+ case PL_COLOR_SYSTEM_BT_2100_PQ:
+ case PL_COLOR_SYSTEM_BT_2100_HLG:
+ // These seem to be horrifically noisy and prone to breaking on
+ // edge cases for some reason
+ // TODO: figure out why!
+ continue;
+
+ default: epsilon = 1e-6; break;
+ }
+
+ TEST_FBO_PATTERN(epsilon, "color system %d", (int) sys);
+ }
+
+ // Repeat this a few times to test the caching
+ pl_cache cache = pl_cache_create(pl_cache_params( .log = gpu->log ));
+ pl_gpu_set_cache(gpu, cache);
+ for (int i = 0; i < 10; i++) {
+ if (i == 5) {
+ printf("Recreating pl_dispatch to test the caching\n");
+ size_t size = pl_dispatch_save(dp, NULL);
+ REQUIRE(size);
+ uint8_t *cache_data = malloc(size);
+ REQUIRE(cache_data);
+ REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu");
+
+ pl_dispatch_destroy(&dp);
+ dp = pl_dispatch_create(gpu->log, gpu);
+ pl_dispatch_load(dp, cache_data);
+
+ // Test to make sure the pass regenerates the same cache
+ uint64_t hash = pl_str_hash((pl_str) { cache_data, size });
+ REQUIRE_CMP(pl_dispatch_save(dp, NULL), ==, size, "zu");
+ REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu");
+ REQUIRE_CMP(pl_str_hash((pl_str) { cache_data, size }), ==, hash, PRIu64);
+ free(cache_data);
+ }
+
+ sh = pl_dispatch_begin(dp);
+
+ // For testing, force the use of CS if possible
+ if (gpu->glsl.compute) {
+ sh->type = SH_COMPUTE;
+ sh->group_size[0] = 8;
+ sh->group_size[1] = 8;
+ }
+
+ pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params(
+ .iterations = 0,
+ .grain = 0.0,
+ ));
+
+ REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+ .shader = &sh,
+ .target = fbo,
+ }));
+ TEST_FBO_PATTERN(1e-6, "deband iter %d", i);
+ }
+
+ pl_gpu_set_cache(gpu, NULL);
+ pl_cache_destroy(&cache);
+
+ // Test peak detection and readback if possible
+ sh = pl_dispatch_begin(dp);
+ pl_shader_sample_nearest(sh, pl_sample_src( .tex = src ));
+
+ pl_shader_obj peak_state = NULL;
+ struct pl_color_space csp_gamma22 = { .transfer = PL_COLOR_TRC_GAMMA22 };
+ struct pl_peak_detect_params peak_params = { .minimum_peak = 0.01 };
+ if (pl_shader_detect_peak(sh, csp_gamma22, &peak_state, &peak_params)) {
+ REQUIRE(pl_dispatch_compute(dp, &(struct pl_dispatch_compute_params) {
+ .shader = &sh,
+ .width = fbo->params.w,
+ .height = fbo->params.h,
+ }));
+
+ float peak, avg;
+ REQUIRE(pl_get_detected_peak(peak_state, &peak, &avg));
+
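+        // Compute a CPU reference: BT.709 luminance of the gamma-2.2 decoded
+        // framebuffer, averaged in PQ space (presumably matching the shader's
+        // accumulation domain), then mapped back to normalized luminance.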
+ float real_peak = 0, real_avg = 0;
+ for (int y = 0; y < FBO_H; y++) {
+ for (int x = 0; x < FBO_W; x++) {
+ float *color = &src_data[(y * FBO_W + x) * 4];
+ float luma = 0.212639f * powf(color[0], 2.2f) +
+ 0.715169f * powf(color[1], 2.2f) +
+ 0.072192f * powf(color[2], 2.2f);
+ luma = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, luma);
+ real_peak = PL_MAX(real_peak, luma);
+ real_avg += luma;
+ }
+ }
+ real_avg = real_avg / (FBO_W * FBO_H);
+
+ real_avg = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_avg);
+ real_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_peak);
+ REQUIRE_FEQ(peak, real_peak, 1e-3);
+ REQUIRE_FEQ(avg, real_avg, 1e-2);
+ }
+
+ pl_dispatch_abort(dp, &sh);
+ pl_shader_obj_destroy(&peak_state);
+
+ // Test film grain synthesis
+ pl_shader_obj grain = NULL;
+ struct pl_film_grain_params grain_params = {
+ .tex = src,
+ .components = 3,
+ .component_mapping = { 0, 1, 2},
+ .repr = &(struct pl_color_repr) {
+ .sys = PL_COLOR_SYSTEM_BT_709,
+ .levels = PL_COLOR_LEVELS_LIMITED,
+ .bits = { .color_depth = 10, .sample_depth = 10 },
+ },
+ };
+
+ for (int i = 0; i < 2; i++) {
+ grain_params.data.type = PL_FILM_GRAIN_AV1;
+ grain_params.data.params.av1 = av1_grain_data;
+ grain_params.data.params.av1.overlap = !!i;
+ grain_params.data.seed = rand();
+
+ sh = pl_dispatch_begin(dp);
+ pl_shader_film_grain(sh, &grain, &grain_params);
+ REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+ .shader = &sh,
+ .target = fbo,
+ }));
+ }
+
+ if (gpu->glsl.compute) {
+ grain_params.data.type = PL_FILM_GRAIN_H274;
+ grain_params.data.params.h274 = h274_grain_data;
+ grain_params.data.seed = rand();
+
+ sh = pl_dispatch_begin(dp);
+ pl_shader_film_grain(sh, &grain, &grain_params);
+ REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+ .shader = &sh,
+ .target = fbo,
+ }));
+ }
+ pl_shader_obj_destroy(&grain);
+
+ // Test custom shaders
+ struct pl_custom_shader custom = {
+ .header =
+ "vec3 invert(vec3 color) \n"
+ "{ \n"
+ " return vec3(1.0) - color; \n"
+ "} \n",
+
+ .body =
+ "color = vec4(gl_FragCoord.xy, 0.0, 1.0); \n"
+ "color.rgb = invert(color.rgb) + offset; \n",
+
+ .input = PL_SHADER_SIG_NONE,
+ .output = PL_SHADER_SIG_COLOR,
+
+ .num_variables = 1,
+ .variables = &(struct pl_shader_var) {
+ .var = pl_var_float("offset"),
+ .data = &(float) { 0.1 },
+ },
+ };
+
+ sh = pl_dispatch_begin(dp);
+ REQUIRE(pl_shader_custom(sh, &custom));
+ REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+ .shader = &sh,
+ .target = fbo,
+ }));
+
+ // Test dolbyvision
+ struct pl_color_repr repr = {
+ .sys = PL_COLOR_SYSTEM_DOLBYVISION,
+ .dovi = &dovi_meta,
+ };
+
+ sh = pl_dispatch_begin(dp);
+ pl_shader_sample_direct(sh, pl_sample_src( .tex = src ));
+ pl_shader_decode_color(sh, &repr, NULL);
+ REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+ .shader = &sh,
+ .target = fbo,
+ }));
+
+ // Test deinterlacing
+ sh = pl_dispatch_begin(dp);
+ pl_shader_deinterlace(sh, pl_deinterlace_source( .cur = pl_field_pair(src) ), NULL);
+ REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params(
+ .shader = &sh,
+ .target = fbo,
+ )));
+
+ // Test error diffusion
+ if (fbo->params.storable) {
+ for (int i = 0; i < pl_num_error_diffusion_kernels; i++) {
+ const struct pl_error_diffusion_kernel *k = pl_error_diffusion_kernels[i];
+ printf("testing error diffusion kernel '%s'\n", k->name);
+ sh = pl_dispatch_begin(dp);
+ bool ok = pl_shader_error_diffusion(sh, pl_error_diffusion_params(
+ .input_tex = src,
+ .output_tex = fbo,
+ .new_depth = 8,
+ .kernel = k,
+ ));
+
+ if (!ok) {
+ fprintf(stderr, "kernel '%s' exceeds GPU limits, skipping...\n", k->name);
+ continue;
+ }
+
+ REQUIRE(pl_dispatch_compute(dp, pl_dispatch_compute_params(
+ .shader = &sh,
+ .dispatch_size = {1, 1, 1},
+ )));
+ }
+ }
+
+ pl_dispatch_destroy(&dp);
+ pl_tex_destroy(gpu, &src);
+ pl_tex_destroy(gpu, &fbo);
+}
+
+static void pl_scaler_tests(pl_gpu gpu)
+{
+ pl_fmt src_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_LINEAR);
+ pl_fmt fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_RENDERABLE);
+ if (!src_fmt || !fbo_fmt)
+ return;
+
+ float *fbo_data = NULL;
+ pl_shader_obj lut = NULL;
+
+ static float data_5x5[5][5] = {
+ { 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0 },
+ { 0, 0, 1, 0, 0 },
+ { 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0 },
+ };
+
+ pl_tex dot5x5 = pl_tex_create(gpu, &(struct pl_tex_params) {
+ .w = 5,
+ .h = 5,
+ .format = src_fmt,
+ .sampleable = true,
+ .initial_data = &data_5x5[0][0],
+ });
+
+ struct pl_tex_params fbo_params = {
+ .w = 100,
+ .h = 100,
+ .format = fbo_fmt,
+ .renderable = true,
+ .storable = fbo_fmt->caps & PL_FMT_CAP_STORABLE,
+ .host_readable = fbo_fmt->caps & PL_FMT_CAP_HOST_READABLE,
+ };
+
+ pl_tex fbo = pl_tex_create(gpu, &fbo_params);
+ pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
+ if (!dot5x5 || !fbo || !dp)
+ goto error;
+
+ pl_shader sh = pl_dispatch_begin(dp);
+ REQUIRE(pl_shader_sample_polar(sh,
+ pl_sample_src(
+ .tex = dot5x5,
+ .new_w = fbo->params.w,
+ .new_h = fbo->params.h,
+ ),
+ pl_sample_filter_params(
+ .filter = pl_filter_ewa_lanczos,
+ .lut = &lut,
+ .no_compute = !fbo->params.storable,
+ )
+ ));
+ REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
+ .shader = &sh,
+ .target = fbo,
+ }));
+
+ if (fbo->params.host_readable) {
+ fbo_data = malloc(fbo->params.w * fbo->params.h * sizeof(float));
+ REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) {
+ .tex = fbo,
+ .ptr = fbo_data,
+ }));
+
+#ifdef PRINT_OUTPUT
+ int max = 255;
+ printf("P2\n%d %d\n%d\n", fbo->params.w, fbo->params.h, max);
+ for (int y = 0; y < fbo->params.h; y++) {
+ for (int x = 0; x < fbo->params.w; x++) {
+                float v = fbo_data[y * fbo->params.w + x];
+ printf("%d ", (int) round(fmin(fmax(v, 0.0), 1.0) * max));
+ }
+ printf("\n");
+ }
+#endif
+ }
+
+error:
+ free(fbo_data);
+ pl_shader_obj_destroy(&lut);
+ pl_dispatch_destroy(&dp);
+ pl_tex_destroy(gpu, &dot5x5);
+ pl_tex_destroy(gpu, &fbo);
+}
+
+static const char *user_shader_tests[] = {
+ // Test hooking, saving and loading
+ "// Example of a comment at the beginning \n"
+ " \n"
+ "//!HOOK NATIVE \n"
+ "//!DESC upscale image \n"
+ "//!BIND HOOKED \n"
+ "//!WIDTH HOOKED.w 10 * \n"
+ "//!HEIGHT HOOKED.h 10 * \n"
+ "//!SAVE NATIVEBIG \n"
+ "//!WHEN NATIVE.w 500 < \n"
+ " \n"
+ "vec4 hook() \n"
+ "{ \n"
+ " return HOOKED_texOff(0); \n"
+ "} \n"
+ " \n"
+ "//!HOOK MAIN \n"
+ "//!DESC downscale bigger image \n"
+ "//!WHEN NATIVE.w 500 < \n"
+ "//!BIND NATIVEBIG \n"
+ " \n"
+ "vec4 hook() \n"
+ "{ \n"
+ " return NATIVEBIG_texOff(0); \n"
+ "} \n",
+
+ // Test use of textures
+ "//!HOOK MAIN \n"
+ "//!DESC turn everything into colorful pixels \n"
+ "//!BIND HOOKED \n"
+ "//!BIND DISCO \n"
+ "//!COMPONENTS 3 \n"
+ " \n"
+ "vec4 hook() \n"
+ "{ \n"
+ " return vec4(DISCO_tex(HOOKED_pos * 10.0).rgb, 1); \n"
+ "} \n"
+ " \n"
+ "//!TEXTURE DISCO \n"
+ "//!SIZE 3 3 \n"
+ "//!FORMAT rgba8 \n"
+ "//!FILTER NEAREST \n"
+ "//!BORDER REPEAT \n"
+ "ff0000ff00ff00ff0000ffff00ffffffff00ffffffff00ff4c4c4cff999999ffffffffff\n"
+
+ // Test custom parameters
+ "//!PARAM test \n"
+ "//!DESC test parameter \n"
+ "//!TYPE DYNAMIC float \n"
+ "//!MINIMUM 0.0 \n"
+ "//!MAXIMUM 100.0 \n"
+ "1.0 \n"
+ " \n"
+ "//!PARAM testconst \n"
+ "//!TYPE CONSTANT uint \n"
+ "//!MAXIMUM 16 \n"
+ "3 \n"
+ " \n"
+ "//!PARAM testdefine \n"
+ "//!TYPE DEFINE \n"
+ "100 \n"
+ " \n"
+ "//!PARAM testenum \n"
+ "//!TYPE ENUM DEFINE \n"
+ "FOO \n"
+ "BAR \n"
+ " \n"
+ "//!HOOK MAIN \n"
+ "//!WHEN testconst 30 > \n"
+ "#error should not be run \n"
+ " \n"
+ "//!HOOK MAIN \n"
+ "//!WHEN testenum FOO = \n"
+ "#if testenum == BAR \n"
+ " #error bad \n"
+ "#endif \n"
+ "vec4 hook() { return vec4(0.0); } \n"
+};
+
+static const char *compute_shader_tests[] = {
+ // Test use of storage/buffer resources
+ "//!HOOK MAIN \n"
+ "//!DESC attach some storage objects \n"
+ "//!BIND tex_storage \n"
+ "//!BIND buf_uniform \n"
+ "//!BIND buf_storage \n"
+ "//!COMPONENTS 4 \n"
+ " \n"
+ "vec4 hook() \n"
+ "{ \n"
+ " return vec4(foo, bar, bat); \n"
+ "} \n"
+ " \n"
+ "//!TEXTURE tex_storage \n"
+ "//!SIZE 100 100 \n"
+ "//!FORMAT r32f \n"
+ "//!STORAGE \n"
+ " \n"
+ "//!BUFFER buf_uniform \n"
+ "//!VAR float foo \n"
+ "//!VAR float bar \n"
+ "0000000000000000 \n"
+ " \n"
+ "//!BUFFER buf_storage \n"
+ "//!VAR vec2 bat \n"
+ "//!VAR int big[32]; \n"
+ "//!STORAGE \n",
+
+};
+
+static const char *test_luts[] = {
+
+ "TITLE \"1D identity\" \n"
+ "LUT_1D_SIZE 2 \n"
+ "0.0 0.0 0.0 \n"
+ "1.0 1.0 1.0 \n",
+
+ "TITLE \"3D identity\" \n"
+ "LUT_3D_SIZE 2 \n"
+ "0.0 0.0 0.0 \n"
+ "1.0 0.0 0.0 \n"
+ "0.0 1.0 0.0 \n"
+ "1.0 1.0 0.0 \n"
+ "0.0 0.0 1.0 \n"
+ "1.0 0.0 1.0 \n"
+ "0.0 1.0 1.0 \n"
+ "1.0 1.0 1.0 \n"
+
+};
+
+static bool frame_passthrough(pl_gpu gpu, pl_tex *tex,
+ const struct pl_source_frame *src, struct pl_frame *out_frame)
+{
+ const struct pl_frame *frame = src->frame_data;
+ *out_frame = *frame;
+ return true;
+}
+
+static enum pl_queue_status get_frame_ptr(struct pl_source_frame *out_frame,
+ const struct pl_queue_params *qparams)
+{
+ const struct pl_source_frame **pframe = qparams->priv;
+ if (!(*pframe)->frame_data)
+ return PL_QUEUE_EOF;
+
+ *out_frame = *(*pframe)++;
+ return PL_QUEUE_OK;
+}
+
+static void render_info_cb(void *priv, const struct pl_render_info *info)
+{
+ printf("{%d} Executed shader: %s\n", info->index,
+ info->pass->shader->description);
+}
+
+static void pl_render_tests(pl_gpu gpu)
+{
+ pl_tex img_tex = NULL, fbo = NULL;
+ pl_renderer rr = NULL;
+
+ enum { width = 50, height = 50 };
+ static float data[width][height];
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++)
+ data[y][x] = RANDOM;
+ }
+
+ struct pl_plane img_plane = {0};
+ struct pl_plane_data plane_data = {
+ .type = PL_FMT_FLOAT,
+ .width = width,
+ .height = height,
+ .component_size = { 8 * sizeof(float) },
+ .component_map = { 0 },
+ .pixel_stride = sizeof(float),
+ .pixels = data,
+ };
+
+ if (!pl_recreate_plane(gpu, NULL, &fbo, &plane_data))
+ return;
+
+ if (!pl_upload_plane(gpu, &img_plane, &img_tex, &plane_data))
+ goto error;
+
+ rr = pl_renderer_create(gpu->log, gpu);
+ pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0});
+
+ struct pl_frame image = {
+ .num_planes = 1,
+ .planes = { img_plane },
+ .repr = {
+ .sys = PL_COLOR_SYSTEM_BT_709,
+ .levels = PL_COLOR_LEVELS_FULL,
+ },
+ .color = pl_color_space_srgb,
+ };
+
+ struct pl_frame target = {
+ .num_planes = 1,
+ .planes = {{
+ .texture = fbo,
+ .components = 3,
+ .component_mapping = {0, 1, 2},
+ }},
+ .repr = {
+ .sys = PL_COLOR_SYSTEM_RGB,
+ .levels = PL_COLOR_LEVELS_FULL,
+ .bits.color_depth = 32,
+ },
+ .color = pl_color_space_srgb,
+ };
+
+ REQUIRE(pl_render_image(rr, &image, &target, NULL));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+ // TODO: embed a reference texture and ensure it matches
+
+ // Test a bunch of different params
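+    // The TEST() macro sweeps a single integer field of one params struct
+    // from 0 to LIMIT (with force_dither enabled), re-rendering each time and
+    // requiring a clean renderer error state afterwards.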
+#define TEST(SNAME, STYPE, DEFAULT, FIELD, LIMIT) \
+ do { \
+ for (int i = 0; i <= LIMIT; i++) { \
+ printf("testing `" #STYPE "." #FIELD " = %d`\n", i); \
+ struct pl_render_params params = pl_render_default_params; \
+ params.force_dither = true; \
+ struct STYPE tmp = DEFAULT; \
+ tmp.FIELD = i; \
+ params.SNAME = &tmp; \
+ REQUIRE(pl_render_image(rr, &image, &target, &params)); \
+ pl_gpu_flush(gpu); \
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); \
+ } \
+ } while (0)
+
+#define TEST_PARAMS(NAME, FIELD, LIMIT) \
+ TEST(NAME##_params, pl_##NAME##_params, pl_##NAME##_default_params, FIELD, LIMIT)
+
+ image.crop.x1 = width / 2.0;
+ image.crop.y1 = height / 2.0;
+ for (int i = 0; i < pl_num_scale_filters; i++) {
+ struct pl_render_params params = pl_render_default_params;
+ params.upscaler = pl_scale_filters[i].filter;
+ printf("testing `params.upscaler = /* %s */`\n", pl_scale_filters[i].name);
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ pl_gpu_flush(gpu);
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ }
+ image.crop.x1 = image.crop.y1 = 0;
+
+ target.crop.x1 = width / 2.0;
+ target.crop.y1 = height / 2.0;
+ for (int i = 0; i < pl_num_scale_filters; i++) {
+ struct pl_render_params params = pl_render_default_params;
+ params.downscaler = pl_scale_filters[i].filter;
+ printf("testing `params.downscaler = /* %s */`\n", pl_scale_filters[i].name);
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ pl_gpu_flush(gpu);
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ }
+ target.crop.x1 = target.crop.y1 = 0;
+
+ TEST_PARAMS(deband, iterations, 3);
+ TEST_PARAMS(sigmoid, center, 1);
+ TEST_PARAMS(color_map, intent, PL_INTENT_ABSOLUTE_COLORIMETRIC);
+ TEST_PARAMS(dither, method, PL_DITHER_WHITE_NOISE);
+ TEST_PARAMS(dither, temporal, true);
+ TEST_PARAMS(distort, alpha_mode, PL_ALPHA_INDEPENDENT);
+ TEST_PARAMS(distort, constrain, true);
+ TEST_PARAMS(distort, bicubic, true);
+ TEST(cone_params, pl_cone_params, pl_vision_deuteranomaly, strength, 0);
+
+ // Test gamma-correct dithering
+ target.repr.bits.color_depth = 2;
+ TEST_PARAMS(dither, transfer, PL_COLOR_TRC_GAMMA22);
+ target.repr.bits.color_depth = 32;
+
+ // Test HDR tone mapping
+ image.color = pl_color_space_hdr10;
+ TEST_PARAMS(color_map, visualize_lut, true);
+ if (gpu->limits.max_ssbo_size)
+ TEST_PARAMS(peak_detect, allow_delayed, true);
+
+ // Test inverse tone-mapping and pure BPC
+ image.color.hdr.max_luma = 1000;
+ target.color.hdr.max_luma = 4000;
+ target.color.hdr.min_luma = 0.02;
+ TEST_PARAMS(color_map, inverse_tone_mapping, true);
+
+ image.color = pl_color_space_srgb;
+ target.color = pl_color_space_srgb;
+
+ // Test some misc stuff
+ struct pl_render_params params = pl_render_default_params;
+ params.color_adjustment = &(struct pl_color_adjustment) {
+ .brightness = 0.1,
+ .contrast = 0.9,
+ .saturation = 1.5,
+ .gamma = 0.8,
+ .temperature = 0.3,
+ };
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ params = pl_render_default_params;
+
+ struct pl_frame inferred_image = image, inferred_target = target;
+ pl_frames_infer(rr, &inferred_image, &inferred_target);
+ REQUIRE(pl_render_image(rr, &inferred_image, &inferred_target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+ // Test background blending and alpha transparency
+ params.blend_against_tiles = true;
+ params.corner_rounding = 0.25f;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ params = pl_render_default_params;
+
+ // Test film grain synthesis
+ image.film_grain.type = PL_FILM_GRAIN_AV1;
+ image.film_grain.params.av1 = av1_grain_data;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+ image.film_grain.type = PL_FILM_GRAIN_H274;
+ image.film_grain.params.h274 = h274_grain_data;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ // H.274 film grain synthesis requires compute shaders
+ if (gpu->glsl.compute) {
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ } else {
+ const struct pl_render_errors rr_err = pl_renderer_get_errors(rr);
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_FILM_GRAIN);
+ pl_renderer_reset_errors(rr, &rr_err);
+ }
+ image.film_grain = (struct pl_film_grain_data) {0};
+
+ // Test mpv-style custom shaders
+ for (int i = 0; i < PL_ARRAY_SIZE(user_shader_tests); i++) {
+ printf("testing user shader:\n\n%s\n", user_shader_tests[i]);
+ const struct pl_hook *hook;
+ hook = pl_mpv_user_shader_parse(gpu, user_shader_tests[i],
+ strlen(user_shader_tests[i]));
+ REQUIRE(hook);
+
+ params.hooks = &hook;
+ params.num_hooks = 1;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+ pl_mpv_user_shader_destroy(&hook);
+ }
+
+ if (gpu->glsl.compute && gpu->limits.max_ssbo_size) {
+ for (int i = 0; i < PL_ARRAY_SIZE(compute_shader_tests); i++) {
+ printf("testing user shader:\n\n%s\n", compute_shader_tests[i]);
+ const struct pl_hook *hook;
+ hook = pl_mpv_user_shader_parse(gpu, compute_shader_tests[i],
+ strlen(compute_shader_tests[i]));
+ REQUIRE(hook);
+
+ params.hooks = &hook;
+ params.num_hooks = 1;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+ pl_mpv_user_shader_destroy(&hook);
+ }
+ }
+ params = pl_render_default_params;
+
+ // Test custom LUTs
+ for (int i = 0; i < PL_ARRAY_SIZE(test_luts); i++) {
+ printf("testing custom lut %d\n", i);
+ struct pl_custom_lut *lut;
+ lut = pl_lut_parse_cube(gpu->log, test_luts[i], strlen(test_luts[i]));
+ REQUIRE(lut);
+
+ bool has_3dlut = gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100;
+ if (lut->size[2] && !has_3dlut) {
+ pl_lut_free(&lut);
+ continue;
+ }
+
+ // Test all three at the same time to reduce the number of tests
+ image.lut = target.lut = params.lut = lut;
+
+ for (enum pl_lut_type t = PL_LUT_UNKNOWN; t <= PL_LUT_CONVERSION; t++) {
+ printf("testing LUT method %d\n", t);
+ image.lut_type = target.lut_type = params.lut_type = t;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ }
+
+ image.lut = target.lut = params.lut = NULL;
+ pl_lut_free(&lut);
+ }
+
+#ifdef PL_HAVE_LCMS
+
+ // It doesn't fit without use of 3D textures on GLES2
+ if (gpu->glsl.version > 100) {
+ // Test ICC profiles
+ image.profile = TEST_PROFILE(sRGB_v2_nano_icc);
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ image.profile = (struct pl_icc_profile) {0};
+
+ target.profile = TEST_PROFILE(sRGB_v2_nano_icc);
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ target.profile = (struct pl_icc_profile) {0};
+
+ image.profile = TEST_PROFILE(sRGB_v2_nano_icc);
+ target.profile = image.profile;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ image.profile = (struct pl_icc_profile) {0};
+ target.profile = (struct pl_icc_profile) {0};
+ }
+
+#endif
+
+ // Test overlays
+ image.num_overlays = 1;
+ image.overlays = &(struct pl_overlay) {
+ .tex = img_plane.texture,
+ .mode = PL_OVERLAY_NORMAL,
+ .num_parts = 2,
+ .parts = (struct pl_overlay_part[]) {{
+ .src = {0, 0, 2, 2},
+ .dst = {30, 100, 40, 200},
+ }, {
+ .src = {2, 2, 5, 5},
+ .dst = {1000, -1, 3, 5},
+ }},
+ };
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ params.disable_fbos = true;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ image.num_overlays = 0;
+ params = pl_render_default_params;
+
+ target.num_overlays = 1;
+ target.overlays = &(struct pl_overlay) {
+ .tex = img_plane.texture,
+ .mode = PL_OVERLAY_MONOCHROME,
+ .num_parts = 1,
+ .parts = &(struct pl_overlay_part) {
+ .src = {5, 5, 15, 15},
+ .dst = {5, 5, 15, 15},
+ .color = {1.0, 0.5, 0.0},
+ },
+ };
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ REQUIRE(pl_render_image(rr, NULL, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ target.num_overlays = 0;
+
+ // Test rotation
+ for (pl_rotation rot = 0; rot < PL_ROTATION_360; rot += PL_ROTATION_90) {
+ image.rotation = rot;
+ REQUIRE(pl_render_image(rr, &image, &target, &params));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+ }
+
+ // Attempt frame mixing, using the mixer queue helper
+    printf("testing frame mixing\n");
+ struct pl_render_params mix_params = {
+ .frame_mixer = &pl_filter_mitchell_clamp,
+ .info_callback = render_info_cb,
+ };
+
+ struct pl_queue_params qparams = {
+ .radius = pl_frame_mix_radius(&mix_params),
+ .vsync_duration = 1.0 / 60.0,
+ };
+
+ // Test large PTS jumps in frame mix
+ struct pl_frame_mix mix = (struct pl_frame_mix) {
+ .num_frames = 2,
+ .frames = (const struct pl_frame *[]) { &image, &image },
+ .signatures = (uint64_t[]) { 0xFFF1, 0xFFF2 },
+ .timestamps = (float[]) { -100, 100 },
+ .vsync_duration = 1.6,
+ };
+ REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+ // Test inferring frame mix
+ inferred_target = target;
+ pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image);
+ REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+ // Test empty frame mix
+ mix = (struct pl_frame_mix) {0};
+ REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+ // Test inferring empty frame mix
+ inferred_target = target;
+ pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image);
+ REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+ // Test mixer queue
+#define NUM_MIX_FRAMES 20
+ const float frame_duration = 1.0 / 24.0;
+ struct pl_source_frame srcframes[NUM_MIX_FRAMES+1];
+ srcframes[NUM_MIX_FRAMES] = (struct pl_source_frame) {0};
+ for (int i = 0; i < NUM_MIX_FRAMES; i++) {
+ srcframes[i] = (struct pl_source_frame) {
+ .pts = i * frame_duration,
+ .duration = frame_duration,
+ .map = frame_passthrough,
+ .frame_data = &image,
+ };
+ }
+
+ pl_queue queue = pl_queue_create(gpu);
+ enum pl_queue_status ret;
+
+ // Test pre-pushing all frames, with delayed EOF.
+ for (int i = 0; i < NUM_MIX_FRAMES; i++) {
+ const struct pl_source_frame *src = &srcframes[i];
+ if (i > 10) // test pushing in reverse order
+ src = &srcframes[NUM_MIX_FRAMES + 10 - i];
+ if (!pl_queue_push_block(queue, 1, src)) // mini-sleep
+ pl_queue_push(queue, src); // push it anyway, for testing
+ }
+
+ while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
+ if (ret == PL_QUEUE_MORE) {
+ REQUIRE_CMP(qparams.pts, >, 0.0f, "f");
+ pl_queue_push(queue, NULL); // push delayed EOF
+ continue;
+ }
+
+ REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
+ REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+
+ // Simulate advancing vsync
+ qparams.pts += qparams.vsync_duration;
+ }
+
+ // Test dynamically pulling all frames, with oversample mixer
+ const struct pl_source_frame *frame_ptr = &srcframes[0];
+ mix_params.frame_mixer = &pl_oversample_frame_mixer;
+
+ qparams = (struct pl_queue_params) {
+ .radius = pl_frame_mix_radius(&mix_params),
+ .vsync_duration = qparams.vsync_duration,
+ .get_frame = get_frame_ptr,
+ .priv = &frame_ptr,
+ };
+
+ pl_queue_reset(queue);
+ while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
+ REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
+ REQUIRE_CMP(mix.num_frames, <=, 2, "d");
+ REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+ qparams.pts += qparams.vsync_duration;
+ }
+
+ // Test large PTS jump
+ pl_queue_reset(queue);
+ REQUIRE(pl_queue_update(queue, &mix, &qparams) == PL_QUEUE_EOF);
+
+ // Test deinterlacing
+ pl_queue_reset(queue);
+    printf("testing deinterlacing\n");
+ for (int i = 0; i < NUM_MIX_FRAMES; i++) {
+ struct pl_source_frame *src = &srcframes[i];
+ if (i > 10)
+ src = &srcframes[NUM_MIX_FRAMES + 10 - i];
+ src->first_field = PL_FIELD_EVEN;
+ pl_queue_push(queue, src);
+ }
+ pl_queue_push(queue, NULL);
+
+ qparams.pts = 0;
+ qparams.get_frame = NULL;
+ while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) {
+ REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u");
+ REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params));
+ qparams.pts += qparams.vsync_duration;
+ }
+
+ pl_queue_destroy(&queue);
+
+error:
+ pl_renderer_destroy(&rr);
+ pl_tex_destroy(gpu, &img_tex);
+ pl_tex_destroy(gpu, &fbo);
+}
+
+static struct pl_hook_res noop_hook(void *priv, const struct pl_hook_params *params)
+{
+ return (struct pl_hook_res) {0};
+}
+
+static void pl_ycbcr_tests(pl_gpu gpu)
+{
+ struct pl_plane_data data[3];
+ for (int i = 0; i < 3; i++) {
+ const int sub = i > 0 ? 1 : 0;
+ const int width = (323 + sub) >> sub;
+ const int height = (255 + sub) >> sub;
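+        // Planes 1 and 2 are 4:2:0 chroma, i.e. half resolution in both
+        // dimensions, with the odd luma sizes rounded up via the `+ sub` term.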
+
+ data[i] = (struct pl_plane_data) {
+ .type = PL_FMT_UNORM,
+ .width = width,
+ .height = height,
+ .component_size = {16},
+ .component_map = {i},
+ .pixel_stride = sizeof(uint16_t),
+ .row_stride = PL_ALIGN2(width * sizeof(uint16_t),
+ gpu->limits.align_tex_xfer_pitch),
+ };
+ }
+
+ pl_fmt fmt = pl_plane_find_fmt(gpu, NULL, &data[0]);
+ enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_HOST_READABLE;
+ if (!fmt || (fmt->caps & caps) != caps)
+ return;
+
+ pl_renderer rr = pl_renderer_create(gpu->log, gpu);
+ if (!rr)
+ return;
+
+ pl_tex src_tex[3] = {0};
+ pl_tex dst_tex[3] = {0};
+ struct pl_frame img = {
+ .num_planes = 3,
+ .repr = pl_color_repr_hdtv,
+ .color = pl_color_space_bt709,
+ };
+
+ struct pl_frame target = {
+ .num_planes = 3,
+ .repr = pl_color_repr_hdtv,
+ .color = pl_color_space_bt709,
+ };
+
+ uint8_t *src_buffer[3] = {0};
+ uint8_t *dst_buffer = NULL;
+ for (int i = 0; i < 3; i++) {
+ // Generate some arbitrary data for the buffer
+ src_buffer[i] = malloc(data[i].height * data[i].row_stride);
+ if (!src_buffer[i])
+ goto error;
+
+ data[i].pixels = src_buffer[i];
+ for (int y = 0; y < data[i].height; y++) {
+ for (int x = 0; x < data[i].width; x++) {
+ size_t off = y * data[i].row_stride + x * data[i].pixel_stride;
+ uint16_t *pixel = (uint16_t *) &src_buffer[i][off];
+ int gx = 200 + 100 * i, gy = 300 + 150 * i;
+ *pixel = (gx * x) ^ (gy * y); // whatever
+ }
+ }
+
+ REQUIRE(pl_upload_plane(gpu, &img.planes[i], &src_tex[i], &data[i]));
+ }
+
+ // This co-sites chroma pixels with pixels in the RGB image, meaning we
+ // get an exact round-trip when sampling both ways. This makes it useful
+ // as a test case, even though it's not common in the real world.
+ pl_frame_set_chroma_location(&img, PL_CHROMA_TOP_LEFT);
+
+ for (int i = 0; i < 3; i++) {
+ dst_tex[i] = pl_tex_create(gpu, &(struct pl_tex_params) {
+ .format = fmt,
+ .w = data[i].width,
+ .h = data[i].height,
+ .renderable = true,
+ .host_readable = true,
+ .storable = fmt->caps & PL_FMT_CAP_STORABLE,
+ .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE,
+ });
+
+ if (!dst_tex[i])
+ goto error;
+
+ target.planes[i] = img.planes[i];
+ target.planes[i].texture = dst_tex[i];
+ }
+
+ REQUIRE(pl_render_image(rr, &img, &target, &(struct pl_render_params) {
+ .num_hooks = 1,
+ .hooks = &(const struct pl_hook *){&(struct pl_hook) {
+ // Forces chroma merging, to test the chroma merging code
+ .stages = PL_HOOK_CHROMA_INPUT,
+ .hook = noop_hook,
+ }},
+ }));
+ REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE);
+
+ size_t buf_size = data[0].height * data[0].row_stride;
+ dst_buffer = malloc(buf_size);
+ if (!dst_buffer)
+ goto error;
+
+ for (int i = 0; i < 3; i++) {
+ memset(dst_buffer, 0xAA, buf_size);
+ REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) {
+ .tex = dst_tex[i],
+ .ptr = dst_buffer,
+ .row_pitch = data[i].row_stride,
+ }));
+
+ for (int y = 0; y < data[i].height; y++) {
+ for (int x = 0; x < data[i].width; x++) {
+ size_t off = y * data[i].row_stride + x * data[i].pixel_stride;
+ uint16_t *src_pixel = (uint16_t *) &src_buffer[i][off];
+ uint16_t *dst_pixel = (uint16_t *) &dst_buffer[off];
+ int diff = abs((int) *src_pixel - (int) *dst_pixel);
+ REQUIRE_CMP(diff, <=, 50, "d"); // a little under 0.1%
+ }
+ }
+ }
+
+error:
+ pl_renderer_destroy(&rr);
+ free(dst_buffer);
+ for (int i = 0; i < 3; i++) {
+ free(src_buffer[i]);
+ pl_tex_destroy(gpu, &src_tex[i]);
+ pl_tex_destroy(gpu, &dst_tex[i]);
+ }
+}
+
+static void pl_test_export_import(pl_gpu gpu,
+ enum pl_handle_type handle_type)
+{
+ // Test texture roundtrip
+
+ if (!(gpu->export_caps.tex & handle_type) ||
+ !(gpu->import_caps.tex & handle_type))
+ goto skip_tex;
+
+ pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 0, 0, PL_FMT_CAP_BLITTABLE);
+ if (!fmt)
+ goto skip_tex;
+
+ printf("testing texture import/export with fmt %s\n", fmt->name);
+
+ pl_tex export = pl_tex_create(gpu, &(struct pl_tex_params) {
+ .w = 32,
+ .h = 32,
+ .format = fmt,
+ .export_handle = handle_type,
+ });
+ REQUIRE(export);
+ REQUIRE_HANDLE(export->shared_mem, handle_type);
+
+ pl_tex import = pl_tex_create(gpu, &(struct pl_tex_params) {
+ .w = export->params.w,
+ .h = export->params.h,
+ .format = fmt,
+ .import_handle = handle_type,
+ .shared_mem = export->shared_mem,
+ });
+ REQUIRE(import);
+
+ pl_tex_destroy(gpu, &import);
+ pl_tex_destroy(gpu, &export);
+
+skip_tex: ;
+
+ // Test buffer roundtrip
+
+ if (!(gpu->export_caps.buf & handle_type) ||
+ !(gpu->import_caps.buf & handle_type))
+ return;
+
+ printf("testing buffer import/export\n");
+
+ pl_buf exp_buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+ .size = 32,
+ .export_handle = handle_type,
+ });
+ REQUIRE(exp_buf);
+ REQUIRE_HANDLE(exp_buf->shared_mem, handle_type);
+
+ pl_buf imp_buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+ .size = 32,
+ .import_handle = handle_type,
+ .shared_mem = exp_buf->shared_mem,
+ });
+ REQUIRE(imp_buf);
+
+ pl_buf_destroy(gpu, &imp_buf);
+ pl_buf_destroy(gpu, &exp_buf);
+}
+
+static void pl_test_host_ptr(pl_gpu gpu)
+{
+ if (!(gpu->import_caps.buf & PL_HANDLE_HOST_PTR))
+ return;
+
+#ifdef __unix__
+
+ printf("testing host ptr\n");
+ REQUIRE(gpu->limits.max_mapped_size);
+
+ const size_t size = 2 << 20;
+ const size_t offset = 2 << 10;
+ const size_t slice = 2 << 16;
+
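+ // Importing a host pointer typically requires page-aligned memory, hence
+ // the 4 KiB-aligned allocation below.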
+ uint8_t *data = aligned_alloc(0x1000, size);
+ for (int i = 0; i < size; i++)
+ data[i] = (uint8_t) i;
+
+ pl_buf buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+ .size = slice,
+ .import_handle = PL_HANDLE_HOST_PTR,
+ .shared_mem = {
+ .handle.ptr = data,
+ .size = size,
+ .offset = offset,
+ },
+ .host_mapped = true,
+ });
+
+ REQUIRE(buf);
+ REQUIRE_MEMEQ(data + offset, buf->data, slice);
+
+ pl_buf_destroy(gpu, &buf);
+ free(data);
+
+#endif // __unix__
+}
+
+static void gpu_shader_tests(pl_gpu gpu)
+{
+ pl_buffer_tests(gpu);
+ pl_texture_tests(gpu);
+ pl_planar_tests(gpu);
+ pl_shader_tests(gpu);
+ pl_scaler_tests(gpu);
+ pl_render_tests(gpu);
+ pl_ycbcr_tests(gpu);
+
+ REQUIRE(!pl_gpu_is_failed(gpu));
+}
+
+static void gpu_interop_tests(pl_gpu gpu)
+{
+ pl_test_export_import(gpu, PL_HANDLE_DMA_BUF);
+ pl_test_host_ptr(gpu);
+
+ REQUIRE(!pl_gpu_is_failed(gpu));
+}
diff --git a/src/tests/icc.c b/src/tests/icc.c
new file mode 100644
index 0000000..188940b
--- /dev/null
+++ b/src/tests/icc.c
@@ -0,0 +1,106 @@
+#include "tests.h"
+
+#include <libplacebo/shaders/icc.h>
+
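+// Minimal ("micro") ICC profiles used to verify primaries detection; the
+// matching sRGB test profile is defined in tests.h.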
+static const uint8_t DisplayP3_v2_micro_icc[] = {
+ 0x00, 0x00, 0x01, 0xc8, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00,
+ 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20,
+ 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d,
+ 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00,
+ 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6,
+ 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64,
+ 0xb4, 0xaa, 0xdd, 0x1f, 0x13, 0xc8, 0x03, 0x3c, 0xf5, 0x51, 0x14, 0x45,
+ 0x28, 0x7a, 0x98, 0xe2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,
+ 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5e,
+ 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x0c,
+ 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x18, 0x00, 0x00, 0x00, 0x14,
+ 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x2c, 0x00, 0x00, 0x00, 0x14,
+ 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14,
+ 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x54, 0x00, 0x00, 0x00, 0x14,
+ 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60,
+ 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60,
+ 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60,
+ 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
+ 0x75, 0x50, 0x33, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00,
+ 0x43, 0x43, 0x30, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xf3, 0x51, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xcc,
+ 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x83, 0xdf,
+ 0x00, 0x00, 0x3d, 0xbf, 0xff, 0xff, 0xff, 0xbb, 0x58, 0x59, 0x5a, 0x20,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4a, 0xbf, 0x00, 0x00, 0xb1, 0x37,
+ 0x00, 0x00, 0x0a, 0xb9, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x28, 0x38, 0x00, 0x00, 0x11, 0x0a, 0x00, 0x00, 0xc8, 0xb9,
+ 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a,
+ 0x00, 0x00, 0x00, 0x7c, 0x00, 0xf8, 0x01, 0x9c, 0x02, 0x75, 0x03, 0x83,
+ 0x04, 0xc9, 0x06, 0x4e, 0x08, 0x12, 0x0a, 0x18, 0x0c, 0x62, 0x0e, 0xf4,
+ 0x11, 0xcf, 0x14, 0xf6, 0x18, 0x6a, 0x1c, 0x2e, 0x20, 0x43, 0x24, 0xac,
+ 0x29, 0x6a, 0x2e, 0x7e, 0x33, 0xeb, 0x39, 0xb3, 0x3f, 0xd6, 0x46, 0x57,
+ 0x4d, 0x36, 0x54, 0x76, 0x5c, 0x17, 0x64, 0x1d, 0x6c, 0x86, 0x75, 0x56,
+ 0x7e, 0x8d, 0x88, 0x2c, 0x92, 0x36, 0x9c, 0xab, 0xa7, 0x8c, 0xb2, 0xdb,
+ 0xbe, 0x99, 0xca, 0xc7, 0xd7, 0x65, 0xe4, 0x77, 0xf1, 0xf9, 0xff, 0xff
+};
+
+static const uint8_t Rec2020_v2_micro_icc[] = {
+ 0x00, 0x00, 0x01, 0xcc, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00,
+ 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20,
+ 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d,
+ 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00,
+ 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6,
+ 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64,
+ 0x17, 0xcb, 0x44, 0xd1, 0x0d, 0xca, 0xe1, 0xc9, 0x03, 0x3e, 0x20, 0x85,
+ 0x4a, 0x67, 0x4e, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,
+ 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5f,
+ 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x0c,
+ 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x18, 0x00, 0x00, 0x00, 0x14,
+ 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x2c, 0x00, 0x00, 0x00, 0x14,
+ 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14,
+ 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x54, 0x00, 0x00, 0x00, 0x14,
+ 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64,
+ 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64,
+ 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64,
+ 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05,
+ 0x75, 0x32, 0x30, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00,
+ 0x43, 0x43, 0x30, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xf3, 0x51, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xcc,
+ 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x69,
+ 0x00, 0x00, 0x47, 0x70, 0xff, 0xff, 0xff, 0x81, 0x58, 0x59, 0x5a, 0x20,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a, 0x6a, 0x00, 0x00, 0xac, 0xe3,
+ 0x00, 0x00, 0x07, 0xad, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x20, 0x03, 0x00, 0x00, 0x0b, 0xad, 0x00, 0x00, 0xcb, 0xff,
+ 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c,
+ 0x00, 0x00, 0x01, 0x53, 0x02, 0xa5, 0x03, 0xf8, 0x05, 0x4e, 0x06, 0xd6,
+ 0x08, 0x98, 0x0a, 0x8f, 0x0c, 0xc3, 0x0f, 0x31, 0x11, 0xdc, 0x14, 0xc3,
+ 0x17, 0xe8, 0x1b, 0x4c, 0x1e, 0xf0, 0x22, 0xd5, 0x26, 0xfa, 0x2b, 0x62,
+ 0x30, 0x0c, 0x34, 0xfa, 0x3a, 0x2b, 0x3f, 0xa2, 0x45, 0x5d, 0x4b, 0x5f,
+ 0x51, 0xa7, 0x58, 0x37, 0x5f, 0x0d, 0x66, 0x2c, 0x6d, 0x94, 0x75, 0x45,
+ 0x7d, 0x3f, 0x85, 0x84, 0x8e, 0x13, 0x96, 0xee, 0xa0, 0x13, 0xa9, 0x86,
+ 0xb3, 0x44, 0xbd, 0x4f, 0xc7, 0xa8, 0xd2, 0x4e, 0xdd, 0x42, 0xe8, 0x86,
+ 0xf4, 0x16, 0xff, 0xff
+};
+
+int main()
+{
+ pl_log log = pl_test_logger();
+ pl_icc_object icc;
+
+ icc = pl_icc_open(log, &TEST_PROFILE(sRGB_v2_nano_icc), NULL);
+ REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_BT_709, "u");
+ pl_icc_close(&icc);
+
+ icc = pl_icc_open(log, &TEST_PROFILE(DisplayP3_v2_micro_icc), NULL);
+ REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_DISPLAY_P3, "u");
+ pl_icc_close(&icc);
+
+ icc = pl_icc_open(log, &TEST_PROFILE(Rec2020_v2_micro_icc), NULL);
+ REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_BT_2020, "u");
+ pl_icc_close(&icc);
+
+ pl_log_destroy(&log);
+}
diff --git a/src/tests/include/include_tmpl.c b/src/tests/include/include_tmpl.c
new file mode 100644
index 0000000..dd1000e
--- /dev/null
+++ b/src/tests/include/include_tmpl.c
@@ -0,0 +1 @@
+#include <libplacebo/@header@>
diff --git a/src/tests/include/include_tmpl.cpp b/src/tests/include/include_tmpl.cpp
new file mode 100644
index 0000000..2b6334c
--- /dev/null
+++ b/src/tests/include/include_tmpl.cpp
@@ -0,0 +1,3 @@
+#define PL_LIBAV_IMPLEMENTATION 0
+#define PL_DAV1D_IMPLEMENTATION 0
+#include <libplacebo/@header@>
diff --git a/src/tests/include/meson.build b/src/tests/include/meson.build
new file mode 100644
index 0000000..25dfaee
--- /dev/null
+++ b/src/tests/include/meson.build
@@ -0,0 +1,35 @@
+include_tmpl_langs = ['c', 'cpp']
+
+# Ensure all headers compile
+
+test_include_sources = []
+foreach h : headers
+
+ if (h.contains('internal') or
+ h.contains('dav1d') and not dav1d.found() or
+ h.contains('libav') and not libav_found or
+ h.contains('d3d11') and not d3d11_header)
+ continue
+ endif
+
+ foreach lang : include_tmpl_langs
+
+ test_include_sources += configure_file(
+ input: 'include_tmpl.' + lang,
+ output: 'include_@0@.@1@'.format(h.underscorify(), lang),
+ configuration: {
+ 'header': h
+ },
+ )
+
+ endforeach
+
+endforeach
+
+static_library('test_include', test_include_sources,
+ dependencies: [tdep_static, lavu, lavc, lavf],
+ include_directories: [inc, vulkan_headers_inc],
+ implicit_include_directories: false,
+ c_args: ['-Wall', '-Wextra', '-Wpedantic'],
+ cpp_args: ['-Wall', '-Wextra', '-Wpedantic'],
+)
diff --git a/src/tests/libav.c b/src/tests/libav.c
new file mode 100644
index 0000000..7c91e85
--- /dev/null
+++ b/src/tests/libav.c
@@ -0,0 +1,393 @@
+#include "tests.h"
+#include "libplacebo/utils/libav.h"
+
+int main()
+{
+ struct pl_plane_data data[4] = {0};
+ struct pl_bit_encoding bits;
+
+ // Make sure we don't crash on any av pixfmt
+ const AVPixFmtDescriptor *desc = NULL;
+ while ((desc = av_pix_fmt_desc_next(desc)))
+ pl_plane_data_from_pixfmt(data, &bits, av_pix_fmt_desc_get_id(desc));
+
+#define TEST(pixfmt, reference) \
+ do { \
+ int planes = pl_plane_data_from_pixfmt(data, &bits, pixfmt); \
+ REQUIRE_CMP(planes, ==, sizeof(reference) / sizeof(*reference), "d"); \
+ REQUIRE_MEMEQ(data, reference, sizeof(reference)); \
+ } while (0)
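+
+ // Each TEST() converts an AVPixelFormat into pl_plane_data[] and
+ // byte-compares the result against a hand-written reference layout.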
+
+ // Planar and semiplanar formats
+ static const struct pl_plane_data yuvp8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {0},
+ .pixel_stride = 1,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {1},
+ .pixel_stride = 1,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {2},
+ .pixel_stride = 1,
+ }
+ };
+
+ TEST(AV_PIX_FMT_YUV420P, yuvp8);
+ TEST(AV_PIX_FMT_YUV422P, yuvp8);
+ TEST(AV_PIX_FMT_YUV444P, yuvp8);
+ TEST(AV_PIX_FMT_YUV410P, yuvp8);
+ TEST(AV_PIX_FMT_YUV411P, yuvp8);
+ TEST(AV_PIX_FMT_YUV440P, yuvp8);
+
+ static const struct pl_plane_data yuvap8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {0},
+ .pixel_stride = 1,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {1},
+ .pixel_stride = 1,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {2},
+ .pixel_stride = 1,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {3},
+ .pixel_stride = 1,
+ }
+ };
+
+ TEST(AV_PIX_FMT_YUVA420P, yuvap8);
+
+ static const struct pl_plane_data yuvp16[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {16},
+ .component_map = {0},
+ .pixel_stride = 2,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {16},
+ .component_map = {1},
+ .pixel_stride = 2,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {16},
+ .component_map = {2},
+ .pixel_stride = 2,
+ }
+ };
+
+ TEST(AV_PIX_FMT_YUV420P10LE, yuvp16);
+ TEST(AV_PIX_FMT_YUV420P16LE, yuvp16);
+
+ static const struct pl_plane_data nv12[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {0},
+ .pixel_stride = 1,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8},
+ .component_map = {1, 2},
+ .pixel_stride = 2,
+ }
+ };
+
+ TEST(AV_PIX_FMT_NV12, nv12);
+
+ static const struct pl_plane_data nv21[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {0},
+ .pixel_stride = 1,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8},
+ .component_map = {2, 1},
+ .pixel_stride = 2,
+ }
+ };
+
+ TEST(AV_PIX_FMT_NV21, nv21);
+
+ static const struct pl_plane_data p016[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {16},
+ .component_map = {0},
+ .pixel_stride = 2,
+ }, {
+ .type = PL_FMT_UNORM,
+ .component_size = {16, 16},
+ .component_map = {1, 2},
+ .pixel_stride = 4,
+ }
+ };
+
+ TEST(AV_PIX_FMT_P010LE, p016);
+ TEST(AV_PIX_FMT_P016LE, p016);
+
+ // Packed formats
+ static const struct pl_plane_data r8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8},
+ .component_map = {0},
+ .pixel_stride = 1,
+ }
+ };
+
+ TEST(AV_PIX_FMT_GRAY8, r8);
+
+ static const struct pl_plane_data rg8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8},
+ .component_map = {0, 1},
+ .pixel_stride = 2,
+ }
+ };
+
+ TEST(AV_PIX_FMT_GRAY8A, rg8);
+
+ static const struct pl_plane_data rgb8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8, 8},
+ .component_map = {0, 1, 2},
+ .pixel_stride = 3,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGB24, rgb8);
+
+ static const struct pl_plane_data bgr8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8, 8},
+ .component_map = {2, 1, 0},
+ .pixel_stride = 3,
+ }
+ };
+
+ TEST(AV_PIX_FMT_BGR24, bgr8);
+
+ static const struct pl_plane_data rgbx8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8, 8},
+ .component_map = {0, 1, 2},
+ .pixel_stride = 4,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGB0, rgbx8);
+
+ static const struct pl_plane_data xrgb8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8, 8},
+ .component_map = {0, 1, 2},
+ .component_pad = {8, 0, 0},
+ .pixel_stride = 4,
+ }
+ };
+
+ TEST(AV_PIX_FMT_0RGB, xrgb8);
+
+ static const struct pl_plane_data rgba8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8, 8, 8},
+ .component_map = {0, 1, 2, 3},
+ .pixel_stride = 4,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGBA, rgba8);
+
+ static const struct pl_plane_data argb8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8, 8, 8},
+ .component_map = {3, 0, 1, 2},
+ .pixel_stride = 4,
+ }
+ };
+
+ TEST(AV_PIX_FMT_ARGB, argb8);
+
+ static const struct pl_plane_data bgra8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8, 8, 8},
+ .component_map = {2, 1, 0, 3},
+ .pixel_stride = 4,
+ }
+ };
+
+ TEST(AV_PIX_FMT_BGRA, bgra8);
+
+ static const struct pl_plane_data abgr8[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {8, 8, 8, 8},
+ .component_map = {3, 2, 1, 0},
+ .pixel_stride = 4,
+ }
+ };
+
+ TEST(AV_PIX_FMT_ABGR, abgr8);
+
+ static const struct pl_plane_data r16[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {16},
+ .component_map = {0},
+ .pixel_stride = 2,
+ }
+ };
+
+ TEST(AV_PIX_FMT_GRAY16LE, r16);
+
+ static const struct pl_plane_data rgb16[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {16, 16, 16},
+ .component_map = {0, 1, 2},
+ .pixel_stride = 6,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGB48LE, rgb16);
+
+ static const struct pl_plane_data rgb16be[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {16, 16, 16},
+ .component_map = {0, 1, 2},
+ .pixel_stride = 6,
+ .swapped = true,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGB48BE, rgb16be);
+
+ static const struct pl_plane_data rgba16[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {16, 16, 16, 16},
+ .component_map = {0, 1, 2, 3},
+ .pixel_stride = 8,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGBA64LE, rgba16);
+
+ static const struct pl_plane_data rgba16be[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {16, 16, 16, 16},
+ .component_map = {0, 1, 2, 3},
+ .pixel_stride = 8,
+ .swapped = true,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGBA64BE, rgba16be);
+
+ static const struct pl_plane_data rgb565[] = {
+ {
+ .type = PL_FMT_UNORM,
+ .component_size = {5, 6, 5},
+ .component_map = {2, 1, 0}, // LSB to MSB
+ .pixel_stride = 2,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGB565LE, rgb565);
+
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 37, 100)
+
+ static const struct pl_plane_data rgb32f[] = {
+ {
+ .type = PL_FMT_FLOAT,
+ .component_size = {32, 32, 32},
+ .component_map = {0, 1, 2},
+ .pixel_stride = 12,
+ }
+ };
+
+ TEST(AV_PIX_FMT_RGBF32LE, rgb32f);
+
+#endif
+
+ // Test pl_frame <- AVFrame bridge
+ struct pl_frame image;
+ AVFrame *frame = av_frame_alloc();
+ frame->format = AV_PIX_FMT_RGBA;
+ pl_frame_from_avframe(&image, frame);
+ REQUIRE_CMP(image.num_planes, ==, 1, "d");
+ REQUIRE_CMP(image.repr.sys, ==, PL_COLOR_SYSTEM_RGB, "u");
+
+ // Test inverse mapping
+ struct pl_color_space csp = image.color;
+ pl_color_space_infer(&csp);
+ pl_avframe_set_color(frame, csp);
+ pl_avframe_set_repr(frame, image.repr);
+ pl_avframe_set_profile(frame, image.profile);
+ pl_frame_from_avframe(&image, frame);
+ pl_color_space_infer(&image.color);
+ REQUIRE(pl_color_space_equal(&csp, &image.color));
+ av_frame_free(&frame);
+
+ // Test enum functions
+ for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) {
+ enum AVColorSpace spc = pl_system_to_av(sys);
+ enum pl_color_system sys2 = pl_system_from_av(spc);
+ // Exception to the rule, due to different handling in libav*
+ if (sys2 && sys != PL_COLOR_SYSTEM_BT_2100_HLG)
+ REQUIRE_CMP(sys, ==, sys2, "u");
+ }
+
+ for (enum pl_color_levels lev = 0; lev < PL_COLOR_LEVELS_COUNT; lev++) {
+ enum AVColorRange range = pl_levels_to_av(lev);
+ enum pl_color_levels lev2 = pl_levels_from_av(range);
+ REQUIRE_CMP(lev, ==, lev2, "u");
+ }
+
+ for (enum pl_color_primaries prim = 0; prim < PL_COLOR_PRIM_COUNT; prim++) {
+ enum AVColorPrimaries avpri = pl_primaries_to_av(prim);
+ enum pl_color_primaries prim2 = pl_primaries_from_av(avpri);
+ if (prim2)
+ REQUIRE_CMP(prim, ==, prim2, "u");
+ }
+
+ for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) {
+ enum AVColorTransferCharacteristic avtrc = pl_transfer_to_av(trc);
+ enum pl_color_transfer trc2 = pl_transfer_from_av(avtrc);
+ if (trc2)
+ REQUIRE_CMP(trc, ==, trc2, "u");
+ }
+
+ for (enum pl_chroma_location loc = 0; loc < PL_CHROMA_COUNT; loc++) {
+ enum AVChromaLocation avloc = pl_chroma_to_av(loc);
+ enum pl_chroma_location loc2 = pl_chroma_from_av(avloc);
+ REQUIRE_CMP(loc, ==, loc2, "u");
+ }
+}
diff --git a/src/tests/lut.c b/src/tests/lut.c
new file mode 100644
index 0000000..4af44ee
--- /dev/null
+++ b/src/tests/lut.c
@@ -0,0 +1,86 @@
+#include "tests.h"
+
+#include <libplacebo/dummy.h>
+#include <libplacebo/shaders/lut.h>
+
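+// A few small .cube LUTs (1D, 3D, and one with a custom DOMAIN_MAX) used to
+// exercise the parser and the LUT shader generation below.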
+static const char *luts[] = {
+
+ "TITLE \"1D LUT example\" \n"
+ "LUT_1D_SIZE 11 \n"
+ "# Random comment \n"
+ "0.0 0.0 0.0 \n"
+ "0.1 0.1 0.1 \n"
+ "0.2 0.2 0.2 \n"
+ "0.3 0.3 0.3 \n"
+ "0.4 0.4 0.4 \n"
+ "0.5 0.5 0.5 \n"
+ "0.6 0.6 0.6 \n"
+ "0.7 0.7 0.7 \n"
+ "0.8 0.8 0.8 \n"
+ "0.9 0.9 0.9 \n"
+ "0.10 0.10 0.10 \n",
+
+ "LUT_3D_SIZE 3 \n"
+ "TITLE \"3D LUT example\" \n"
+ "0.0 0.0 0.0 \n"
+ "0.5 0.0 0.0 \n"
+ "1.0 0.0 0.0 \n"
+ "0.0 0.5 0.0 \n"
+ "0.5 0.5 0.0 \n"
+ "1.0 0.5 0.0 \n"
+ "0.0 1.0 0.0 \n"
+ "0.5 1.0 0.0 \n"
+ "1.0 1.0 0.0 \n"
+ "0.0 0.0 0.5 \n"
+ "0.5 0.0 0.5 \n"
+ "1.0 0.0 0.5 \n"
+ "0.0 0.5 0.5 \n"
+ "0.5 0.5 0.5 \n"
+ "1.0 0.5 0.5 \n"
+ "0.0 1.0 0.5 \n"
+ "0.5 1.0 0.5 \n"
+ "1.0 1.0 0.5 \n"
+ "0.0 0.0 1.0 \n"
+ "0.5 0.0 1.0 \n"
+ "1.0 0.0 1.0 \n"
+ "0.0 0.5 1.0 \n"
+ "0.5 0.5 1.0 \n"
+ "1.0 0.5 1.0 \n"
+ "0.0 1.0 1.0 \n"
+ "0.5 1.0 1.0 \n"
+ "1.0 1.0 1.0 \n",
+
+ "LUT_1D_SIZE 3 \n"
+ "TITLE \"custom domain\" \n"
+ "DOMAIN_MAX 255 255 255 \n"
+ "0 0 0 \n"
+ "128 128 128 \n"
+ "255 255 255 \n"
+
+};
+
+int main()
+{
+ pl_log log = pl_test_logger();
+ pl_gpu gpu = pl_gpu_dummy_create(log, NULL);
+ pl_shader sh = pl_shader_alloc(log, NULL);
+ pl_shader_obj obj = NULL;
+
+ for (int i = 0; i < PL_ARRAY_SIZE(luts); i++) {
+ struct pl_custom_lut *lut;
+ lut = pl_lut_parse_cube(log, luts[i], strlen(luts[i]));
+ REQUIRE(lut);
+
+ pl_shader_reset(sh, pl_shader_params( .gpu = gpu ));
+ pl_shader_custom_lut(sh, lut, &obj);
+ const struct pl_shader_res *res = pl_shader_finalize(sh);
+ REQUIRE(res);
+ printf("Generated LUT shader:\n%s\n", res->glsl);
+ pl_lut_free(&lut);
+ }
+
+ pl_shader_obj_destroy(&obj);
+ pl_shader_free(&sh);
+ pl_gpu_dummy_destroy(&gpu);
+ pl_log_destroy(&log);
+}
diff --git a/src/tests/meson.build b/src/tests/meson.build
new file mode 100644
index 0000000..335c6b1
--- /dev/null
+++ b/src/tests/meson.build
@@ -0,0 +1,39 @@
+ts = []
+
+foreach t : tests
+ deps = [tdep_static]
+ if t == 'opengl_surfaceless.c'
+ deps += glad_dep
+ endif
+ # TODO: Define objects in tdep_static once Meson 1.1.0 is ok to use
+ ts += { 'source': t,
+ 'deps': deps,
+ 'objects': lib.extract_all_objects(recursive: false) }
+endforeach
+
+dav1d = dependency('dav1d', required: false)
+if dav1d.found()
+ ts += { 'source': 'dav1d.c', 'deps': [dav1d, tdep_shared] }
+endif
+
+lavu = dependency('libavutil', version: '>=55.74.100', required: false)
+lavc = dependency('libavcodec', required: false)
+lavf = dependency('libavformat', required: false)
+libav_found = lavu.found() and lavc.found() and lavf.found()
+if libav_found
+ ts += { 'source': 'libav.c', 'deps': [lavu, lavc, lavf, tdep_shared] }
+endif
+
+foreach t : ts
+ e = executable('test.' + t['source'], t['source'],
+ objects: t.get('objects', []),
+ c_args: [ '-Wno-unused-function' ],
+ dependencies: t.get('deps', []),
+ link_args: link_args,
+ link_depends: link_depends,
+ )
+
+ test(t['source'], e, timeout: 120)
+endforeach
+
+subdir('include')
diff --git a/src/tests/opengl_surfaceless.c b/src/tests/opengl_surfaceless.c
new file mode 100644
index 0000000..2d12a08
--- /dev/null
+++ b/src/tests/opengl_surfaceless.c
@@ -0,0 +1,247 @@
+#include "gpu_tests.h"
+#include "opengl/utils.h"
+
+#include <libplacebo/opengl.h>
+
+static void opengl_interop_tests(pl_gpu gpu)
+{
+ pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 1, 0, 0,
+ PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_LINEAR);
+ if (!fmt)
+ return;
+
+ pl_tex export = pl_tex_create(gpu, pl_tex_params(
+ .w = 32,
+ .h = 32,
+ .format = fmt,
+ .sampleable = true,
+ .renderable = true,
+ .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE,
+ ));
+
+ REQUIRE(export);
+
+ struct pl_opengl_wrap_params wrap = {
+ .width = export->params.w,
+ .height = export->params.h,
+ .depth = export->params.d,
+ };
+
+ wrap.texture = pl_opengl_unwrap(gpu, export, &wrap.target, &wrap.iformat, NULL);
+ REQUIRE(wrap.texture);
+
+ pl_tex import = pl_opengl_wrap(gpu, &wrap);
+ REQUIRE(import);
+ REQUIRE(import->params.renderable);
+ REQUIRE_CMP(import->params.blit_dst, ==, export->params.blit_dst, "d");
+
+ pl_tex_destroy(gpu, &import);
+ pl_tex_destroy(gpu, &export);
+}
+
+#define PBUFFER_WIDTH 640
+#define PBUFFER_HEIGHT 480
+
+struct swapchain_priv {
+ EGLDisplay display;
+ EGLSurface surface;
+};
+
+static void swap_buffers(void *priv)
+{
+ struct swapchain_priv *p = priv;
+ eglSwapBuffers(p->display, p->surface);
+}
+
+static void opengl_swapchain_tests(pl_opengl gl,
+ EGLDisplay display, EGLSurface surface)
+{
+ if (surface == EGL_NO_SURFACE)
+ return;
+
+ printf("testing opengl swapchain\n");
+ pl_gpu gpu = gl->gpu;
+ pl_swapchain sw;
+ sw = pl_opengl_create_swapchain(gl, pl_opengl_swapchain_params(
+ .swap_buffers = swap_buffers,
+ .priv = &(struct swapchain_priv) { display, surface },
+ ));
+ REQUIRE(sw);
+
+ int w = PBUFFER_WIDTH, h = PBUFFER_HEIGHT;
+ REQUIRE(pl_swapchain_resize(sw, &w, &h));
+
+ for (int i = 0; i < 10; i++) {
+ struct pl_swapchain_frame frame;
+ REQUIRE(pl_swapchain_start_frame(sw, &frame));
+ if (frame.fbo->params.blit_dst)
+ pl_tex_clear(gpu, frame.fbo, (float[4]){0});
+
+ // TODO: test this with an actual pl_renderer instance
+ struct pl_frame target;
+ pl_frame_from_swapchain(&target, &frame);
+
+ REQUIRE(pl_swapchain_submit_frame(sw));
+ pl_swapchain_swap_buffers(sw);
+ }
+
+ pl_swapchain_destroy(&sw);
+}
+
+int main()
+{
+ if (!gladLoaderLoadEGL(EGL_NO_DISPLAY))
+ return SKIP;
+
+ const char *extstr = eglQueryString(EGL_NO_DISPLAY, EGL_EXTENSIONS);
+ if (!extstr || !strstr(extstr, "EGL_MESA_platform_surfaceless"))
+ return SKIP;
+
+ // Create the OpenGL context
+ EGLDisplay dpy = eglGetPlatformDisplayEXT(EGL_PLATFORM_SURFACELESS_MESA,
+ (void *) EGL_DEFAULT_DISPLAY, NULL);
+ if (dpy == EGL_NO_DISPLAY)
+ return SKIP;
+
+ EGLint major, minor;
+ if (!eglInitialize(dpy, &major, &minor))
+ return SKIP;
+
+ if (!gladLoaderLoadEGL(dpy))
+ return SKIP;
+
+ printf("Initialized EGL v%d.%d\n", major, minor);
+ int egl_ver = major * 10 + minor;
+
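+ // Context flavours to try, from newest desktop GL down to GLES 3.0; the
+ // test suite is re-run once per distinct capability set.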
+ struct {
+ EGLenum api;
+ EGLenum render;
+ int major, minor;
+ int glsl_ver;
+ EGLenum profile;
+ } egl_vers[] = {
+ { EGL_OPENGL_API, EGL_OPENGL_BIT, 4, 6, 460, EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT },
+ { EGL_OPENGL_API, EGL_OPENGL_BIT, 3, 3, 330, EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT },
+ { EGL_OPENGL_API, EGL_OPENGL_BIT, 3, 0, 130, EGL_CONTEXT_OPENGL_COMPATIBILITY_PROFILE_BIT, },
+ { EGL_OPENGL_ES_API, EGL_OPENGL_ES3_BIT, 3, 0, 300, },
+ };
+
+ struct pl_glsl_version last_glsl = {0};
+ struct pl_gpu_limits last_limits = {0};
+
+ pl_log log = pl_test_logger();
+
+ for (int i = 0; i < PL_ARRAY_SIZE(egl_vers); i++) {
+
+ const int cfg_attribs[] = {
+ EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
+ EGL_RENDERABLE_TYPE, egl_vers[i].render,
+ EGL_NONE
+ };
+
+ EGLConfig config = 0;
+ EGLint num_configs = 0;
+ bool ok = eglChooseConfig(dpy, cfg_attribs, &config, 1, &num_configs);
+ if (!ok || !num_configs)
+ goto error;
+
+ if (!eglBindAPI(egl_vers[i].api))
+ goto error;
+
+ EGLContext egl;
+ if (egl_vers[i].api == EGL_OPENGL_ES_API) {
+ // OpenGL ES
+ const EGLint egl_attribs[] = {
+ EGL_CONTEXT_CLIENT_VERSION, egl_vers[i].major,
+ (egl_ver >= 15) ? EGL_CONTEXT_OPENGL_DEBUG : EGL_NONE, EGL_TRUE,
+ EGL_NONE
+ };
+
+ printf("Attempting creation of OpenGL ES v%d context\n", egl_vers[i].major);
+ egl = eglCreateContext(dpy, config, EGL_NO_CONTEXT, egl_attribs);
+ } else {
+ // Desktop OpenGL
+ const int egl_attribs[] = {
+ EGL_CONTEXT_MAJOR_VERSION, egl_vers[i].major,
+ EGL_CONTEXT_MINOR_VERSION, egl_vers[i].minor,
+ EGL_CONTEXT_OPENGL_PROFILE_MASK, egl_vers[i].profile,
+ (egl_ver >= 15) ? EGL_CONTEXT_OPENGL_DEBUG : EGL_NONE, EGL_TRUE,
+ EGL_NONE
+ };
+
+ printf("Attempting creation of Desktop OpenGL v%d.%d context\n",
+ egl_vers[i].major, egl_vers[i].minor);
+ egl = eglCreateContext(dpy, config, EGL_NO_CONTEXT, egl_attribs);
+ }
+
+ if (!egl)
+ goto error;
+
+ const EGLint pbuffer_attribs[] = {
+ EGL_WIDTH, PBUFFER_WIDTH,
+ EGL_HEIGHT, PBUFFER_HEIGHT,
+ EGL_NONE
+ };
+
+ EGLSurface surf = eglCreatePbufferSurface(dpy, config, pbuffer_attribs);
+
+ if (!eglMakeCurrent(dpy, surf, surf, egl))
+ goto error;
+
+ pl_opengl gl = pl_opengl_create(log, pl_opengl_params(
+ .get_proc_addr = (pl_voidfunc_t (*)(const char *)) eglGetProcAddress,
+ .max_glsl_version = egl_vers[i].glsl_ver,
+ .debug = true,
+ .egl_display = dpy,
+ .egl_context = egl,
+#ifdef CI_ALLOW_SW
+ .allow_software = true,
+#endif
+ ));
+ if (!gl)
+ goto next;
+
+ // Skip repeat tests
+ pl_gpu gpu = gl->gpu;
+ if (memcmp(&last_glsl, &gpu->glsl, sizeof(last_glsl)) == 0 &&
+ memcmp(&last_limits, &gpu->limits, sizeof(last_limits)) == 0)
+ {
+ printf("Skipping tests due to duplicate capabilities/version\n");
+ goto next;
+ }
+
+#ifdef CI_MAXGL
+ if (last_glsl.version && last_glsl.gles == gpu->glsl.gles)
+ goto next;
+#endif
+
+ last_glsl = gpu->glsl;
+ last_limits = gpu->limits;
+
+ gpu_shader_tests(gpu);
+ gpu_interop_tests(gpu);
+ opengl_interop_tests(gpu);
+ opengl_swapchain_tests(gl, dpy, surf);
+
+ // Reduce log spam after first successful test
+ pl_log_level_update(log, PL_LOG_INFO);
+
+next:
+ pl_opengl_destroy(&gl);
+ eglDestroySurface(dpy, surf);
+ eglDestroyContext(dpy, egl);
+ continue;
+
+error: ;
+ EGLint error = eglGetError();
+ if (error != EGL_SUCCESS)
+ fprintf(stderr, "EGL error: %s\n", egl_err_str(error));
+ }
+
+ eglTerminate(dpy);
+ gladLoaderUnloadEGL();
+ pl_log_destroy(&log);
+
+ if (!last_glsl.version)
+ return SKIP;
+}
diff --git a/src/tests/options.c b/src/tests/options.c
new file mode 100644
index 0000000..f178668
--- /dev/null
+++ b/src/tests/options.c
@@ -0,0 +1,123 @@
+#include "tests.h"
+
+#include <libplacebo/options.h>
+
+static void count_cb(void *priv, pl_opt_data data)
+{
+ int *num = priv;
+ printf("Iterating over option: %s = %s\n", data->opt->key, data->text);
+ (*num)++;
+}
+
+static void set_cb(void *priv, pl_opt_data data)
+{
+ pl_options dst = priv;
+ REQUIRE(pl_options_set_str(dst, data->opt->key, data->text));
+}
+
+int main()
+{
+ pl_log log = pl_test_logger();
+ pl_options test = pl_options_alloc(log);
+
+ REQUIRE_STREQ(pl_options_save(test), "");
+ REQUIRE(pl_options_load(test, ""));
+ REQUIRE_STREQ(pl_options_save(test), "");
+
+ pl_options_reset(test, &pl_render_fast_params);
+ REQUIRE_STREQ(pl_options_save(test), "");
+ REQUIRE(pl_options_load(test, "preset=fast"));
+ REQUIRE_STREQ(pl_options_save(test), "");
+
+ const char *def_opts = "upscaler=lanczos,downscaler=hermite,frame_mixer=oversample,sigmoid=yes,peak_detect=yes,dither=yes";
+ pl_options_reset(test, &pl_render_default_params);
+ REQUIRE_STREQ(pl_options_save(test), def_opts);
+ struct pl_options_t def_pre = *test;
+ pl_options_reset(test, NULL);
+ REQUIRE_STREQ(pl_options_save(test), "");
+ REQUIRE(pl_options_load(test, def_opts));
+ REQUIRE_STREQ(pl_options_save(test), def_opts);
+ REQUIRE_MEMEQ(test, &def_pre, sizeof(*test));
+ pl_options_reset(test, NULL);
+ REQUIRE(pl_options_load(test, "preset=default"));
+ REQUIRE_STREQ(pl_options_save(test), def_opts);
+ REQUIRE_MEMEQ(test, &def_pre, sizeof(*test));
+
+ int num = 0;
+ pl_options_iterate(test, count_cb, &num);
+ REQUIRE_CMP(num, ==, 6, "d");
+
+ pl_opt_data data;
+ REQUIRE((data = pl_options_get(test, "tile_size")));
+ REQUIRE_STREQ(data->opt->key, "tile_size");
+ REQUIRE_CMP(*(int *) data->value, ==, pl_render_default_params.tile_size, "d");
+ REQUIRE_STREQ(data->text, "32");
+
+ const char *hq_opts = "upscaler=ewa_lanczossharp,downscaler=hermite,frame_mixer=oversample,deband=yes,sigmoid=yes,peak_detect=yes,peak_percentile=99.99500274658203,contrast_recovery=0.30000001192092896,dither=yes";
+ // fallback can produce different precision
+ const char *hq_opts2 = "upscaler=ewa_lanczossharp,downscaler=hermite,frame_mixer=oversample,deband=yes,sigmoid=yes,peak_detect=yes,peak_percentile=99.99500274658203125,contrast_recovery=0.30000001192092896,dither=yes";
+
+ pl_options_reset(test, &pl_render_high_quality_params);
+ const char *opts = pl_options_save(test);
+ if (!strcmp(opts, hq_opts2))
+ hq_opts = hq_opts2;
+ REQUIRE_STREQ(opts, hq_opts);
+ struct pl_options_t hq_pre = *test;
+ pl_options_reset(test, NULL);
+ REQUIRE_STREQ(pl_options_save(test), "");
+ REQUIRE(pl_options_load(test, hq_opts));
+ REQUIRE_STREQ(pl_options_save(test), hq_opts);
+ REQUIRE_MEMEQ(test, &hq_pre, sizeof(*test));
+ REQUIRE(pl_options_load(test, "preset=high_quality"));
+ REQUIRE_STREQ(pl_options_save(test), hq_opts);
+ REQUIRE_MEMEQ(test, &hq_pre, sizeof(*test));
+
+ pl_options test2 = pl_options_alloc(log);
+ pl_options_iterate(test, set_cb, test2);
+ REQUIRE_STREQ(pl_options_save(test), pl_options_save(test2));
+ pl_options_free(&test2);
+
+ // Test custom scalers
+ pl_options_reset(test, pl_render_params(
+ .upscaler = &(struct pl_filter_config) {
+ .kernel = &pl_filter_function_jinc,
+ .window = &pl_filter_function_jinc,
+ .radius = 4.0,
+ .polar = true,
+ },
+ ));
+ const char *jinc4_opts = "upscaler=custom,upscaler_kernel=jinc,upscaler_window=jinc,upscaler_radius=4,upscaler_polar=yes";
+ REQUIRE_STREQ(pl_options_save(test), jinc4_opts);
+ struct pl_options_t jinc4_pre = *test;
+ pl_options_reset(test, NULL);
+ REQUIRE(pl_options_load(test, "upscaler=custom,upscaler_preset=ewa_lanczos,upscaler_radius=4.0,upscaler_clamp=0.0"));
+ REQUIRE_STREQ(pl_options_save(test), jinc4_opts);
+ REQUIRE_MEMEQ(test, &jinc4_pre, sizeof(*test));
+
+ // Test params presets
+ pl_options_reset(test, NULL);
+ REQUIRE(pl_options_load(test, "cone=yes,cone_preset=deuteranomaly"));
+ REQUIRE_STREQ(pl_options_save(test), "cone=yes,cones=m,cone_strength=0.5");
+
+ // Test error paths
+ pl_options bad = pl_options_alloc(NULL);
+ REQUIRE(!pl_options_load(bad, "scale_preset=help"));
+ REQUIRE(!pl_options_load(bad, "dither_method=invalid"));
+ REQUIRE(!pl_options_load(bad, "lut_entries=-1"));
+ REQUIRE(!pl_options_load(bad, "deband_iterations=100"));
+ REQUIRE(!pl_options_load(bad, "tone_lut_size=abc"));
+ REQUIRE(!pl_options_load(bad, "show_clipping=hello"));
+ REQUIRE(!pl_options_load(bad, "brightness=2.0"));
+ REQUIRE(!pl_options_load(bad, "gamma=oops"));
+ REQUIRE(!pl_options_load(bad, "invalid"));
+ REQUIRE(!pl_options_load(bad, "="));
+ REQUIRE(!pl_options_load(bad, "preset==bar"));
+ REQUIRE(!pl_options_load(bad, "peak_percentile=E8203125"));
+ REQUIRE(!pl_options_get(bad, "invalid"));
+ REQUIRE_STREQ(pl_options_save(bad), "");
+ pl_options_free(&bad);
+
+ pl_options_free(&test);
+ pl_log_destroy(&log);
+ return 0;
+}
diff --git a/src/tests/string.c b/src/tests/string.c
new file mode 100644
index 0000000..52985c4
--- /dev/null
+++ b/src/tests/string.c
@@ -0,0 +1,147 @@
+#include "tests.h"
+
+static const pl_str null = {0};
+static const pl_str test = PL_STR0("test");
+static const pl_str empty = PL_STR0("");
+
+static inline bool is_null(pl_str str)
+{
+ return !str.len && !str.buf;
+}
+
+static inline bool is_empty(pl_str str)
+{
+ return !str.len;
+}
+
+int main()
+{
+ void *tmp = pl_tmp(NULL);
+
+ REQUIRE(is_null(pl_str0(NULL)));
+ REQUIRE(is_null(pl_strdup(tmp, null)));
+ char *empty0 = pl_strdup0(tmp, null);
+ REQUIRE(empty0 && !empty0[0]);
+ REQUIRE(pl_str_equals0(empty, empty0));
+
+ pl_str buf = {0};
+ pl_str_append(tmp, &buf, null);
+ REQUIRE(is_empty(buf));
+ pl_str_append_asprintf(tmp, &buf, "%.*s", PL_STR_FMT(test));
+ REQUIRE(pl_str_equals(buf, test));
+
+ pl_str_append_asprintf_c(tmp, &buf, "%d %f %f %f %lld %zu %.*sx %hx %hx %hx %hx",
+ 1, 1.0f, 4294967295.56, 83224965647295.65, 0xFFll, (size_t) 0, PL_STR_FMT(empty),
+ (unsigned short) 0xCAFEu, (unsigned short) 0x1, (unsigned short) 0,
+ (unsigned short) 0xFFFFu);
+ const char *expected = "test1 1 4294967295.56 83224965647295.66 255 0 x cafe 1 0 ffff";
+ // fallback can produce different precision
+ const char *expected2 = "test1 1 4294967295.55999994277954102 83224965647295.65625 255 0 x cafe 1 0 ffff";
+ REQUIRE(pl_str_equals0(buf, expected) || pl_str_equals0(buf, expected2));
+
+ REQUIRE_CMP(pl_strchr(null, ' '), <, 0, "d");
+ REQUIRE_CMP((int) pl_strspn(null, " "), ==, 0, "d");
+ REQUIRE_CMP((int) pl_strcspn(null, " "), ==, 0, "d");
+ REQUIRE(is_null(pl_str_strip(null)));
+
+ REQUIRE_CMP(pl_strchr(test, 's'), ==, 2, "d");
+ REQUIRE_CMP((int) pl_strspn(test, "et"), ==, 2, "d");
+ REQUIRE_CMP((int) pl_strcspn(test, "xs"), ==, 2, "d");
+
+ REQUIRE(is_null(pl_str_take(null, 10)));
+ REQUIRE(is_empty(pl_str_take(test, 0)));
+ REQUIRE(is_null(pl_str_drop(null, 10)));
+ REQUIRE(is_null(pl_str_drop(test, test.len)));
+ REQUIRE(pl_str_equals(pl_str_drop(test, 0), test));
+
+ REQUIRE_CMP(pl_str_find(null, test), <, 0, "d");
+ REQUIRE_CMP(pl_str_find(null, null), ==, 0, "d");
+ REQUIRE_CMP(pl_str_find(test, null), ==, 0, "d");
+ REQUIRE_CMP(pl_str_find(test, test), ==, 0, "d");
+
+ pl_str rest;
+ REQUIRE(is_null(pl_str_split_char(null, ' ', &rest)) && is_null(rest));
+ REQUIRE(is_null(pl_str_split_str(null, test, &rest)) && is_null(rest));
+ REQUIRE(is_empty(pl_str_split_str(test, test, &rest)) && is_empty(rest));
+ REQUIRE(is_null(pl_str_getline(null, &rest)) && is_null(rest));
+
+ pl_str right, left = pl_str_split_char(pl_str0("left right"), ' ', &right);
+ REQUIRE(pl_str_equals0(left, "left"));
+ REQUIRE(pl_str_equals0(right, "right"));
+
+ left = pl_str_split_str0(pl_str0("leftTESTright"), "TEST", &right);
+ REQUIRE(pl_str_equals0(left, "left"));
+ REQUIRE(pl_str_equals0(right, "right"));
+
+ pl_str out;
+ REQUIRE(pl_str_decode_hex(tmp, null, &out) && is_empty(out));
+ REQUIRE(!pl_str_decode_hex(tmp, pl_str0("invalid"), &out));
+
+ REQUIRE(pl_str_equals(null, null));
+ REQUIRE(pl_str_equals(null, empty));
+ REQUIRE(pl_str_startswith(null, null));
+ REQUIRE(pl_str_startswith(test, null));
+ REQUIRE(pl_str_startswith(test, test));
+ REQUIRE(pl_str_endswith(null, null));
+ REQUIRE(pl_str_endswith(test, null));
+ REQUIRE(pl_str_endswith(test, test));
+
+ double d;
+ float f;
+ int i;
+ unsigned u;
+ int64_t i64;
+ uint64_t u64;
+
+ REQUIRE(pl_str_parse_double(pl_str0("4294967295.56"), &d)); REQUIRE_FEQ(d, 4294967295.56, 1e-20);
+ REQUIRE(pl_str_parse_double(pl_str0("-4294967295.56"), &d)); REQUIRE_FEQ(d, -4294967295.56, 1e-20);
+ REQUIRE(pl_str_parse_double(pl_str0("83224965647295.65"), &d)); REQUIRE_FEQ(d, 83224965647295.65, 1e-20);
+ REQUIRE(pl_str_parse_double(pl_str0("-83224965647295.65"), &d)); REQUIRE_FEQ(d, -83224965647295.65, 1e-20);
+ REQUIRE(pl_str_parse_float(pl_str0("4294967295.56"), &f)); REQUIRE_FEQ(f, 4294967295.56f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("-4294967295.56"), &f)); REQUIRE_FEQ(f, -4294967295.56f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("83224965647295.65"), &f)); REQUIRE_FEQ(f, 83224965647295.65f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("-83224965647295.65"), &f)); REQUIRE_FEQ(f, -83224965647295.65f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("1.3984"), &f)); REQUIRE_FEQ(f, 1.3984f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("-8.9100083"), &f)); REQUIRE_FEQ(f, -8.9100083f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("-0"), &f)); REQUIRE_FEQ(f, 0.0f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("-3.14e20"), &f)); REQUIRE_FEQ(f, -3.14e20f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("0.5e-5"), &f)); REQUIRE_FEQ(f, 0.5e-5f, 1e-8);
+ REQUIRE(pl_str_parse_float(pl_str0("0.5e+5"), &f)); REQUIRE_FEQ(f, 0.5e+5f, 1e-8);
+ REQUIRE(pl_str_parse_int(pl_str0("64239"), &i)); REQUIRE_CMP(i, ==, 64239, "d");
+ REQUIRE(pl_str_parse_int(pl_str0("-102"), &i)); REQUIRE_CMP(i, ==, -102, "d");
+ REQUIRE(pl_str_parse_int(pl_str0("1"), &i)); REQUIRE_CMP(i, ==, 1, "d");
+ REQUIRE(pl_str_parse_int(pl_str0("-0"), &i)); REQUIRE_CMP(i, ==, 0, "d");
+ REQUIRE(pl_str_parse_uint(pl_str0("64239"), &u)); REQUIRE_CMP(u, ==, 64239, "u");
+ REQUIRE(pl_str_parse_uint(pl_str0("1"), &u)); REQUIRE_CMP(u, ==, 1, "u");
+ REQUIRE(pl_str_parse_int64(pl_str0("9223372036854775799"), &i64));
+ REQUIRE_CMP(i64, ==, 9223372036854775799LL, PRIi64);
+ REQUIRE(pl_str_parse_int64(pl_str0("-9223372036854775799"), &i64));
+ REQUIRE_CMP(i64, ==, -9223372036854775799LL, PRIi64);
+ REQUIRE(pl_str_parse_uint64(pl_str0("18446744073709551609"), &u64));
+ REQUIRE_CMP(u64, ==, 18446744073709551609LLU, PRIu64);
+ REQUIRE(!pl_str_parse_float(null, &f));
+ REQUIRE(!pl_str_parse_float(test, &f));
+ REQUIRE(!pl_str_parse_float(empty, &f));
+ REQUIRE(!pl_str_parse_int(null, &i));
+ REQUIRE(!pl_str_parse_int(test, &i));
+ REQUIRE(!pl_str_parse_int(empty, &i));
+ REQUIRE(!pl_str_parse_uint(null, &u));
+ REQUIRE(!pl_str_parse_uint(test, &u));
+ REQUIRE(!pl_str_parse_uint(empty, &u));
+
+ pl_str_builder builder = pl_str_builder_alloc(tmp);
+ pl_str_builder_const_str(builder, "hello");
+ pl_str_builder_str(builder, pl_str0("world"));
+ pl_str res = pl_str_builder_exec(builder);
+ REQUIRE(pl_str_equals0(res, "helloworld"));
+
+ pl_str_builder_reset(builder);
+ pl_str_builder_printf_c(builder, "foo %d bar %u bat %s baz %lld",
+ 123, 56u, "quack", 0xDEADBEEFll);
+ pl_str_builder_printf_c(builder, " %.*s", PL_STR_FMT(pl_str0("test123")));
+ res = pl_str_builder_exec(builder);
+ REQUIRE(pl_str_equals0(res, "foo 123 bar 56 bat quack baz 3735928559 test123"));
+
+ pl_free(tmp);
+ return 0;
+}
diff --git a/src/tests/tests.h b/src/tests/tests.h
new file mode 100644
index 0000000..a33a0de
--- /dev/null
+++ b/src/tests/tests.h
@@ -0,0 +1,319 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#include <libplacebo/log.h>
+#include <libplacebo/colorspace.h>
+#include <libplacebo/shaders/film_grain.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#ifdef PL_HAVE_WIN32
+#include <io.h>
+#define isatty _isatty
+#define fileno _fileno
+#else
+#include <unistd.h>
+#endif
+
+static void pl_log_timestamp(void *stream, enum pl_log_level level, const char *msg)
+{
+ static char letter[] = {
+ [PL_LOG_FATAL] = 'f',
+ [PL_LOG_ERR] = 'e',
+ [PL_LOG_WARN] = 'w',
+ [PL_LOG_INFO] = 'i',
+ [PL_LOG_DEBUG] = 'd',
+ [PL_LOG_TRACE] = 't',
+ };
+
+ // Log time relative to the first message
+ static pl_clock_t base = 0;
+ if (!base)
+ base = pl_clock_now();
+
+ double secs = pl_clock_diff(pl_clock_now(), base);
+ printf("[%2.3f][%c] %s\n", secs, letter[level], msg);
+
+ if (level <= PL_LOG_WARN) {
+ // duplicate warnings/errors to stderr
+ fprintf(stderr, "[%2.3f][%c] %s\n", secs, letter[level], msg);
+ fflush(stderr);
+ }
+}
+
+static inline pl_log pl_test_logger(void)
+{
+ setbuf(stdout, NULL);
+ setbuf(stderr, NULL);
+
+ return pl_log_create(PL_API_VER, pl_log_params(
+ .log_cb = isatty(fileno(stdout)) ? pl_log_color : pl_log_timestamp,
+ .log_level = PL_LOG_DEBUG,
+ ));
+}
+
+#define RANDOM (rand() / (float) RAND_MAX)
+#define RANDOM_U8 ((uint8_t) (256.0 * rand() / (RAND_MAX + 1.0)))
+#define SKIP 77
+
+// Helpers for performing various checks
+#define REQUIRE(cond) do \
+{ \
+ if (!(cond)) { \
+ fprintf(stderr, "=== FAILED: '"#cond"' at "__FILE__":%d\n\n", __LINE__);\
+ exit(1); \
+ } \
+} while (0)
+
+#define REQUIRE_CMP(a, op, b, fmt) do \
+{ \
+ __typeof__(a) _va = (a), _vb = (b); \
+ \
+ if (!(_va op _vb)) { \
+ fprintf(stderr, "=== FAILED: '"#a" "#op" "#b"' at "__FILE__":%d\n" \
+ " %-31s = %"fmt"\n" \
+ " %-31s = %"fmt"\n\n", \
+ __LINE__, #a, _va, #b, _vb); \
+ exit(1); \
+ } \
+} while (0)
+
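+// Approximate float comparison: epsilon is scaled by max(1, |a|), i.e. it acts
+// as a relative tolerance above 1.0 and an absolute tolerance below it.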
+#define REQUIRE_FEQ(a, b, epsilon) do \
+{ \
+ float _va = (a); \
+ float _vb = (b); \
+ float _delta = (epsilon) * fmax(1.0, fabs(_va)); \
+ \
+ if (fabs(_va - _vb) > _delta) { \
+ fprintf(stderr, "=== FAILED: '"#a" ≈ "#b"' at "__FILE__":%d\n" \
+ " %-31s = %f\n" \
+ " %-31s = %f\n" \
+ " %-31s = %f\n\n", \
+ __LINE__, #a, _va, #b, _vb, \
+ "epsilon "#epsilon" -> max delta", _delta); \
+ exit(1); \
+ } \
+} while (0)
+
+#define REQUIRE_STREQ(a, b) do \
+{ \
+ const char *_a = (a); \
+ const char *_b = (b); \
+ if (strcmp(_a, _b) != 0) { \
+ fprintf(stderr, "=== FAILED: !strcmp("#a", "#b") at "__FILE__":%d\n" \
+ " %-31s = %s\n" \
+ " %-31s = %s\n\n", \
+ __LINE__, #a, _a, #b, _b); \
+ exit(1); \
+ } \
+} while (0)
+
+static inline void log_array(const uint8_t *a, const uint8_t *ref, size_t off, size_t size)
+{
+ for (size_t n = 0; n < size; n++) {
+ const char *prefix = "", *suffix = "";
+ char terminator = ' ';
+ if (a[n + off] != ref[n + off]) {
+ prefix = "\033[31;1m";
+ suffix = "\033[0m";
+ }
+ if (n+1 == size || n % 16 == 15)
+ terminator = '\n';
+ fprintf(stderr, "%s%02"PRIx8"%s%c", prefix, a[n + off], suffix, terminator);
+ }
+}
+
+static inline void require_memeq(const void *aptr, const void *bptr, size_t size,
+ const char *astr, const char *bstr,
+ const char *sizestr, const char *file, int line)
+{
+ const uint8_t *a = aptr, *b = bptr;
+ for (size_t i = 0; i < size; i++) {
+ if (a[i] == b[i])
+ continue;
+
+ fprintf(stderr, "=== FAILED: memcmp(%s, %s, %s) == 0 at %s:%d\n"
+ "at position %zu: 0x%02"PRIx8" != 0x%02"PRIx8"\n\n",
+ astr, bstr, sizestr, file, line, i, a[i], b[i]);
+
+ size_t start = i >= 256 ? i - 256 : 0;
+ size_t end = PL_MIN(size, i + 256);
+ fprintf(stderr, "%zu bytes of '%s' at offset %zu:\n", end - start, astr, start);
+ log_array(a, b, start, end - start);
+ fprintf(stderr, "\n%zu bytes of '%s' at offset %zu:\n", end - start, bstr, start);
+ log_array(b, a, start, end - start);
+ exit(1);
+ }
+}
+
+#define REQUIRE_MEMEQ(a, b, size) require_memeq(a, b, size, #a, #b, #size, __FILE__, __LINE__)
+
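+// Sanity-check that an exported pl_shared_mem handle is valid for the given
+// handle type (fd-based, win32 HANDLE, or raw host pointer).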
+#define REQUIRE_HANDLE(shmem, type) \
+ switch (type) { \
+ case PL_HANDLE_FD: \
+ case PL_HANDLE_DMA_BUF: \
+ REQUIRE(shmem.handle.fd > -1); \
+ break; \
+ case PL_HANDLE_WIN32: \
+ case PL_HANDLE_WIN32_KMT: \
+ /* INVALID_HANDLE_VALUE = (-1) */ \
+ REQUIRE(shmem.handle.handle != (void *)(intptr_t) (-1)); \
+ /* fallthrough */ \
+ case PL_HANDLE_MTL_TEX: \
+ case PL_HANDLE_IOSURFACE: \
+ REQUIRE(shmem.handle.handle); \
+ break; \
+ case PL_HANDLE_HOST_PTR: \
+ REQUIRE(shmem.handle.ptr); \
+ break; \
+ }
+
+static const struct pl_av1_grain_data av1_grain_data = {
+ .num_points_y = 6,
+ .points_y = {{0, 4}, {27, 33}, {54, 55}, {67, 61}, {108, 71}, {255, 72}},
+ .chroma_scaling_from_luma = false,
+ .num_points_uv = {2, 2},
+ .points_uv = {{{0, 64}, {255, 64}}, {{0, 64}, {255, 64}}},
+ .scaling_shift = 11,
+ .ar_coeff_lag = 3,
+ .ar_coeffs_y = {4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66},
+ .ar_coeffs_uv = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127},
+ },
+ .ar_coeff_shift = 7,
+ .grain_scale_shift = 0,
+ .uv_mult = {0, 0},
+ .uv_mult_luma = {64, 64},
+ .uv_offset = {0, 0},
+};
+
+static const uint8_t h274_lower_bound = 10;
+static const uint8_t h274_upper_bound = 250;
+static const int16_t h274_values[6] = {16, 12, 14};
+
+static const struct pl_h274_grain_data h274_grain_data = {
+ .model_id = 0,
+ .blending_mode_id = 0,
+ .log2_scale_factor = 2,
+ .component_model_present = {true},
+ .num_intensity_intervals = {1},
+ .num_model_values = {3},
+ .intensity_interval_lower_bound = {&h274_lower_bound},
+ .intensity_interval_upper_bound = {&h274_upper_bound},
+ .comp_model_value = {&h274_values},
+};
+
+static const struct pl_dovi_metadata dovi_meta = {
+ .nonlinear = {{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}},
+ .linear = {{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}},
+ .comp = {
+ {
+ .num_pivots = 9,
+ .pivots = {0.0615835786, 0.129032254, 0.353861183,
+ 0.604105592, 0.854349971, 0.890518069,
+ 0.906158328, 0.913978517, 0.92082113},
+ .method = {0, 0, 0, 0, 0, 0, 0, 0},
+ .poly_coeffs = {
+ {-0.0488376617, 1.99335372, -2.41716385},
+ {-0.0141925812, 1.61829138, -1.53397191},
+ { 0.157061458, 0.63640213, -0.11302495},
+ {0.25272119, 0.246226311, 0.27281332},
+ {0.951621532, -1.35507894, 1.18898678},
+ {6.41251612, -13.6188488, 8.07336903},
+ {13.467535, -29.1869125, 16.6612244},
+ {28.2321472, -61.8516273, 34.7264938}
+ },
+ }, {
+ .num_pivots = 2,
+ .pivots = {0.0, 1.0},
+ .method = {1},
+ .mmr_order = {3},
+ .mmr_constant = {-0.500733018},
+ .mmr_coeffs = {{
+ {1.08411026, 3.80807829, 0.0881733894, -3.23097038, -0.409078479, -1.31310081, 2.71297002},
+ {-0.241833091, -3.57880807, -0.108109117, 3.13198471, 0.869203091, 1.96561158, -9.30871677},
+ {-0.177356839, 1.48970401, 0.0908923149, -0.510447979, -0.687603354, -0.934977889, 12.3544884},
+ }},
+ }, {
+ .num_pivots = 2,
+ .pivots = {0.0, 1.0},
+ .method = {1},
+ .mmr_order = {3},
+ .mmr_constant = {-1.23833287},
+ .mmr_coeffs = {{
+ {3.52909589, 0.383154511, 5.50820637, -1.02094889, -6.36386824, 0.194121242, 0.64683497},
+ {-2.57899785, -0.626081586, -6.05729723, 2.29143763, 9.14653015, -0.0507702827, -4.17724133},
+ {0.705404401, 0.341412306, 2.98387456, -1.71712542, -4.91501331, 0.1465137, 6.38665438},
+ }},
+ },
+ },
+};
+
+static const uint8_t sRGB_v2_nano_icc[] = {
+ 0x00, 0x00, 0x01, 0x9a, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00,
+ 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20,
+ 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d,
+ 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00,
+ 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6,
+ 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64,
+ 0xeb, 0x77, 0x1f, 0x3c, 0xaa, 0x53, 0x51, 0x02, 0xe9, 0x3e, 0x28, 0x6c,
+ 0x91, 0x46, 0xae, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,
+ 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5f,
+ 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x14,
+ 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x20, 0x00, 0x00, 0x00, 0x14,
+ 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x34, 0x00, 0x00, 0x00, 0x14,
+ 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x48, 0x00, 0x00, 0x00, 0x14,
+ 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34,
+ 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34,
+ 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34,
+ 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x90, 0x00, 0x00, 0x00, 0x0a,
+ 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05,
+ 0x6e, 0x52, 0x47, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xf3, 0x54, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xc9,
+ 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0xa0,
+ 0x00, 0x00, 0x38, 0xf2, 0x00, 0x00, 0x03, 0x8f, 0x58, 0x59, 0x5a, 0x20,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x96, 0x00, 0x00, 0xb7, 0x89,
+ 0x00, 0x00, 0x18, 0xda, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x24, 0xa0, 0x00, 0x00, 0x0f, 0x85, 0x00, 0x00, 0xb6, 0xc4,
+ 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x01, 0x07, 0x02, 0xb5, 0x05, 0x6b, 0x09, 0x36, 0x0e, 0x50,
+ 0x14, 0xb1, 0x1c, 0x80, 0x25, 0xc8, 0x30, 0xa1, 0x3d, 0x19, 0x4b, 0x40,
+ 0x5b, 0x27, 0x6c, 0xdb, 0x80, 0x6b, 0x95, 0xe3, 0xad, 0x50, 0xc6, 0xc2,
+ 0xe2, 0x31, 0xff, 0xff, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00,
+ 0x30, 0x00
+};
+
+#define TEST_PROFILE(arr) ((struct pl_icc_profile) { \
+ .data = (arr), \
+ .len = PL_ARRAY_SIZE(arr), \
+ .signature = (uintptr_t) (arr), \
+})
diff --git a/src/tests/tone_mapping.c b/src/tests/tone_mapping.c
new file mode 100644
index 0000000..0a48945
--- /dev/null
+++ b/src/tests/tone_mapping.c
@@ -0,0 +1,181 @@
+#include "tests.h"
+#include "log.h"
+
+#include <libplacebo/gamut_mapping.h>
+#include <libplacebo/tone_mapping.h>
+
+//#define PRINT_LUTS
+
+int main()
+{
+ pl_log log = pl_test_logger();
+
+ // PQ unit tests
+ REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, 0.0), 0.0, 1e-2);
+ REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, 1.0), 10000.0, 1e-2);
+ REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, 0.58), 203.0, 1e-2);
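+ // (PQ ~0.58 corresponds to the 203 cd/m^2 reference white of ITU-R BT.2408)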
+
+ // Test round-trip
+ for (float x = 0.0f; x < 1.0f; x += 0.01f) {
+ REQUIRE_FEQ(x, pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ,
+ pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, x)),
+ 1e-5);
+ }
+
+ static float lut[128];
+ struct pl_tone_map_params params = {
+ .constants = { PL_TONE_MAP_CONSTANTS },
+ .input_scaling = PL_HDR_PQ,
+ .output_scaling = PL_HDR_PQ,
+ .lut_size = PL_ARRAY_SIZE(lut),
+ };
+
+ // Test regular tone-mapping
+ params.input_min = pl_hdr_rescale(PL_HDR_NITS, params.input_scaling, 0.005);
+ params.input_max = pl_hdr_rescale(PL_HDR_NITS, params.input_scaling, 1000.0);
+ params.output_min = pl_hdr_rescale(PL_HDR_NORM, params.output_scaling, 0.001);
+ params.output_max = pl_hdr_rescale(PL_HDR_NORM, params.output_scaling, 1.0);
+
+ struct pl_tone_map_params params_inv = params;
+ PL_SWAP(params_inv.input_min, params_inv.output_min);
+ PL_SWAP(params_inv.input_max, params_inv.output_max);
+
+ int tested_pure_bpc = 0;
+
+ // Generate example tone mapping curves, forward and inverse
+ for (int i = 0; i < pl_num_tone_map_functions; i++) {
+ const struct pl_tone_map_function *fun = pl_tone_map_functions[i];
+ printf("Testing tone-mapping function %s\n", fun->name);
+ params.function = params_inv.function = fun;
+ pl_clock_t start = pl_clock_now();
+ pl_tone_map_generate(lut, &params);
+ pl_log_cpu_time(log, start, pl_clock_now(), "generating LUT");
+ for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) {
+ REQUIRE(isfinite(lut[j]) && !isnan(lut[j]));
+ if (j > 0)
+ REQUIRE_CMP(lut[j], >=, lut[j - 1], "f");
+#ifdef PRINT_LUTS
+ printf("%f, %f\n", j / (PL_ARRAY_SIZE(lut) - 1.0f), lut[j]);
+#endif
+ }
+
+ if (fun->map_inverse || !tested_pure_bpc++) {
+ start = pl_clock_now();
+ pl_tone_map_generate(lut, &params_inv);
+ pl_log_cpu_time(log, start, pl_clock_now(), "generating inverse LUT");
+ for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) {
+ REQUIRE(isfinite(lut[j]) && !isnan(lut[j]));
+ if (j > 0)
+ REQUIRE_CMP(lut[j], >=, lut[j - 1], "f");
+#ifdef PRINT_LUTS
+ printf("%f, %f\n", j / (PL_ARRAY_SIZE(lut) - 1.0f), lut[j]);
+#endif
+ }
+ }
+ }
+
+ // Test that `spline` is a no-op for 1:1 tone mapping
+ params.output_min = params.input_min;
+ params.output_max = params.input_max;
+ params.function = &pl_tone_map_spline;
+ pl_tone_map_generate(lut, &params);
+ for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) {
+ float x = j / (PL_ARRAY_SIZE(lut) - 1.0f);
+ x = PL_MIX(params.input_min, params.input_max, x);
+ REQUIRE_FEQ(x, lut[j], 1e-5);
+ }
+
+ // Test some gamut mapping methods
+ for (int i = 0; i < pl_num_gamut_map_functions; i++) {
+ static const float min_rgb = 0.1f, max_rgb = PL_COLOR_SDR_WHITE;
+ struct pl_gamut_map_params gamut = {
+ .function = pl_gamut_map_functions[i],
+ .input_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020),
+ .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709),
+ .min_luma = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, min_rgb),
+ .max_luma = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, max_rgb),
+ };
+
+ printf("Testing gamut-mapping function %s\n", gamut.function->name);
+
+ // Require that black maps to black and white maps to white
+ float black[3] = { gamut.min_luma, 0.0f, 0.0f };
+ float white[3] = { gamut.max_luma, 0.0f, 0.0f };
+ pl_gamut_map_sample(black, &gamut);
+ pl_gamut_map_sample(white, &gamut);
+ REQUIRE_FEQ(black[0], gamut.min_luma, 1e-4);
+ REQUIRE_FEQ(black[1], 0.0f, 1e-4);
+ REQUIRE_FEQ(black[2], 0.0f, 1e-4);
+ if (gamut.function != &pl_gamut_map_darken)
+ REQUIRE_FEQ(white[0], gamut.max_luma, 1e-4);
+ REQUIRE_FEQ(white[1], 0.0f, 1e-4);
+ REQUIRE_FEQ(white[2], 0.0f, 1e-4);
+ }
+
+ enum { LUT3D_SIZE = 65 }; // for benchmarking
+ struct pl_gamut_map_params perceptual = {
+ .function = &pl_gamut_map_perceptual,
+ .input_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020),
+ .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709),
+ .max_luma = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, 1.0f),
+ .lut_size_I = LUT3D_SIZE,
+ .lut_size_C = LUT3D_SIZE,
+ .lut_size_h = LUT3D_SIZE,
+ .lut_stride = 3,
+
+ // Set strength to maximum, because otherwise the saturation mapping
+ // code will not fully apply, invalidating the following test
+ .constants.perceptual_strength = 1.0f,
+ };
+
+ // Test that primaries round-trip for perceptual gamut mapping
+ const pl_matrix3x3 rgb2lms_src = pl_ipt_rgb2lms(&perceptual.input_gamut);
+ const pl_matrix3x3 rgb2lms_dst = pl_ipt_rgb2lms(&perceptual.output_gamut);
+ const pl_matrix3x3 lms2rgb_dst = pl_ipt_lms2rgb(&perceptual.output_gamut);
+ static const float refpoints[][3] = {
+ {1, 0, 0}, {0, 1, 0}, {0, 0, 1},
+ {0, 1, 1}, {1, 0, 1}, {1, 1, 0},
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(refpoints); i++) {
+ float c[3] = { refpoints[i][0], refpoints[i][1], refpoints[i][2] };
+ float ref[3] = { refpoints[i][0], refpoints[i][1], refpoints[i][2] };
+ printf("Testing primary: RGB {%.0f %.0f %.0f}\n", c[0], c[1], c[2]);
+ pl_matrix3x3_apply(&rgb2lms_src, c);
+ c[0] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[0]);
+ c[1] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[1]);
+ c[2] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[2]);
+ pl_matrix3x3_apply(&pl_ipt_lms2ipt, c);
+ printf("Before: ICh {%f %f %f}\n",
+ c[0], sqrtf(c[1]*c[1] + c[2]*c[2]), atan2f(c[2], c[1]));
+ pl_gamut_map_sample(c, &perceptual);
+ float rgb[3] = { c[0], c[1], c[2] };
+ pl_matrix3x3_apply(&pl_ipt_ipt2lms, rgb);
+ rgb[0] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[0]);
+ rgb[1] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[1]);
+ rgb[2] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[2]);
+ pl_matrix3x3_apply(&lms2rgb_dst, rgb);
+ const float hue = atan2f(c[2], c[1]);
+ printf("After: ICh {%f %f %f} = RGB {%f %f %f}\n",
+ c[0], sqrtf(c[1]*c[1] + c[2]*c[2]), hue, rgb[0], rgb[1], rgb[2]);
+ pl_matrix3x3_apply(&rgb2lms_dst, ref);
+ ref[0] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[0]);
+ ref[1] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[1]);
+ ref[2] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[2]);
+ pl_matrix3x3_apply(&pl_ipt_lms2ipt, ref);
+ const float hue_ref = atan2f(ref[2], ref[1]);
+ printf("Should be: ICh {%f %f %f}\n",
+ ref[0], sqrtf(ref[1]*ref[1] + ref[2]*ref[2]), hue_ref);
+ REQUIRE_FEQ(hue, hue_ref, 3.0e-3);
+ }
+
+ float *tmp = malloc(sizeof(float[LUT3D_SIZE][LUT3D_SIZE][LUT3D_SIZE][3]));
+ if (tmp) {
+ pl_clock_t start = pl_clock_now();
+ pl_gamut_map_generate(tmp, &perceptual);
+ pl_log_cpu_time(log, start, pl_clock_now(), "generating 3DLUT");
+ free(tmp);
+ }
+
+ pl_log_destroy(&log);
+}
diff --git a/src/tests/utils.c b/src/tests/utils.c
new file mode 100644
index 0000000..73a9265
--- /dev/null
+++ b/src/tests/utils.c
@@ -0,0 +1,165 @@
+#include "tests.h"
+#include "gpu.h"
+
+#include <libplacebo/utils/upload.h>
+
+int main()
+{
+ struct pl_bit_encoding bits = {0};
+ struct pl_plane_data data = {0};
+
+ static const struct pl_bit_encoding bits0 = {0};
+ static const struct pl_bit_encoding bits8 = {
+ .sample_depth = 8,
+ .color_depth = 8,
+ };
+
+ static const struct pl_bit_encoding bits16 = {
+ .sample_depth = 16,
+ .color_depth = 16,
+ };
+
+ static const struct pl_bit_encoding bits10_16 = {
+ .sample_depth = 16,
+ .color_depth = 10,
+ };
+
+ static const struct pl_bit_encoding bits10_16_6 = {
+ .sample_depth = 16,
+ .color_depth = 10,
+ .bit_shift = 6,
+ };
+
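+// TEST_ALIGN(ref, ref_align, ref_bits, masks...) decodes the component bit
+// masks into a pl_plane_data description and compares it against `ref`, then
+// byte-aligns the components with pl_plane_data_align() and compares the
+// result against `ref_align` and the inferred bit encoding against `ref_bits`.
+// TEST() is the shorthand for formats that are already aligned.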
+#define TEST_ALIGN(ref, ref_align, ref_bits, ...) \
+ do { \
+ pl_plane_data_from_mask(&data, (uint64_t[4]){ __VA_ARGS__ }); \
+ REQUIRE_MEMEQ(&data, &ref, sizeof(ref)); \
+ pl_plane_data_align(&data, &bits); \
+ REQUIRE_MEMEQ(&data, &ref_align, sizeof(ref_align)); \
+ REQUIRE_MEMEQ(&bits, &ref_bits, sizeof(bits)); \
+ } while (0)
+
+#define TEST(ref, bits, ...) TEST_ALIGN(ref, ref, bits, __VA_ARGS__)
+
+ static const struct pl_plane_data rgb8 = {
+ .component_size = {8, 8, 8},
+ .component_map = {0, 1, 2},
+ };
+
+ TEST(rgb8, bits8, 0xFF, 0xFF00, 0xFF0000);
+
+ static const struct pl_plane_data bgra8 = {
+ .component_size = {8, 8, 8, 8},
+ .component_map = {2, 1, 0, 3},
+ };
+
+ TEST(bgra8, bits8, 0xFF0000, 0xFF00, 0xFF, 0xFF000000);
+
+ static const struct pl_plane_data gr16 = {
+ .component_size = {16, 16},
+ .component_map = {1, 0},
+ };
+
+ TEST(gr16, bits16, 0xFFFF0000, 0xFFFF);
+
+ static const struct pl_plane_data r10x6g10 = {
+ .component_size = {10, 10},
+ .component_map = {1, 0}, // LSB -> MSB ordering
+ .component_pad = {0, 6},
+ };
+
+ TEST_ALIGN(r10x6g10, gr16, bits10_16, 0x03FF0000, 0x03FF);
+
+ static const struct pl_plane_data rgb565 = {
+ .component_size = {5, 6, 5},
+ .component_map = {2, 1, 0}, // LSB -> MSB ordering
+ };
+
+ TEST(rgb565, bits0, 0xF800, 0x07E0, 0x001F);
+
+ static const struct pl_plane_data rgba16 = {
+ .component_size = {16, 16, 16, 16},
+ .component_map = {0, 1, 2, 3},
+ };
+
+ TEST(rgba16, bits16, 0xFFFFllu, 0xFFFF0000llu, 0xFFFF00000000llu, 0xFFFF000000000000llu);
+
+ static const struct pl_plane_data p010 = {
+ .component_size = {10, 10, 10},
+ .component_map = {0, 1, 2},
+ .component_pad = {6, 6, 6},
+ };
+
+ static const struct pl_plane_data rgb16 = {
+ .component_size = {16, 16, 16},
+ .component_map = {0, 1, 2},
+ };
+
+ TEST_ALIGN(p010, rgb16, bits10_16_6, 0xFFC0llu, 0xFFC00000llu, 0xFFC000000000llu);
+
+ // Test GLSL structure packing
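+ // (std140 pads vec3 and matrix columns to vec4 alignment and rounds array
+ // element strides up to a multiple of vec4; std430 drops the array rounding,
+ // which is why e.g. a vec2 array keeps an 8-byte stride below.)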
+ struct pl_var vec1 = pl_var_float(""),
+ vec2 = pl_var_vec2(""),
+ vec3 = pl_var_vec3(""),
+ mat2 = pl_var_mat2(""),
+ mat3 = pl_var_mat3("");
+
+ struct pl_var_layout layout;
+ layout = pl_std140_layout(0, &vec2);
+ REQUIRE_CMP(layout.offset, ==, 0 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, 2 * sizeof(float), "zu");
+
+ layout = pl_std140_layout(3 * sizeof(float), &vec3);
+ REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, 3 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, 3 * sizeof(float), "zu");
+
+ layout = pl_std140_layout(2 * sizeof(float), &mat3);
+ REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, 3 * 4 * sizeof(float), "zu");
+
+ layout = pl_std430_layout(2 * sizeof(float), &mat3);
+ REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, 4 * 3 * sizeof(float), "zu");
+
+ layout = pl_std140_layout(3 * sizeof(float), &vec1);
+ REQUIRE_CMP(layout.offset, ==, 3 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, sizeof(float), "zu");
+
+ struct pl_var vec2a = vec2;
+ vec2a.dim_a = 50;
+
+ layout = pl_std140_layout(sizeof(float), &vec2a);
+ REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, 50 * 4 * sizeof(float), "zu");
+
+ layout = pl_std430_layout(sizeof(float), &vec2a);
+ REQUIRE_CMP(layout.offset, ==, 2 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, 50 * 2 * sizeof(float), "zu");
+
+ struct pl_var mat2a = mat2;
+ mat2a.dim_a = 20;
+
+ layout = pl_std140_layout(5 * sizeof(float), &mat2a);
+ REQUIRE_CMP(layout.offset, ==, 8 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, 20 * 2 * 4 * sizeof(float), "zu");
+
+ layout = pl_std430_layout(5 * sizeof(float), &mat2a);
+ REQUIRE_CMP(layout.offset, ==, 6 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu");
+ REQUIRE_CMP(layout.size, ==, 20 * 2 * 2 * sizeof(float), "zu");
+
+ for (const struct pl_named_var *nvar = pl_var_glsl_types; nvar->glsl_name; nvar++) {
+ struct pl_var var = nvar->var;
+ REQUIRE_CMP(nvar->glsl_name, ==, pl_var_glsl_type_name(var), "s");
+ var.dim_a = 100;
+ REQUIRE_CMP(nvar->glsl_name, ==, pl_var_glsl_type_name(var), "s");
+ }
+}
diff --git a/src/tests/vulkan.c b/src/tests/vulkan.c
new file mode 100644
index 0000000..476560a
--- /dev/null
+++ b/src/tests/vulkan.c
@@ -0,0 +1,296 @@
+#include <vulkan/vulkan.h>
+
+#include "gpu_tests.h"
+#include "vulkan/command.h"
+#include "vulkan/gpu.h"
+
+#include <libplacebo/vulkan.h>
+
+static void vulkan_interop_tests(pl_vulkan pl_vk,
+ enum pl_handle_type handle_type)
+{
+ pl_gpu gpu = pl_vk->gpu;
+ printf("testing vulkan interop for handle type 0x%x\n", handle_type);
+
+ if (gpu->export_caps.buf & handle_type) {
+ pl_buf buf = pl_buf_create(gpu, pl_buf_params(
+ .size = 1024,
+ .export_handle = handle_type,
+ ));
+
+ REQUIRE(buf);
+ REQUIRE_HANDLE(buf->shared_mem, handle_type);
+ REQUIRE_CMP(buf->shared_mem.size, >=, buf->params.size, "zu");
+ REQUIRE(pl_buf_export(gpu, buf));
+ pl_buf_destroy(gpu, &buf);
+ }
+
+ pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 1, 0, 0, PL_FMT_CAP_BLITTABLE);
+ if (!fmt)
+ return;
+
+ if (gpu->export_caps.sync & handle_type) {
+ pl_sync sync = pl_sync_create(gpu, handle_type);
+ pl_tex tex = pl_tex_create(gpu, pl_tex_params(
+ .w = 32,
+ .h = 32,
+ .format = fmt,
+ .blit_dst = true,
+ ));
+
+ REQUIRE(sync);
+ REQUIRE(tex);
+
+ // Note: For testing purposes, we have to fool pl_tex_export into
+ // thinking this texture is actually exportable. Just hack it in
+ // horribly.
+ ((struct pl_tex_params *) &tex->params)->export_handle = PL_HANDLE_DMA_BUF;
+
+ REQUIRE(pl_tex_export(gpu, tex, sync));
+
+ // Re-use our internal helpers to signal this VkSemaphore
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ struct vk_cmd *cmd = vk_cmd_begin(vk->pool_graphics, NULL);
+ REQUIRE(cmd);
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+ vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_NONE, (pl_vulkan_sem){ sync_vk->signal });
+ REQUIRE(vk_cmd_submit(&cmd));
+
+ // Do something with the image again to "import" it
+ pl_tex_clear(gpu, tex, (float[4]){0});
+ pl_gpu_finish(gpu);
+ REQUIRE(!pl_tex_poll(gpu, tex, 0));
+
+ pl_sync_destroy(gpu, &sync);
+ pl_tex_destroy(gpu, &tex);
+ }
+
+ // Test interop API
+ if (gpu->export_caps.tex & handle_type) {
+ VkSemaphore sem = pl_vulkan_sem_create(gpu, pl_vulkan_sem_params(
+ .type = VK_SEMAPHORE_TYPE_TIMELINE,
+ .initial_value = 0,
+ ));
+
+ pl_tex tex = pl_tex_create(gpu, pl_tex_params(
+ .w = 32,
+ .h = 32,
+ .format = fmt,
+ .blit_dst = true,
+ .export_handle = handle_type,
+ ));
+
+ REQUIRE(sem);
+ REQUIRE(tex);
+
+ REQUIRE(pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ .tex = tex,
+ .layout = VK_IMAGE_LAYOUT_GENERAL,
+ .qf = VK_QUEUE_FAMILY_EXTERNAL,
+ .semaphore = { sem, 1 },
+ )));
+
+ pl_vulkan_release_ex(gpu, pl_vulkan_release_params(
+ .tex = tex,
+ .layout = VK_IMAGE_LAYOUT_GENERAL,
+ .qf = VK_QUEUE_FAMILY_EXTERNAL,
+ .semaphore = { sem, 1 },
+ ));
+
+ pl_tex_clear(gpu, tex, (float[4]){0});
+ pl_gpu_finish(gpu);
+ REQUIRE(!pl_tex_poll(gpu, tex, 0));
+
+ pl_vulkan_sem_destroy(gpu, &sem);
+ pl_tex_destroy(gpu, &tex);
+ }
+}
+
+static void vulkan_swapchain_tests(pl_vulkan vk, VkSurfaceKHR surf)
+{
+ if (!surf)
+ return;
+
+ printf("testing vulkan swapchain\n");
+ pl_gpu gpu = vk->gpu;
+ pl_swapchain sw;
+ sw = pl_vulkan_create_swapchain(vk, pl_vulkan_swapchain_params(
+ .surface = surf,
+ ));
+ REQUIRE(sw);
+
+ // Attempt actually initializing the swapchain
+ int w = 640, h = 480;
+ REQUIRE(pl_swapchain_resize(sw, &w, &h));
+
+ for (int i = 0; i < 10; i++) {
+ struct pl_swapchain_frame frame;
+ REQUIRE(pl_swapchain_start_frame(sw, &frame));
+ if (frame.fbo->params.blit_dst)
+ pl_tex_clear(gpu, frame.fbo, (float[4]){0});
+
+ // TODO: test this with an actual pl_renderer instance
+ struct pl_frame target;
+ pl_frame_from_swapchain(&target, &frame);
+
+ REQUIRE(pl_swapchain_submit_frame(sw));
+ pl_swapchain_swap_buffers(sw);
+
+ // Try resizing the swapchain in the middle of rendering
+ if (i == 5) {
+ w = 320;
+ h = 240;
+ REQUIRE(pl_swapchain_resize(sw, &w, &h));
+ }
+ }
+
+ pl_swapchain_destroy(&sw);
+}
+
+int main()
+{
+ pl_log log = pl_test_logger();
+ pl_vk_inst inst = pl_vk_inst_create(log, pl_vk_inst_params(
+ .debug = true,
+ .debug_extra = true,
+ .get_proc_addr = vkGetInstanceProcAddr,
+ .opt_extensions = (const char *[]){
+ VK_KHR_SURFACE_EXTENSION_NAME,
+ VK_EXT_HEADLESS_SURFACE_EXTENSION_NAME,
+ },
+ .num_opt_extensions = 2,
+ ));
+
+ if (!inst)
+ return SKIP;
+
+ PL_VK_LOAD_FUN(inst->instance, EnumeratePhysicalDevices, inst->get_proc_addr);
+ PL_VK_LOAD_FUN(inst->instance, GetPhysicalDeviceProperties, inst->get_proc_addr);
+
+ uint32_t num = 0;
+ EnumeratePhysicalDevices(inst->instance, &num, NULL);
+ if (!num)
+ return SKIP;
+
+ VkPhysicalDevice *devices = calloc(num, sizeof(*devices));
+ if (!devices)
+ return 1;
+ EnumeratePhysicalDevices(inst->instance, &num, devices);
+
+ VkSurfaceKHR surf = VK_NULL_HANDLE;
+
+ PL_VK_LOAD_FUN(inst->instance, CreateHeadlessSurfaceEXT, inst->get_proc_addr);
+ if (CreateHeadlessSurfaceEXT) {
+ VkHeadlessSurfaceCreateInfoEXT info = {
+ .sType = VK_STRUCTURE_TYPE_HEADLESS_SURFACE_CREATE_INFO_EXT,
+ };
+
+ VkResult res = CreateHeadlessSurfaceEXT(inst->instance, &info, NULL, &surf);
+ REQUIRE_CMP(res, ==, VK_SUCCESS, "u");
+ }
+
+ // Make sure choosing any device works
+ VkPhysicalDevice dev;
+ dev = pl_vulkan_choose_device(log, pl_vulkan_device_params(
+ .instance = inst->instance,
+ .get_proc_addr = inst->get_proc_addr,
+ .allow_software = true,
+ .surface = surf,
+ ));
+ if (!dev)
+ return SKIP;
+
+ // Test all attached devices
+ for (int i = 0; i < num; i++) {
+ VkPhysicalDeviceProperties props = {0};
+ GetPhysicalDeviceProperties(devices[i], &props);
+#ifndef CI_ALLOW_SW
+ if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU) {
+ printf("Skipping device %d: %s\n", i, props.deviceName);
+ continue;
+ }
+#endif
+ printf("Testing device %d: %s\n", i, props.deviceName);
+
+ // Make sure we can choose this device by name
+ dev = pl_vulkan_choose_device(log, pl_vulkan_device_params(
+ .instance = inst->instance,
+ .get_proc_addr = inst->get_proc_addr,
+ .device_name = props.deviceName,
+ ));
+ REQUIRE_CMP(dev, ==, devices[i], "p");
+
+ struct pl_vulkan_params params = *pl_vulkan_params(
+ .instance = inst->instance,
+ .get_proc_addr = inst->get_proc_addr,
+ .device = devices[i],
+ .queue_count = 8, // test inter-queue stuff
+ .surface = surf,
+ );
+
+ pl_vulkan vk = pl_vulkan_create(log, &params);
+ if (!vk)
+ continue;
+
+ gpu_shader_tests(vk->gpu);
+ vulkan_swapchain_tests(vk, surf);
+
+ // Print heap statistics
+ pl_vk_print_heap(vk->gpu, PL_LOG_DEBUG);
+
+ // Test importing this context via the vulkan interop API
+ pl_vulkan vk2 = pl_vulkan_import(log, pl_vulkan_import_params(
+ .instance = vk->instance,
+ .get_proc_addr = inst->get_proc_addr,
+ .phys_device = vk->phys_device,
+ .device = vk->device,
+
+ .extensions = vk->extensions,
+ .num_extensions = vk->num_extensions,
+ .features = vk->features,
+ .queue_graphics = vk->queue_graphics,
+ .queue_compute = vk->queue_compute,
+ .queue_transfer = vk->queue_transfer,
+ ));
+ REQUIRE(vk2);
+ pl_vulkan_destroy(&vk2);
+
+ // Run these tests last because they disable some validation layers
+#ifdef PL_HAVE_UNIX
+ vulkan_interop_tests(vk, PL_HANDLE_FD);
+ vulkan_interop_tests(vk, PL_HANDLE_DMA_BUF);
+#endif
+#ifdef PL_HAVE_WIN32
+ vulkan_interop_tests(vk, PL_HANDLE_WIN32);
+ vulkan_interop_tests(vk, PL_HANDLE_WIN32_KMT);
+#endif
+ gpu_interop_tests(vk->gpu);
+ pl_vulkan_destroy(&vk);
+
+ // Re-run the same export/import tests with async queues disabled
+ params.async_compute = false;
+ params.async_transfer = false;
+ vk = pl_vulkan_create(log, &params);
+ REQUIRE(vk); // it succeeded the first time
+
+#ifdef PL_HAVE_UNIX
+ vulkan_interop_tests(vk, PL_HANDLE_FD);
+ vulkan_interop_tests(vk, PL_HANDLE_DMA_BUF);
+#endif
+#ifdef PL_HAVE_WIN32
+ vulkan_interop_tests(vk, PL_HANDLE_WIN32);
+ vulkan_interop_tests(vk, PL_HANDLE_WIN32_KMT);
+#endif
+ gpu_interop_tests(vk->gpu);
+ pl_vulkan_destroy(&vk);
+
+ // Reduce log spam after first tested device
+ pl_log_level_update(log, PL_LOG_INFO);
+ }
+
+ if (surf)
+ vkDestroySurfaceKHR(inst->instance, surf, NULL);
+ pl_vk_inst_destroy(&inst);
+ pl_log_destroy(&log);
+ free(devices);
+}
diff --git a/src/tone_mapping.c b/src/tone_mapping.c
new file mode 100644
index 0000000..f08bb58
--- /dev/null
+++ b/src/tone_mapping.c
@@ -0,0 +1,775 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+
+#include "common.h"
+
+#include <libplacebo/tone_mapping.h>
+
+#define fclampf(x, lo, hi) fminf(fmaxf(x, lo), hi)
+static void fix_constants(struct pl_tone_map_constants *c)
+{
+ const float eps = 1e-6f;
+ c->knee_adaptation = fclampf(c->knee_adaptation, 0.0f, 1.0f);
+ c->knee_minimum = fclampf(c->knee_minimum, eps, 0.5f - eps);
+ c->knee_maximum = fclampf(c->knee_maximum, 0.5f + eps, 1.0f - eps);
+ c->knee_default = fclampf(c->knee_default, c->knee_minimum, c->knee_maximum);
+ c->knee_offset = fclampf(c->knee_offset, 0.5f, 2.0f);
+ c->slope_tuning = fclampf(c->slope_tuning, 0.0f, 10.0f);
+ c->slope_offset = fclampf(c->slope_offset, 0.0f, 1.0f);
+ c->spline_contrast = fclampf(c->spline_contrast, 0.0f, 1.5f);
+ c->reinhard_contrast = fclampf(c->reinhard_contrast, eps, 1.0f - eps);
+ c->linear_knee = fclampf(c->linear_knee, eps, 1.0f - eps);
+ c->exposure = fclampf(c->exposure, eps, 10.0f);
+}
+
+static inline bool constants_equal(const struct pl_tone_map_constants *a,
+ const struct pl_tone_map_constants *b)
+{
+ pl_static_assert(sizeof(*a) % sizeof(float) == 0);
+ return !memcmp(a, b, sizeof(*a));
+}
+
+bool pl_tone_map_params_equal(const struct pl_tone_map_params *a,
+ const struct pl_tone_map_params *b)
+{
+ return a->function == b->function &&
+ a->param == b->param &&
+ a->input_scaling == b->input_scaling &&
+ a->output_scaling == b->output_scaling &&
+ a->lut_size == b->lut_size &&
+ a->input_min == b->input_min &&
+ a->input_max == b->input_max &&
+ a->input_avg == b->input_avg &&
+ a->output_min == b->output_min &&
+ a->output_max == b->output_max &&
+ constants_equal(&a->constants, &b->constants) &&
+ pl_hdr_metadata_equal(&a->hdr, &b->hdr);
+}
+
+bool pl_tone_map_params_noop(const struct pl_tone_map_params *p)
+{
+ float in_min = pl_hdr_rescale(p->input_scaling, PL_HDR_NITS, p->input_min);
+ float in_max = pl_hdr_rescale(p->input_scaling, PL_HDR_NITS, p->input_max);
+ float out_min = pl_hdr_rescale(p->output_scaling, PL_HDR_NITS, p->output_min);
+ float out_max = pl_hdr_rescale(p->output_scaling, PL_HDR_NITS, p->output_max);
+ bool can_inverse = p->function->map_inverse;
+
+ return fabs(in_min - out_min) < 1e-4 && // no BPC
+ in_max < out_max + 1e-2 && // no range reduction
+ (out_max < in_max + 1e-2 || !can_inverse); // no inverse tone-mapping
+}
+
+void pl_tone_map_params_infer(struct pl_tone_map_params *par)
+{
+ if (!par->function)
+ par->function = &pl_tone_map_clip;
+
+ if (par->param) {
+ // Backwards compatibility for older API
+ if (par->function == &pl_tone_map_st2094_40 || par->function == &pl_tone_map_st2094_10)
+ par->constants.knee_adaptation = par->param;
+ if (par->function == &pl_tone_map_bt2390)
+ par->constants.knee_offset = par->param;
+ if (par->function == &pl_tone_map_spline)
+ par->constants.spline_contrast = par->param;
+ if (par->function == &pl_tone_map_reinhard)
+ par->constants.reinhard_contrast = par->param;
+ if (par->function == &pl_tone_map_mobius || par->function == &pl_tone_map_gamma)
+ par->constants.linear_knee = par->param;
+ if (par->function == &pl_tone_map_linear || par->function == &pl_tone_map_linear_light)
+ par->constants.exposure = par->param;
+ }
+
+ fix_constants(&par->constants);
+
+ // Constrain the input peak to be no less than target SDR white
+ float sdr = pl_hdr_rescale(par->output_scaling, par->input_scaling, par->output_max);
+ sdr = fminf(sdr, pl_hdr_rescale(PL_HDR_NITS, par->input_scaling, PL_COLOR_SDR_WHITE));
+ par->input_max = fmaxf(par->input_max, sdr);
+
+ // Constrain the output peak if function does not support inverse mapping
+ if (!par->function->map_inverse)
+ par->output_max = fminf(par->output_max, par->input_max);
+}
+
+// Infer params and rescale to function scaling
+static struct pl_tone_map_params fix_params(const struct pl_tone_map_params *params)
+{
+ struct pl_tone_map_params fixed = *params;
+ pl_tone_map_params_infer(&fixed);
+
+ const struct pl_tone_map_function *fun = params->function;
+ fixed.input_scaling = fun->scaling;
+ fixed.output_scaling = fun->scaling;
+ fixed.input_min = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_min);
+ fixed.input_max = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_max);
+ fixed.input_avg = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_avg);
+ fixed.output_min = pl_hdr_rescale(params->output_scaling, fun->scaling, fixed.output_min);
+ fixed.output_max = pl_hdr_rescale(params->output_scaling, fun->scaling, fixed.output_max);
+
+ return fixed;
+}
+
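+// Iterate over all `params->lut_size` LUT entries, exposing each value as `V`
+// and writing the (possibly modified) value back on every iteration.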
+#define FOREACH_LUT(lut, V) \
+ for (float *_iter = lut, *_end = lut + params->lut_size, V; \
+ _iter < _end && ( V = *_iter, 1 ); *_iter++ = V)
+
+static void map_lut(float *lut, const struct pl_tone_map_params *params)
+{
+ if (params->output_max > params->input_max + 1e-4) {
+ // Inverse tone-mapping
+ pl_assert(params->function->map_inverse);
+ params->function->map_inverse(lut, params);
+ } else {
+ // Forward tone-mapping
+ params->function->map(lut, params);
+ }
+}
+
+void pl_tone_map_generate(float *out, const struct pl_tone_map_params *params)
+{
+ struct pl_tone_map_params fixed = fix_params(params);
+
+ // Generate input values evenly spaced in `params->input_scaling`
+ for (size_t i = 0; i < params->lut_size; i++) {
+ float x = (float) i / (params->lut_size - 1);
+ x = PL_MIX(params->input_min, params->input_max, x);
+ out[i] = pl_hdr_rescale(params->input_scaling, fixed.function->scaling, x);
+ }
+
+ map_lut(out, &fixed);
+
+ // Sanitize outputs and adapt back to `params->scaling`
+ for (size_t i = 0; i < params->lut_size; i++) {
+ float x = PL_CLAMP(out[i], fixed.output_min, fixed.output_max);
+ out[i] = pl_hdr_rescale(fixed.function->scaling, params->output_scaling, x);
+ }
+}
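+
+// Illustrative usage sketch (not part of the library): generate a small
+// forward LUT mapping a hypothetical 1000-nit PQ source down to SDR white.
+// The specific black/peak values here are made up for the example.
+//
+//     float lut[256];
+//     struct pl_tone_map_params params = {
+//         .function       = &pl_tone_map_bt2390,
+//         .input_scaling  = PL_HDR_PQ,
+//         .output_scaling = PL_HDR_PQ,
+//         .lut_size       = 256,
+//         .input_min      = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 0.005f),
+//         .input_max      = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 1000.0f),
+//         .output_min     = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 0.2f),
+//         .output_max     = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, PL_COLOR_SDR_WHITE),
+//     };
+//     pl_tone_map_generate(lut, &params);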
+
+float pl_tone_map_sample(float x, const struct pl_tone_map_params *params)
+{
+ struct pl_tone_map_params fixed = fix_params(params);
+ fixed.lut_size = 1;
+
+ x = PL_CLAMP(x, params->input_min, params->input_max);
+ x = pl_hdr_rescale(params->input_scaling, fixed.function->scaling, x);
+ map_lut(&x, &fixed);
+ x = PL_CLAMP(x, fixed.output_min, fixed.output_max);
+ x = pl_hdr_rescale(fixed.function->scaling, params->output_scaling, x);
+ return x;
+}
+
+// Rescale from input-absolute to input-relative
+static inline float rescale_in(float x, const struct pl_tone_map_params *params)
+{
+ return (x - params->input_min) / (params->input_max - params->input_min);
+}
+
+// Rescale from input-absolute to output-relative
+static inline float rescale(float x, const struct pl_tone_map_params *params)
+{
+ return (x - params->input_min) / (params->output_max - params->output_min);
+}
+
+// Rescale from output-relative to output-absolute
+static inline float rescale_out(float x, const struct pl_tone_map_params *params)
+{
+ return x * (params->output_max - params->output_min) + params->output_min;
+}
+
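+// BT.1886-style transfer pair: bt1886_eotf() maps a [0,1] signal onto the
+// luminance range [min, max] via a gamma-2.4 curve with black lift, and
+// bt1886_oetf() is its exact inverse.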
+static inline float bt1886_eotf(float x, float min, float max)
+{
+ const float lb = powf(min, 1/2.4f);
+ const float lw = powf(max, 1/2.4f);
+ return powf((lw - lb) * x + lb, 2.4f);
+}
+
+static inline float bt1886_oetf(float x, float min, float max)
+{
+ const float lb = powf(min, 1/2.4f);
+ const float lw = powf(max, 1/2.4f);
+ return (powf(x, 1/2.4f) - lb) / (lw - lb);
+}
+
+static void noop(float *lut, const struct pl_tone_map_params *params)
+{
+ return;
+}
+
+const struct pl_tone_map_function pl_tone_map_clip = {
+ .name = "clip",
+ .description = "No tone mapping (clip)",
+ .map = noop,
+ .map_inverse = noop,
+};
+
+// Helper function to pick a knee point (for suitable methods) based on the
+// HDR10+ brightness metadata, by matching the scene's average brightness.
+//
+// Inspired by SMPTE ST2094-10, with some modifications
+static void st2094_pick_knee(float *out_src_knee, float *out_dst_knee,
+ const struct pl_tone_map_params *params)
+{
+ const float src_min = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_min);
+ const float src_max = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_max);
+ const float src_avg = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_avg);
+ const float dst_min = pl_hdr_rescale(params->output_scaling, PL_HDR_PQ, params->output_min);
+ const float dst_max = pl_hdr_rescale(params->output_scaling, PL_HDR_PQ, params->output_max);
+
+ const float min_knee = params->constants.knee_minimum;
+ const float max_knee = params->constants.knee_maximum;
+ const float def_knee = params->constants.knee_default;
+ const float src_knee_min = PL_MIX(src_min, src_max, min_knee);
+ const float src_knee_max = PL_MIX(src_min, src_max, max_knee);
+ const float dst_knee_min = PL_MIX(dst_min, dst_max, min_knee);
+ const float dst_knee_max = PL_MIX(dst_min, dst_max, max_knee);
+
+ // Choose source knee based on source scene brightness
+ float src_knee = PL_DEF(src_avg, PL_MIX(src_min, src_max, def_knee));
+ src_knee = fclampf(src_knee, src_knee_min, src_knee_max);
+
+ // Choose target adaptation point based on linearly re-scaling source knee
+ float target = (src_knee - src_min) / (src_max - src_min);
+ float adapted = PL_MIX(dst_min, dst_max, target);
+
+ // Choose the destination knee by picking the perceptual adaptation point
+ // between the source knee and the desired target. This moves the knee
+ // point, on the vertical axis, closer to the 1:1 (neutral) line.
+ //
+ // Adjust the adaptation strength towards 1 based on how close the knee
+ // point is to its extreme values (min/max knee)
+ float tuning = 1.0f - pl_smoothstep(max_knee, def_knee, target) *
+ pl_smoothstep(min_knee, def_knee, target);
+ float adaptation = PL_MIX(params->constants.knee_adaptation, 1.0f, tuning);
+ float dst_knee = PL_MIX(src_knee, adapted, adaptation);
+ dst_knee = fclampf(dst_knee, dst_knee_min, dst_knee_max);
+
+ *out_src_knee = pl_hdr_rescale(PL_HDR_PQ, params->input_scaling, src_knee);
+ *out_dst_knee = pl_hdr_rescale(PL_HDR_PQ, params->output_scaling, dst_knee);
+}
+
+// Pascal's triangle
+static const uint16_t binom[17][17] = {
+ {1},
+ {1,1},
+ {1,2,1},
+ {1,3,3,1},
+ {1,4,6,4,1},
+ {1,5,10,10,5,1},
+ {1,6,15,20,15,6,1},
+ {1,7,21,35,35,21,7,1},
+ {1,8,28,56,70,56,28,8,1},
+ {1,9,36,84,126,126,84,36,9,1},
+ {1,10,45,120,210,252,210,120,45,10,1},
+ {1,11,55,165,330,462,462,330,165,55,11,1},
+ {1,12,66,220,495,792,924,792,495,220,66,12,1},
+ {1,13,78,286,715,1287,1716,1716,1287,715,286,78,13,1},
+ {1,14,91,364,1001,2002,3003,3432,3003,2002,1001,364,91,14,1},
+ {1,15,105,455,1365,3003,5005,6435,6435,5005,3003,1365,455,105,15,1},
+ {1,16,120,560,1820,4368,8008,11440,12870,11440,8008,4368,1820,560,120,16,1},
+};
+
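+// Pick the first Bezier anchor P[1] such that the curve's slope at the knee
+// matches the linear section: the derivative of an order-N Bezier at t=0 is
+// N * P[1], so P[1] = slope / N (with the slope rescaled into the Bezier's
+// normalized [Kx,1] x [Ky,1] coordinate space), clamped to stay in range.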
+static inline float st2094_intercept(uint8_t N, float Kx, float Ky)
+{
+ if (Kx <= 0 || Ky >= 1)
+ return 1.0f / N;
+
+ const float slope = Ky / Kx * (1 - Kx) / (1 - Ky);
+ return fminf(slope / N, 1.0f);
+}
+
+static void st2094_40(float *lut, const struct pl_tone_map_params *params)
+{
+ const float D = params->output_max;
+
+ // Allocate space for the adjusted bezier control points, plus endpoints
+ float P[17], Kx, Ky, T;
+ uint8_t N;
+
+ if (params->hdr.ootf.num_anchors) {
+
+ // Use bezier curve from metadata
+ Kx = PL_CLAMP(params->hdr.ootf.knee_x, 0, 1);
+ Ky = PL_CLAMP(params->hdr.ootf.knee_y, 0, 1);
+ T = PL_CLAMP(params->hdr.ootf.target_luma, params->input_min, params->input_max);
+ N = params->hdr.ootf.num_anchors + 1;
+ pl_assert(N < PL_ARRAY_SIZE(P));
+ memcpy(P + 1, params->hdr.ootf.anchors, (N - 1) * sizeof(*P));
+ P[0] = 0.0f;
+ P[N] = 1.0f;
+
+ } else {
+
+ // Missing metadata, default to simple brightness matching
+ float src_knee, dst_knee;
+ st2094_pick_knee(&src_knee, &dst_knee, params);
+ Kx = src_knee / params->input_max;
+ Ky = dst_knee / params->output_max;
+
+ // Solve spline to match slope at knee intercept
+ const float slope = Ky / Kx * (1 - Kx) / (1 - Ky);
+ N = PL_CLAMP((int) ceilf(slope), 2, PL_ARRAY_SIZE(P) - 1);
+ P[0] = 0.0f;
+ P[1] = st2094_intercept(N, Kx, Ky);
+ for (int i = 2; i <= N; i++)
+ P[i] = 1.0f;
+ T = D;
+
+ }
+
+ if (D < T) {
+
+ // Output display darker than OOTF target, make brighter
+ const float Dmin = 0.0f, u = fmaxf(0.0f, (D - Dmin) / (T - Dmin));
+
+ // Scale down the knee point to make more room for the OOTF
+ Kx *= u;
+ Ky *= u;
+
+ // Make the slope of the knee more closely approximate a clip(),
+ // constrained to avoid exploding P[1]
+ const float beta = N * Kx / (1 - Kx);
+ const float Kxy = fminf(Kx * params->input_max / D, beta / (beta + 1));
+ Ky = PL_MIX(Kxy, Ky, u);
+
+ for (int p = 2; p <= N; p++)
+ P[p] = PL_MIX(1.0f, P[p], u);
+
+ // Make the OOTF intercept linear as D -> Dmin
+ P[1] = PL_MIX(st2094_intercept(N, Kx, Ky), P[1], u);
+
+ } else if (D > T) {
+
+ // Output display brighter than OOTF target, make more linear
+ pl_assert(params->input_max > T);
+ const float w = powf(1 - (D - T) / (params->input_max - T), 1.4f);
+
+ // Constrain the slope of the input knee to prevent it from
+ // exploding and making the picture way too bright
+ Ky *= T / D;
+
+ // Make the slope of the knee more linear by solving for f(Kx) = Kx
+ float Kxy = Kx * D / params->input_max;
+ Ky = PL_MIX(Kxy, Ky, w);
+
+ for (int p = 2; p < N; p++) {
+ float anchor_lin = (float) p / N;
+ P[p] = PL_MIX(anchor_lin, P[p], w);
+ }
+
+ // Make the OOTF intercept linear as D -> input_max
+ P[1] = PL_MIX(st2094_intercept(N, Kx, Ky), P[1], w);
+
+ }
+
+ pl_assert(Kx >= 0 && Kx <= 1);
+ pl_assert(Ky >= 0 && Ky <= 1);
+
+ FOREACH_LUT(lut, x) {
+ x = bt1886_oetf(x, params->input_min, params->input_max);
+ x = bt1886_eotf(x, 0.0f, 1.0f);
+
+ if (x <= Kx && Kx) {
+ // Linear section
+ x *= Ky / Kx;
+ } else {
+ // Bezier section
+ const float t = (x - Kx) / (1 - Kx);
+
+ x = 0; // Bn
+ for (uint8_t p = 0; p <= N; p++)
+ x += binom[N][p] * powf(t, p) * powf(1 - t, N - p) * P[p];
+
+ x = Ky + (1 - Ky) * x;
+ }
+
+ x = bt1886_oetf(x, 0.0f, 1.0f);
+ x = bt1886_eotf(x, params->output_min, params->output_max);
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_st2094_40 = {
+ .name = "st2094-40",
+ .description = "SMPTE ST 2094-40 Annex B",
+ .param_desc = "Knee point target",
+ .param_min = 0.00f,
+ .param_def = 0.70f,
+ .param_max = 1.00f,
+ .scaling = PL_HDR_NITS,
+ .map = st2094_40,
+};
+
+static void st2094_10(float *lut, const struct pl_tone_map_params *params)
+{
+ float src_knee, dst_knee;
+ st2094_pick_knee(&src_knee, &dst_knee, params);
+
+ const float x1 = params->input_min;
+ const float x3 = params->input_max;
+ const float x2 = src_knee;
+
+ const float y1 = params->output_min;
+ const float y3 = params->output_max;
+ const float y2 = dst_knee;
+
+ const pl_matrix3x3 cmat = {{
+ { x2*x3*(y2 - y3), x1*x3*(y3 - y1), x1*x2*(y1 - y2) },
+ { x3*y3 - x2*y2, x1*y1 - x3*y3, x2*y2 - x1*y1 },
+ { x3 - x2, x1 - x3, x2 - x1 },
+ }};
+
+ float coeffs[3] = { y1, y2, y3 };
+ pl_matrix3x3_apply(&cmat, coeffs);
+
+ const float k = 1.0 / (x3*y3*(x1 - x2) + x2*y2*(x3 - x1) + x1*y1*(x2 - x3));
+ const float c1 = k * coeffs[0];
+ const float c2 = k * coeffs[1];
+ const float c3 = k * coeffs[2];
+
+ FOREACH_LUT(lut, x)
+ x = (c1 + c2 * x) / (1 + c3 * x);
+}
+
+const struct pl_tone_map_function pl_tone_map_st2094_10 = {
+ .name = "st2094-10",
+ .description = "SMPTE ST 2094-10 Annex B.2",
+ .param_desc = "Knee point target",
+ .param_min = 0.00f,
+ .param_def = 0.70f,
+ .param_max = 1.00f,
+ .scaling = PL_HDR_NITS,
+ .map = st2094_10,
+};
+
+static void bt2390(float *lut, const struct pl_tone_map_params *params)
+{
+ const float minLum = rescale_in(params->output_min, params);
+ const float maxLum = rescale_in(params->output_max, params);
+ const float offset = params->constants.knee_offset;
+ const float ks = (1 + offset) * maxLum - offset;
+ const float bp = minLum > 0 ? fminf(1 / minLum, 4) : 4;
+ const float gain_inv = 1 + minLum / maxLum * powf(1 - maxLum, bp);
+ const float gain = maxLum < 1 ? 1 / gain_inv : 1;
+
+ FOREACH_LUT(lut, x) {
+ x = rescale_in(x, params);
+
+ // Piece-wise hermite spline
+ if (ks < 1) {
+ float tb = (x - ks) / (1 - ks);
+ float tb2 = tb * tb;
+ float tb3 = tb2 * tb;
+ float pb = (2 * tb3 - 3 * tb2 + 1) * ks +
+ (tb3 - 2 * tb2 + tb) * (1 - ks) +
+ (-2 * tb3 + 3 * tb2) * maxLum;
+ x = x < ks ? x : pb;
+ }
+
+ // Black point adaptation
+ if (x < 1) {
+ x += minLum * powf(1 - x, bp);
+ x = gain * (x - minLum) + minLum;
+ }
+
+ x = x * (params->input_max - params->input_min) + params->input_min;
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_bt2390 = {
+ .name = "bt2390",
+ .description = "ITU-R BT.2390 EETF",
+ .scaling = PL_HDR_PQ,
+ .param_desc = "Knee offset",
+ .param_min = 0.50,
+ .param_def = 1.00,
+ .param_max = 2.00,
+ .map = bt2390,
+};
+
+static void bt2446a(float *lut, const struct pl_tone_map_params *params)
+{
+ const float phdr = 1 + 32 * powf(params->input_max / 10000, 1/2.4f);
+ const float psdr = 1 + 32 * powf(params->output_max / 10000, 1/2.4f);
+
+ FOREACH_LUT(lut, x) {
+ x = powf(rescale_in(x, params), 1/2.4f);
+ x = logf(1 + (phdr - 1) * x) / logf(phdr);
+
+ if (x <= 0.7399f) {
+ x = 1.0770f * x;
+ } else if (x < 0.9909f) {
+ x = (-1.1510f * x + 2.7811f) * x - 0.6302f;
+ } else {
+ x = 0.5f * x + 0.5f;
+ }
+
+ x = (powf(psdr, x) - 1) / (psdr - 1);
+ x = bt1886_eotf(x, params->output_min, params->output_max);
+ }
+}
+
+static void bt2446a_inv(float *lut, const struct pl_tone_map_params *params)
+{
+ FOREACH_LUT(lut, x) {
+ x = bt1886_oetf(x, params->input_min, params->input_max);
+ x *= 255.0;
+ if (x > 70) {
+ x = powf(x, (2.8305e-6f * x - 7.4622e-4f) * x + 1.2528f);
+ } else {
+ x = powf(x, (1.8712e-5f * x - 2.7334e-3f) * x + 1.3141f);
+ }
+ x = powf(x / 1000, 2.4f);
+ x = rescale_out(x, params);
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_bt2446a = {
+ .name = "bt2446a",
+ .description = "ITU-R BT.2446 Method A",
+ .scaling = PL_HDR_NITS,
+ .map = bt2446a,
+ .map_inverse = bt2446a_inv,
+};
+
+static void spline(float *lut, const struct pl_tone_map_params *params)
+{
+ float src_pivot, dst_pivot;
+ st2094_pick_knee(&src_pivot, &dst_pivot, params);
+
+ // Solve for linear knee (Pa = 0)
+ float slope = (dst_pivot - params->output_min) /
+ (src_pivot - params->input_min);
+
+ // Tune the slope at the knee point slightly: raise it to a user-provided
+ // gamma exponent, multiplied by an extra tuning coefficient designed to
+ // make the slope closer to 1.0 when the difference in peaks is low, and
+ // closer to linear when the difference between peaks is high.
+ float ratio = params->input_max / params->output_max - 1.0f;
+ ratio = fclampf(params->constants.slope_tuning * ratio,
+ params->constants.slope_offset,
+ 1.0f + params->constants.slope_offset);
+ slope = powf(slope, (1.0f - params->constants.spline_contrast) * ratio);
+
+ // Normalize everything relative to the pivot to make the math easier
+ const float in_min = params->input_min - src_pivot;
+ const float in_max = params->input_max - src_pivot;
+ const float out_min = params->output_min - dst_pivot;
+ const float out_max = params->output_max - dst_pivot;
+
+ // Solve P of order 2 for:
+ // P(in_min) = out_min
+ // P'(0.0) = slope
+ // P(0.0) = 0.0
+ const float Pa = (out_min - slope * in_min) / (in_min * in_min);
+ const float Pb = slope;
+
+ // Solve Q of order 3 for:
+ // Q(in_max) = out_max
+ // Q''(in_max) = 0.0
+ // Q(0.0) = 0.0
+ // Q'(0.0) = slope
+ const float t = 2 * in_max * in_max;
+ const float Qa = (slope * in_max - out_max) / (in_max * t);
+ const float Qb = -3 * (slope * in_max - out_max) / t;
+ const float Qc = slope;
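+ // Closed forms: P(0) = 0 and P'(0) = slope give Pb = slope, and
+ // P(in_min) = out_min then yields Pa. For Q, Q'(0) = slope gives Qc,
+ // Q''(in_max) = 0 gives Qb = -3*Qa*in_max, and substituting both into
+ // Q(in_max) = out_max yields Qa = (slope*in_max - out_max) / (2*in_max^3).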
+
+ FOREACH_LUT(lut, x) {
+ x -= src_pivot;
+ x = x > 0 ? ((Qa * x + Qb) * x + Qc) * x : (Pa * x + Pb) * x;
+ x += dst_pivot;
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_spline = {
+ .name = "spline",
+ .description = "Single-pivot polynomial spline",
+ .param_desc = "Contrast",
+ .param_min = 0.00f,
+ .param_def = 0.50f,
+ .param_max = 1.50f,
+ .scaling = PL_HDR_PQ,
+ .map = spline,
+ .map_inverse = spline,
+};
+
+static void reinhard(float *lut, const struct pl_tone_map_params *params)
+{
+ const float peak = rescale(params->input_max, params),
+ contrast = params->constants.reinhard_contrast,
+ offset = (1.0 - contrast) / contrast,
+ scale = (peak + offset) / peak;
+
+ FOREACH_LUT(lut, x) {
+ x = rescale(x, params);
+ x = x / (x + offset);
+ x *= scale;
+ x = rescale_out(x, params);
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_reinhard = {
+ .name = "reinhard",
+ .description = "Reinhard",
+ .param_desc = "Contrast",
+ .param_min = 0.001,
+ .param_def = 0.50,
+ .param_max = 0.99,
+ .map = reinhard,
+};
+
+static void mobius(float *lut, const struct pl_tone_map_params *params)
+{
+ const float peak = rescale(params->input_max, params),
+ j = params->constants.linear_knee;
+
+ // Solve for M(j) = j; M(peak) = 1.0; M'(j) = 1.0
+ // where M(x) = scale * (x+a)/(x+b)
+ const float a = -j*j * (peak - 1.0f) / (j*j - 2.0f * j + peak);
+ const float b = (j*j - 2.0f * j * peak + peak) /
+ fmaxf(1e-6f, peak - 1.0f);
+ const float scale = (b*b + 2.0f * b*j + j*j) / (b - a);
+
+ FOREACH_LUT(lut, x) {
+ x = rescale(x, params);
+ x = x <= j ? x : scale * (x + a) / (x + b);
+ x = rescale_out(x, params);
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_mobius = {
+ .name = "mobius",
+ .description = "Mobius",
+ .param_desc = "Knee point",
+ .param_min = 0.00,
+ .param_def = 0.30,
+ .param_max = 0.99,
+ .map = mobius,
+};
+
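+// Filmic curve using the coefficients from John Hable's "Uncharted 2"
+// operator: A/B/C = shoulder strength, linear strength, linear angle;
+// D/E/F = toe strength, toe numerator, toe denominator.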
+static inline float hable(float x)
+{
+ const float A = 0.15, B = 0.50, C = 0.10, D = 0.20, E = 0.02, F = 0.30;
+ return ((x * (A*x + C*B) + D*E) / (x * (A*x + B) + D*F)) - E/F;
+}
+
+static void hable_map(float *lut, const struct pl_tone_map_params *params)
+{
+ const float peak = params->input_max / params->output_max,
+ scale = 1.0f / hable(peak);
+
+ FOREACH_LUT(lut, x) {
+ x = bt1886_oetf(x, params->input_min, params->input_max);
+ x = bt1886_eotf(x, 0, peak);
+ x = scale * hable(x);
+ x = bt1886_oetf(x, 0, 1);
+ x = bt1886_eotf(x, params->output_min, params->output_max);
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_hable = {
+ .name = "hable",
+ .description = "Filmic tone-mapping (Hable)",
+ .map = hable_map,
+};
+
+static void gamma_map(float *lut, const struct pl_tone_map_params *params)
+{
+ const float peak = rescale(params->input_max, params),
+ cutoff = params->constants.linear_knee,
+ gamma = logf(cutoff) / logf(cutoff / peak);
+
+ FOREACH_LUT(lut, x) {
+ x = rescale(x, params);
+ x = x > cutoff ? powf(x / peak, gamma) : x;
+ x = rescale_out(x, params);
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_gamma = {
+ .name = "gamma",
+ .description = "Gamma function with knee",
+ .param_desc = "Knee point",
+ .param_min = 0.001,
+ .param_def = 0.30,
+ .param_max = 1.00,
+ .map = gamma_map,
+};
+
+static void linear(float *lut, const struct pl_tone_map_params *params)
+{
+ const float gain = params->constants.exposure;
+
+ FOREACH_LUT(lut, x) {
+ x = rescale_in(x, params);
+ x *= gain;
+ x = rescale_out(x, params);
+ }
+}
+
+const struct pl_tone_map_function pl_tone_map_linear = {
+ .name = "linear",
+ .description = "Perceptually linear stretch",
+ .param_desc = "Exposure",
+ .param_min = 0.001,
+ .param_def = 1.00,
+ .param_max = 10.0,
+ .scaling = PL_HDR_PQ,
+ .map = linear,
+ .map_inverse = linear,
+};
+
+const struct pl_tone_map_function pl_tone_map_linear_light = {
+ .name = "linearlight",
+ .description = "Linear light stretch",
+ .param_desc = "Exposure",
+ .param_min = 0.001,
+ .param_def = 1.00,
+ .param_max = 10.0,
+ .scaling = PL_HDR_NORM,
+ .map = linear,
+ .map_inverse = linear,
+};
+
+const struct pl_tone_map_function * const pl_tone_map_functions[] = {
+ &pl_tone_map_clip,
+ &pl_tone_map_st2094_40,
+ &pl_tone_map_st2094_10,
+ &pl_tone_map_bt2390,
+ &pl_tone_map_bt2446a,
+ &pl_tone_map_spline,
+ &pl_tone_map_reinhard,
+ &pl_tone_map_mobius,
+ &pl_tone_map_hable,
+ &pl_tone_map_gamma,
+ &pl_tone_map_linear,
+ &pl_tone_map_linear_light,
+ NULL
+};
+
+const int pl_num_tone_map_functions = PL_ARRAY_SIZE(pl_tone_map_functions) - 1;
+
+const struct pl_tone_map_function *pl_find_tone_map_function(const char *name)
+{
+ for (int i = 0; i < pl_num_tone_map_functions; i++) {
+ if (strcmp(name, pl_tone_map_functions[i]->name) == 0)
+ return pl_tone_map_functions[i];
+ }
+
+ return NULL;
+}
diff --git a/src/ucrt_math.def b/src/ucrt_math.def
new file mode 100644
index 0000000..f7d000d
--- /dev/null
+++ b/src/ucrt_math.def
@@ -0,0 +1,292 @@
+LIBRARY api-ms-win-crt-math-l1-1-0
+EXPORTS
+_Cbuild
+_Cmulcc
+_Cmulcr
+_FCbuild
+_FCmulcc
+_FCmulcr
+_LCbuild
+_LCmulcc
+_LCmulcr
+__setusermatherr
+_cabs
+_chgsign
+_chgsignf
+_copysign
+_copysignf
+_d_int
+_dclass
+_dexp
+_dlog
+_dnorm
+_dpcomp
+_dpoly
+_dscale
+_dsign
+_dsin
+_dtest
+_dunscale
+_except1
+_fd_int
+_fdclass
+_fdexp
+_fdlog
+_fdnorm
+_fdopen
+_fdpcomp
+_fdpoly
+_fdscale
+_fdsign
+_fdsin
+_fdtest
+_fdunscale
+_finite
+_finitef
+_fpclass
+_fpclassf
+_get_FMA3_enable
+_hypot
+_hypotf
+_isnan
+_isnanf
+_j0
+_j1
+_jn
+_ld_int
+_ldclass
+_ldexp
+_ldlog
+_ldpcomp
+_ldpoly
+_ldscale
+_ldsign
+_ldsin
+_ldtest
+_ldunscale
+_logb
+_logbf
+_nextafter
+_nextafterf
+_scalb
+_scalbf
+_set_FMA3_enable
+_y0
+_y1
+_yn
+acos
+acosf
+acosh
+acoshf
+acoshl
+asin
+asinf
+asinh
+asinhf
+asinhl
+atan
+atan2
+atan2f
+atanf
+atanh
+atanhf
+atanhl
+cabs
+cabsf
+cabsl
+cacos
+cacosf
+cacosh
+cacoshf
+cacoshl
+cacosl
+carg
+cargf
+cargl
+casin
+casinf
+casinh
+casinhf
+casinhl
+casinl
+catan
+catanf
+catanh
+catanhf
+catanhl
+catanl
+cbrt
+cbrtf
+cbrtl
+ccos
+ccosf
+ccosh
+ccoshf
+ccoshl
+ccosl
+ceil
+ceilf
+cexp
+cexpf
+cexpl
+cimag
+cimagf
+cimagl
+clog
+clog10
+clog10f
+clog10l
+clogf
+clogl
+conj
+conjf
+conjl
+copysign
+copysignf
+copysignl
+cos
+cosf
+cosh
+coshf
+cpow
+cpowf
+cpowl
+cproj
+cprojf
+cprojl
+creal
+crealf
+creall
+csin
+csinf
+csinh
+csinhf
+csinhl
+csinl
+csqrt
+csqrtf
+csqrtl
+ctan
+ctanf
+ctanh
+ctanhf
+ctanhl
+ctanl
+erf
+erfc
+erfcf
+erfcl
+erff
+erfl
+exp
+exp2
+exp2f
+exp2l
+expf
+expm1
+expm1f
+expm1l
+fabs
+fdim
+fdimf
+fdiml
+floor
+floorf
+fma
+fmaf
+fmal
+fmax
+fmaxf
+fmaxl
+fmin
+fminf
+fminl
+fmod
+fmodf
+frexp
+hypot
+ilogb
+ilogbf
+ilogbl
+ldexp
+lgamma
+lgammaf
+lgammal
+llrint
+llrintf
+llrintl
+llround
+llroundf
+llroundl
+log
+log10
+log10f
+log1p
+log1pf
+log1pl
+log2
+log2f
+log2l
+logb
+logbf
+logbl
+logf
+lrint
+lrintf
+lrintl
+lround
+lroundf
+lroundl
+modf
+modff
+nan
+nanf
+nanl
+nearbyint
+nearbyintf
+nearbyintl
+nextafter
+nextafterf
+nextafterl
+nexttoward
+nexttowardf
+nexttowardl
+norm
+normf
+norml
+pow
+powf
+remainder
+remainderf
+remainderl
+remquo
+remquof
+remquol
+rint
+rintf
+rintl
+round
+roundf
+roundl
+scalbln
+scalblnf
+scalblnl
+scalbn
+scalbnf
+scalbnl
+sin
+sinf
+sinh
+sinhf
+sqrt
+sqrtf
+tan
+tanf
+tanh
+tanhf
+tgamma
+tgammaf
+tgammal
+trunc
+truncf
+truncl
diff --git a/src/utils/dolbyvision.c b/src/utils/dolbyvision.c
new file mode 100644
index 0000000..3798532
--- /dev/null
+++ b/src/utils/dolbyvision.c
@@ -0,0 +1,63 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include <libplacebo/utils/dolbyvision.h>
+
+#ifdef PL_HAVE_LIBDOVI
+#include <libplacebo/tone_mapping.h>
+#include <libdovi/rpu_parser.h>
+#endif
+
+void pl_hdr_metadata_from_dovi_rpu(struct pl_hdr_metadata *out,
+ const uint8_t *buf, size_t size)
+{
+#ifdef PL_HAVE_LIBDOVI
+ if (buf && size) {
+ DoviRpuOpaque *rpu =
+ dovi_parse_unspec62_nalu(buf, size);
+ const DoviRpuDataHeader *header = dovi_rpu_get_header(rpu);
+
+ if (header && header->vdr_dm_metadata_present_flag) {
+ // Profile 4 reshaping isn't done, as it is a dual-layer format and
+ // there are still unknowns about its EOTF, so it cannot be enabled.
+ //
+ // For profile 7, the brightness metadata can still be used, since
+ // most titles have metadata that accurately reflects the image
+ // brightness. The exception is titles that require the enhancement
+ // layer to be processed to restore the intended brightness, which
+ // would then match the metadata values.
+ if (header->guessed_profile == 4) {
+ goto done;
+ }
+
+ const DoviVdrDmData *vdr_dm_data = dovi_rpu_get_vdr_dm_data(rpu);
+ if (vdr_dm_data->dm_data.level1) {
+ const DoviExtMetadataBlockLevel1 *l1 = vdr_dm_data->dm_data.level1;
+ out->max_pq_y = l1->max_pq / 4095.0f;
+ out->avg_pq_y = l1->avg_pq / 4095.0f;
+ }
+
+ dovi_rpu_free_vdr_dm_data(vdr_dm_data);
+ }
+
+ done:
+ dovi_rpu_free_header(header);
+ dovi_rpu_free(rpu);
+ }
+#endif
+}
diff --git a/src/utils/frame_queue.c b/src/utils/frame_queue.c
new file mode 100644
index 0000000..0155983
--- /dev/null
+++ b/src/utils/frame_queue.c
@@ -0,0 +1,1030 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <math.h>
+
+#include "common.h"
+#include "log.h"
+#include "pl_thread.h"
+
+#include <libplacebo/utils/frame_queue.h>
+
+struct cache_entry {
+ pl_tex tex[4];
+};
+
+struct entry {
+ pl_rc_t rc;
+ double pts;
+ struct cache_entry cache;
+ struct pl_source_frame src;
+ struct pl_frame frame;
+ uint64_t signature;
+ bool mapped;
+ bool ok;
+
+ // for interlaced frames
+ enum pl_field field;
+ struct entry *primary;
+ struct entry *prev, *next;
+ bool dirty;
+};
+
+// Hard limits for vsync timing validity
+#define MIN_FPS 10
+#define MAX_FPS 400
+
+// Limits for FPS estimation state
+#define MAX_SAMPLES 32
+#define MIN_SAMPLES 4
+
+// Stickiness to prevent `interpolation_threshold` oscillation
+#define THRESHOLD_MAX_RATIO 0.3
+#define THRESHOLD_FRAMES 5
+
+// Maximum number of not-yet-mapped frames to allow queueing in advance
+#define PREFETCH_FRAMES 2
+
+struct pool {
+ float samples[MAX_SAMPLES];
+ float estimate;
+ float sum;
+ int idx;
+ int num;
+ int total;
+};
+
+struct pl_queue_t {
+ pl_gpu gpu;
+ pl_log log;
+
+ // For multi-threading, we use two locks. The `lock_weak` guards the queue
+ // state itself. The `lock_strong` has a bigger scope and should be held
+ // for the duration of any functions that expect the queue state to
+ // remain more or less valid (with the exception of adding new members).
+ //
+ // In particular, `pl_queue_reset` and `pl_queue_update` will take
+ // the strong lock, while `pl_queue_push_*` will only take the weak
+ // lock.
+ pl_mutex lock_strong;
+ pl_mutex lock_weak;
+ pl_cond wakeup;
+
+ // Frame queue and state
+ PL_ARRAY(struct entry *) queue;
+ uint64_t signature;
+ int threshold_frames;
+ bool want_frame;
+ bool eof;
+
+ // Average vsync/frame fps estimation state
+ struct pool vps, fps;
+ float reported_vps;
+ float reported_fps;
+ double prev_pts;
+
+ // Storage for temporary arrays
+ PL_ARRAY(uint64_t) tmp_sig;
+ PL_ARRAY(float) tmp_ts;
+ PL_ARRAY(const struct pl_frame *) tmp_frame;
+
+ // Queue of GPU objects to reuse
+ PL_ARRAY(struct cache_entry) cache;
+};
+
+pl_queue pl_queue_create(pl_gpu gpu)
+{
+ pl_queue p = pl_alloc_ptr(NULL, p);
+ *p = (struct pl_queue_t) {
+ .gpu = gpu,
+ .log = gpu->log,
+ };
+
+ pl_mutex_init(&p->lock_strong);
+ pl_mutex_init(&p->lock_weak);
+ int ret = pl_cond_init(&p->wakeup);
+ if (ret) {
+ PL_ERR(p, "Failed to init conditional variable: %d", ret);
+ return NULL;
+ }
+ return p;
+}
+
+static void recycle_cache(pl_queue p, struct cache_entry *cache, bool recycle)
+{
+ bool has_textures = false;
+ for (int i = 0; i < PL_ARRAY_SIZE(cache->tex); i++) {
+ if (!cache->tex[i])
+ continue;
+
+ has_textures = true;
+ if (recycle) {
+ pl_tex_invalidate(p->gpu, cache->tex[i]);
+ } else {
+ pl_tex_destroy(p->gpu, &cache->tex[i]);
+ }
+ }
+
+ if (recycle && has_textures)
+ PL_ARRAY_APPEND(p, p->cache, *cache);
+
+ memset(cache, 0, sizeof(*cache)); // sanity
+}
+
+static void entry_deref(pl_queue p, struct entry **pentry, bool recycle)
+{
+ struct entry *entry = *pentry;
+ *pentry = NULL;
+ if (!entry || !pl_rc_deref(&entry->rc))
+ return;
+
+ if (!entry->mapped && entry->src.discard) {
+ PL_TRACE(p, "Discarding unused frame id %"PRIu64" with PTS %f",
+ entry->signature, entry->src.pts);
+ entry->src.discard(&entry->src);
+ }
+
+ if (entry->mapped && entry->ok && entry->src.unmap) {
+ PL_TRACE(p, "Unmapping frame id %"PRIu64" with PTS %f",
+ entry->signature, entry->src.pts);
+ entry->src.unmap(p->gpu, &entry->frame, &entry->src);
+ }
+
+ recycle_cache(p, &entry->cache, recycle);
+ pl_free(entry);
+}
+
+static struct entry *entry_ref(struct entry *entry)
+{
+ pl_rc_ref(&entry->rc);
+ return entry;
+}
+
+static void entry_cull(pl_queue p, struct entry *entry, bool recycle)
+{
+ // Forcibly clean up references to prev/next frames, even if `entry` has
+ // remaining refs pointing at it. This is to prevent cyclic references.
+ entry_deref(p, &entry->primary, recycle);
+ entry_deref(p, &entry->prev, recycle);
+ entry_deref(p, &entry->next, recycle);
+ entry_deref(p, &entry, recycle);
+}
+
+void pl_queue_destroy(pl_queue *queue)
+{
+ pl_queue p = *queue;
+ if (!p)
+ return;
+
+ for (int n = 0; n < p->queue.num; n++)
+ entry_cull(p, p->queue.elem[n], false);
+ for (int n = 0; n < p->cache.num; n++) {
+ for (int i = 0; i < PL_ARRAY_SIZE(p->cache.elem[n].tex); i++)
+ pl_tex_destroy(p->gpu, &p->cache.elem[n].tex[i]);
+ }
+
+ pl_cond_destroy(&p->wakeup);
+ pl_mutex_destroy(&p->lock_weak);
+ pl_mutex_destroy(&p->lock_strong);
+ pl_free(p);
+ *queue = NULL;
+}
+
+void pl_queue_reset(pl_queue p)
+{
+ pl_mutex_lock(&p->lock_strong);
+ pl_mutex_lock(&p->lock_weak);
+
+ for (int i = 0; i < p->queue.num; i++)
+ entry_cull(p, p->queue.elem[i], false);
+
+ *p = (struct pl_queue_t) {
+ .gpu = p->gpu,
+ .log = p->log,
+
+ // Reuse lock objects
+ .lock_strong = p->lock_strong,
+ .lock_weak = p->lock_weak,
+ .wakeup = p->wakeup,
+
+ // Explicitly preserve allocations
+ .queue.elem = p->queue.elem,
+ .tmp_sig.elem = p->tmp_sig.elem,
+ .tmp_ts.elem = p->tmp_ts.elem,
+ .tmp_frame.elem = p->tmp_frame.elem,
+
+ // Reuse GPU object cache entirely
+ .cache = p->cache,
+ };
+
+ pl_cond_signal(&p->wakeup);
+ pl_mutex_unlock(&p->lock_weak);
+ pl_mutex_unlock(&p->lock_strong);
+}
+
+static inline float delta(float old, float new)
+{
+ return fabsf((new - old) / PL_MIN(new, old));
+}
+
+static inline void default_estimate(struct pool *pool, float val)
+{
+ if (!pool->estimate && isnormal(val) && val > 0.0)
+ pool->estimate = val;
+}
+
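+// Feed a new sample into the sliding-window average. A sample deviating from
+// the current mean by more than 30% resets the window (e.g. after a rate
+// change); after a reset the estimate is held until MIN_SAMPLES accumulate.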
+static inline void update_estimate(struct pool *pool, float cur)
+{
+ if (pool->num) {
+ static const float max_delta = 0.3;
+ if (delta(pool->sum / pool->num, cur) > max_delta) {
+ pool->sum = 0.0;
+ pool->num = pool->idx = 0;
+ }
+ }
+
+ if (pool->num++ == MAX_SAMPLES) {
+ pool->sum -= pool->samples[pool->idx];
+ pool->num--;
+ }
+
+ pool->sum += pool->samples[pool->idx] = cur;
+ pool->idx = (pool->idx + 1) % MAX_SAMPLES;
+ pool->total++;
+
+ if (pool->total < MIN_SAMPLES || pool->num >= MIN_SAMPLES)
+ pool->estimate = pool->sum / pool->num;
+}
+
+static void queue_push(pl_queue p, const struct pl_source_frame *src)
+{
+ if (p->eof && !src)
+ return; // ignore duplicate EOF
+
+ if (p->eof && src) {
+ PL_INFO(p, "Received frame after EOF signaled... discarding frame!");
+ if (src->discard)
+ src->discard(src);
+ return;
+ }
+
+ pl_cond_signal(&p->wakeup);
+
+ if (!src) {
+ PL_TRACE(p, "Received EOF, draining frame queue...");
+ p->eof = true;
+ p->want_frame = false;
+ return;
+ }
+
+ // Update FPS estimates if possible/reasonable
+ default_estimate(&p->fps, src->first_field ? src->duration / 2 : src->duration);
+ if (p->queue.num) {
+ double last_pts = p->queue.elem[p->queue.num - 1]->pts;
+ float delta = src->pts - last_pts;
+ if (delta <= 0.0f) {
+ PL_DEBUG(p, "Non monotonically increasing PTS %f -> %f", last_pts, src->pts);
+ } else if (p->fps.estimate && delta > 10.0 * p->fps.estimate) {
+ PL_DEBUG(p, "Discontinuous source PTS jump %f -> %f", last_pts, src->pts);
+ } else {
+ update_estimate(&p->fps, delta);
+ }
+ } else if (src->pts != 0) {
+ PL_DEBUG(p, "First frame received with non-zero PTS %f", src->pts);
+ }
+
+ struct entry *entry = pl_alloc_ptr(NULL, entry);
+ *entry = (struct entry) {
+ .signature = p->signature++,
+ .pts = src->pts,
+ .src = *src,
+ };
+ pl_rc_init(&entry->rc);
+ PL_ARRAY_POP(p->cache, &entry->cache);
+ PL_TRACE(p, "Added new frame id %"PRIu64" with PTS %f",
+ entry->signature, entry->pts);
+
+ // Insert new entry into the correct spot in the queue, sorted by PTS
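+ // For interlaced sources, each pushed frame becomes two entries: the
+ // primary (first field) and a synthesized second-field entry sorted
+ // between this frame and the next. Both fields are linked to their
+ // temporal neighbours via `prev`/`next` refs so deinterlacing can access
+ // adjacent fields, and affected neighbours get fresh signatures.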
+ for (int i = p->queue.num;; i--) {
+ if (i == 0 || p->queue.elem[i - 1]->pts <= entry->pts) {
+ if (src->first_field == PL_FIELD_NONE) {
+ // Progressive
+ PL_ARRAY_INSERT_AT(p, p->queue, i, entry);
+ break;
+ } else {
+ // Interlaced
+ struct entry *prev = i > 0 ? p->queue.elem[i - 1] : NULL;
+ struct entry *next = i < p->queue.num ? p->queue.elem[i] : NULL;
+ struct entry *entry2 = pl_zalloc_ptr(NULL, entry2);
+ pl_rc_init(&entry2->rc);
+ if (next) {
+ entry2->pts = (entry->pts + next->pts) / 2;
+ } else if (src->duration) {
+ entry2->pts = entry->pts + src->duration / 2;
+ } else if (p->fps.estimate) {
+ entry2->pts = entry->pts + p->fps.estimate;
+ } else {
+ PL_ERR(p, "Frame with PTS %f specified as interlaced, but "
+ "no FPS information known yet! Please specify a "
+ "valid `pl_source_frame.duration`. Treating as "
+ "progressive...", src->pts);
+ PL_ARRAY_INSERT_AT(p, p->queue, i, entry);
+ pl_free(entry2);
+ break;
+ }
+
+ entry->field = src->first_field;
+ entry2->primary = entry_ref(entry);
+ entry2->field = pl_field_other(entry->field);
+ entry2->signature = p->signature++;
+
+ PL_TRACE(p, "Added second field id %"PRIu64" with PTS %f",
+ entry2->signature, entry2->pts);
+
+ // Link previous/next frames
+ if (prev) {
+ entry->prev = entry_ref(PL_DEF(prev->primary, prev));
+ entry2->prev = entry_ref(PL_DEF(prev->primary, prev));
+ // Retroactively re-link the previous frames that should
+ // be referencing this frame
+ for (int j = i - 1; j >= 0; --j) {
+ struct entry *e = p->queue.elem[j];
+ if (e != prev && e != prev->primary)
+ break;
+ entry_deref(p, &e->next, true);
+ e->next = entry_ref(entry);
+ if (e->dirty) { // reset signature to signal change
+ e->signature = p->signature++;
+ e->dirty = false;
+ }
+ }
+ }
+
+ if (next) {
+ entry->next = entry_ref(PL_DEF(next->primary, next));
+ entry2->next = entry_ref(PL_DEF(next->primary, next));
+ for (int j = i; j < p->queue.num; j++) {
+ struct entry *e = p->queue.elem[j];
+ if (e != next && e != next->primary)
+ break;
+ entry_deref(p, &e->prev, true);
+ e->prev = entry_ref(entry);
+ if (e->dirty) {
+ e->signature = p->signature++;
+ e->dirty = false;
+ }
+ }
+ }
+
+ PL_ARRAY_INSERT_AT(p, p->queue, i, entry);
+ PL_ARRAY_INSERT_AT(p, p->queue, i+1, entry2);
+ break;
+ }
+ }
+ }
+
+ p->want_frame = false;
+}
+
+void pl_queue_push(pl_queue p, const struct pl_source_frame *frame)
+{
+ pl_mutex_lock(&p->lock_weak);
+ queue_push(p, frame);
+ pl_mutex_unlock(&p->lock_weak);
+}
+
+static inline bool entry_mapped(struct entry *entry)
+{
+ return entry->mapped || (entry->primary && entry->primary->mapped);
+}
+
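+// Returns false only if the queue tail already holds enough not-yet-mapped
+// frames to satisfy the prefetch target (PREFETCH_FRAMES, plus extra frames
+// when more than one source frame is expected per display vsync).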
+static bool queue_has_room(pl_queue p)
+{
+ if (p->want_frame)
+ return true;
+
+ int wanted_frames = PREFETCH_FRAMES;
+ if (p->fps.estimate && p->vps.estimate && p->vps.estimate <= 1.0f / MIN_FPS)
+ wanted_frames += ceilf(p->vps.estimate / p->fps.estimate) - 1;
+
+ // Examine the queue tail
+ for (int i = p->queue.num - 1; i >= 0; i--) {
+ if (entry_mapped(p->queue.elem[i]))
+ return true;
+ if (p->queue.num - i >= wanted_frames)
+ return false;
+ }
+
+ return true;
+}
+
+bool pl_queue_push_block(pl_queue p, uint64_t timeout,
+ const struct pl_source_frame *frame)
+{
+ pl_mutex_lock(&p->lock_weak);
+ if (!timeout || !frame || p->eof)
+ goto skip_blocking;
+
+ while (!queue_has_room(p) && !p->eof) {
+ if (pl_cond_timedwait(&p->wakeup, &p->lock_weak, timeout) == ETIMEDOUT) {
+ pl_mutex_unlock(&p->lock_weak);
+ return false;
+ }
+ }
+
+skip_blocking:
+
+ queue_push(p, frame);
+ pl_mutex_unlock(&p->lock_weak);
+ return true;
+}
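+
+// Minimal producer-side sketch (illustrative only) of the blocking push API
+// above, assuming a hypothetical decode_next() helper that fills in a
+// pl_source_frame, and a timeout given in nanoseconds:
+//
+//     struct pl_source_frame src;
+//     while (decode_next(&src)) {
+//         // Wait up to 100 ms for the queue to want more frames
+//         while (!pl_queue_push_block(queue, 100000000, &src))
+//             ; // timed out; retry (or check for shutdown here)
+//     }
+//     pl_queue_push(queue, NULL); // signal EOF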
+
+static void report_estimates(pl_queue p)
+{
+ if (p->fps.total >= MIN_SAMPLES && p->vps.total >= MIN_SAMPLES) {
+ if (p->reported_fps && p->reported_vps) {
+ // Only re-report the estimates if they've changed considerably
+ // from the previously reported values
+ static const float report_delta = 0.3f;
+ float delta_fps = delta(p->reported_fps, p->fps.estimate);
+ float delta_vps = delta(p->reported_vps, p->vps.estimate);
+ if (delta_fps < report_delta && delta_vps < report_delta)
+ return;
+ }
+
+ PL_INFO(p, "Estimated source FPS: %.3f, display FPS: %.3f",
+ 1.0 / p->fps.estimate, 1.0 / p->vps.estimate);
+
+ p->reported_fps = p->fps.estimate;
+ p->reported_vps = p->vps.estimate;
+ }
+}
+
+// note: may add more than one frame, since it releases the lock
+static enum pl_queue_status get_frame(pl_queue p, const struct pl_queue_params *params)
+{
+ if (p->eof)
+ return PL_QUEUE_EOF;
+
+ if (!params->get_frame) {
+ if (!params->timeout)
+ return PL_QUEUE_MORE;
+
+ p->want_frame = true;
+ pl_cond_signal(&p->wakeup);
+
+ while (p->want_frame) {
+ if (pl_cond_timedwait(&p->wakeup, &p->lock_weak, params->timeout) == ETIMEDOUT)
+ return PL_QUEUE_MORE;
+ }
+
+ return p->eof ? PL_QUEUE_EOF : PL_QUEUE_OK;
+ }
+
+ // Don't hold the weak mutex while calling into `get_frame`, to allow
+ // `pl_queue_push` to run concurrently while we're waiting for frames
+ pl_mutex_unlock(&p->lock_weak);
+
+ struct pl_source_frame src;
+ enum pl_queue_status ret;
+ switch ((ret = params->get_frame(&src, params))) {
+ case PL_QUEUE_OK:
+ pl_queue_push(p, &src);
+ break;
+ case PL_QUEUE_EOF:
+ pl_queue_push(p, NULL);
+ break;
+ case PL_QUEUE_MORE:
+ case PL_QUEUE_ERR:
+ break;
+ }
+
+ pl_mutex_lock(&p->lock_weak);
+ return ret;
+}
+
+static inline bool map_frame(pl_queue p, struct entry *entry)
+{
+ if (!entry->mapped) {
+ PL_TRACE(p, "Mapping frame id %"PRIu64" with PTS %f",
+ entry->signature, entry->pts);
+ entry->mapped = true;
+ entry->ok = entry->src.map(p->gpu, entry->cache.tex,
+ &entry->src, &entry->frame);
+ if (!entry->ok)
+ PL_ERR(p, "Failed mapping frame id %"PRIu64" with PTS %f",
+ entry->signature, entry->pts);
+ }
+
+ return entry->ok;
+}
+
+static bool map_entry(pl_queue p, struct entry *entry)
+{
+ bool ok = map_frame(p, entry->primary ? entry->primary : entry);
+ if (entry->prev)
+ ok &= map_frame(p, entry->prev);
+ if (entry->next)
+ ok &= map_frame(p, entry->next);
+ if (!ok)
+ return false;
+
+ if (entry->primary)
+ entry->frame = entry->primary->frame;
+
+ if (entry->field) {
+ entry->frame.field = entry->field;
+ entry->frame.first_field = PL_DEF(entry->primary, entry)->src.first_field;
+ entry->frame.prev = entry->prev ? &entry->prev->frame : NULL;
+ entry->frame.next = entry->next ? &entry->next->frame : NULL;
+ entry->dirty = true;
+ }
+
+ return true;
+}
+
+static bool entry_complete(struct entry *entry)
+{
+ return entry->field ? !!entry->next : true;
+}
+
+// Advance the queue as needed to make sure idx 0 is the last frame at or
+// before `pts`, and idx 1 is the first frame after `pts` (unless idx 0 is
+// already the last frame in the stream).
+//
+// Returns PL_QUEUE_OK only if idx 0 is still legal under ZOH semantics.
+static enum pl_queue_status advance(pl_queue p, double pts,
+ const struct pl_queue_params *params)
+{
+ // Cull all frames except the last frame before `pts`
+ int culled = 0;
+ for (int i = 1; i < p->queue.num; i++) {
+ if (p->queue.elem[i]->pts <= pts) {
+ entry_cull(p, p->queue.elem[i - 1], true);
+ culled++;
+ }
+ }
+ PL_ARRAY_REMOVE_RANGE(p->queue, 0, culled);
+
+ // Keep adding new frames until we find one in the future, or EOF
+ enum pl_queue_status ret = PL_QUEUE_OK;
+ while (p->queue.num < 2) {
+ switch ((ret = get_frame(p, params))) {
+ case PL_QUEUE_ERR:
+ return ret;
+ case PL_QUEUE_EOF:
+ if (!p->queue.num)
+ return ret;
+ goto done;
+ case PL_QUEUE_MORE:
+ case PL_QUEUE_OK:
+ while (p->queue.num > 1 && p->queue.elem[1]->pts <= pts) {
+ entry_cull(p, p->queue.elem[0], true);
+ PL_ARRAY_REMOVE_AT(p->queue, 0);
+ }
+ if (ret == PL_QUEUE_MORE)
+ return ret;
+ continue;
+ }
+ }
+
+ if (!entry_complete(p->queue.elem[1])) {
+ switch (get_frame(p, params)) {
+ case PL_QUEUE_ERR:
+ return PL_QUEUE_ERR;
+ case PL_QUEUE_MORE:
+ ret = PL_QUEUE_MORE;
+ // fall through
+ case PL_QUEUE_EOF:
+ case PL_QUEUE_OK:
+ goto done;
+ }
+ }
+
+done:
+ if (p->eof && p->queue.num == 1) {
+ if (p->queue.elem[0]->pts == 0.0 || !p->fps.estimate) {
+ // If the last frame has PTS 0.0, or we have no FPS estimate, then
+ // this is probably a single-frame file, in which case we want to
+            // extend the ZOH to infinity rather than returning EOF. Not a
+            // perfect heuristic, but it works well enough in practice.
+ return PL_QUEUE_OK;
+ }
+
+ // Last frame is held for an extra `p->fps.estimate` duration,
+ // afterwards this function just returns EOF.
+ if (pts < p->queue.elem[0]->pts + p->fps.estimate) {
+ ret = PL_QUEUE_OK;
+ } else {
+ entry_cull(p, p->queue.elem[0], true);
+ p->queue.num = 0;
+ return PL_QUEUE_EOF;
+ }
+ }
+
+ pl_assert(p->queue.num);
+ return ret;
+}
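+
+// Worked example (illustrative) of the invariant maintained by advance():
+// with a queue holding frames at PTS {0.20, 0.24, 0.28} and a target
+// pts = 0.25, the frame at 0.20 is culled, leaving idx 0 = 0.24 (the last
+// frame at or before 0.25, kept as the ZOH fallback) and idx 1 = 0.28 (the
+// first frame after 0.25).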
+
+static inline enum pl_queue_status point(pl_queue p, struct pl_frame_mix *mix,
+ const struct pl_queue_params *params)
+{
+ if (!p->queue.num) {
+ *mix = (struct pl_frame_mix) {0};
+ return PL_QUEUE_MORE;
+ }
+
+ // Find closest frame (nearest neighbour semantics)
+ struct entry *entry = p->queue.elem[0];
+ if (entry->pts > params->pts) { // first frame not visible yet
+ *mix = (struct pl_frame_mix) {0};
+ return PL_QUEUE_OK;
+ }
+
+ double best = fabs(entry->pts - params->pts);
+ for (int i = 1; i < p->queue.num; i++) {
+ double dist = fabs(p->queue.elem[i]->pts - params->pts);
+ if (dist < best) {
+ entry = p->queue.elem[i];
+ best = dist;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ if (!map_entry(p, entry))
+ return PL_QUEUE_ERR;
+
+ // Return a mix containing only this single frame
+ p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0;
+ PL_ARRAY_APPEND(p, p->tmp_sig, entry->signature);
+ PL_ARRAY_APPEND(p, p->tmp_frame, &entry->frame);
+ PL_ARRAY_APPEND(p, p->tmp_ts, 0.0);
+ *mix = (struct pl_frame_mix) {
+ .num_frames = 1,
+ .frames = p->tmp_frame.elem,
+ .signatures = p->tmp_sig.elem,
+ .timestamps = p->tmp_ts.elem,
+ .vsync_duration = 1.0,
+ };
+
+ PL_TRACE(p, "Showing single frame id %"PRIu64" with PTS %f for target PTS %f",
+ entry->signature, entry->pts, params->pts);
+
+ report_estimates(p);
+ return PL_QUEUE_OK;
+}
+
+// Present a single frame as appropriate for `pts`
+static enum pl_queue_status nearest(pl_queue p, struct pl_frame_mix *mix,
+ const struct pl_queue_params *params)
+{
+ enum pl_queue_status ret;
+ switch ((ret = advance(p, params->pts, params))) {
+ case PL_QUEUE_ERR:
+ case PL_QUEUE_EOF:
+ return ret;
+ case PL_QUEUE_OK:
+ case PL_QUEUE_MORE:
+ if (mix && point(p, mix, params) == PL_QUEUE_ERR)
+ return PL_QUEUE_ERR;
+ return ret;
+ }
+
+ pl_unreachable();
+}
+
+// Special case of `interpolate` for radius = 0, in which case we need exactly
+// the previous frame and the following frame
+static enum pl_queue_status oversample(pl_queue p, struct pl_frame_mix *mix,
+ const struct pl_queue_params *params)
+{
+ enum pl_queue_status ret;
+ switch ((ret = advance(p, params->pts, params))) {
+ case PL_QUEUE_ERR:
+ case PL_QUEUE_EOF:
+ return ret;
+ case PL_QUEUE_OK:
+ break;
+ case PL_QUEUE_MORE:
+ if (!p->queue.num) {
+ if (mix)
+ *mix = (struct pl_frame_mix) {0};
+ return ret;
+ }
+ break;
+ }
+
+ if (!mix)
+ return PL_QUEUE_OK;
+
+ // Can't oversample with only a single frame, fall back to point sampling
+ if (p->queue.num < 2 || p->queue.elem[0]->pts > params->pts) {
+ if (point(p, mix, params) != PL_QUEUE_OK)
+ return PL_QUEUE_ERR;
+ return ret;
+ }
+
+ struct entry *entries[2] = { p->queue.elem[0], p->queue.elem[1] };
+ pl_assert(entries[0]->pts <= params->pts);
+ pl_assert(entries[1]->pts >= params->pts);
+
+    // Return a mix containing these two frames
+ p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0;
+ for (int i = 0; i < 2; i++) {
+ if (!map_entry(p, entries[i]))
+ return PL_QUEUE_ERR;
+ float ts = (entries[i]->pts - params->pts) / p->fps.estimate;
+ PL_ARRAY_APPEND(p, p->tmp_sig, entries[i]->signature);
+ PL_ARRAY_APPEND(p, p->tmp_frame, &entries[i]->frame);
+ PL_ARRAY_APPEND(p, p->tmp_ts, ts);
+ }
+
+ *mix = (struct pl_frame_mix) {
+ .num_frames = 2,
+ .frames = p->tmp_frame.elem,
+ .signatures = p->tmp_sig.elem,
+ .timestamps = p->tmp_ts.elem,
+ .vsync_duration = p->vps.estimate / p->fps.estimate,
+ };
+
+ PL_TRACE(p, "Oversampling 2 frames for target PTS %f:", params->pts);
+ for (int i = 0; i < mix->num_frames; i++)
+ PL_TRACE(p, " id %"PRIu64" ts %f", mix->signatures[i], mix->timestamps[i]);
+
+ report_estimates(p);
+ return ret;
+}
+
+// Present a mixture of frames, relative to the vsync ratio
+static enum pl_queue_status interpolate(pl_queue p, struct pl_frame_mix *mix,
+ const struct pl_queue_params *params)
+{
+ // No FPS estimate available, possibly source contains only a single frame,
+ // or this is the first frame to be rendered. Fall back to point sampling.
+ if (!p->fps.estimate)
+ return nearest(p, mix, params);
+
+ // Silently disable interpolation if the ratio dips lower than the
+ // configured threshold
+ float ratio = fabs(p->fps.estimate / p->vps.estimate - 1.0);
+ if (ratio < params->interpolation_threshold) {
+ if (!p->threshold_frames) {
+ PL_INFO(p, "Detected fps ratio %.4f below threshold %.4f, "
+ "disabling interpolation",
+ ratio, params->interpolation_threshold);
+ }
+
+ p->threshold_frames = THRESHOLD_FRAMES + 1;
+ return nearest(p, mix, params);
+ } else if (ratio < THRESHOLD_MAX_RATIO && p->threshold_frames > 1) {
+ p->threshold_frames--;
+ return nearest(p, mix, params);
+ } else {
+ if (p->threshold_frames) {
+ PL_INFO(p, "Detected fps ratio %.4f exceeds threshold %.4f, "
+ "re-enabling interpolation",
+ ratio, params->interpolation_threshold);
+ }
+ p->threshold_frames = 0;
+ }
+
+    // A radius of zero is a special case in which we only need the previous
+    // and the next frame, handled by the dedicated oversampling path.
+ if (!params->radius)
+ return oversample(p, mix, params);
+
+ pl_assert(p->fps.estimate && p->vps.estimate);
+ float radius = params->radius * fmaxf(1.0f, p->vps.estimate / p->fps.estimate);
+ double min_pts = params->pts - radius * p->fps.estimate,
+ max_pts = params->pts + radius * p->fps.estimate;
+
+ enum pl_queue_status ret;
+ switch ((ret = advance(p, min_pts, params))) {
+ case PL_QUEUE_ERR:
+ case PL_QUEUE_EOF:
+ return ret;
+ case PL_QUEUE_MORE:
+ goto done;
+ case PL_QUEUE_OK:
+ break;
+ }
+
+ // Keep adding new frames until we've covered the range we care about
+ pl_assert(p->queue.num);
+ while (p->queue.elem[p->queue.num - 1]->pts < max_pts) {
+ switch ((ret = get_frame(p, params))) {
+ case PL_QUEUE_ERR:
+ return ret;
+ case PL_QUEUE_MORE:
+ goto done;
+ case PL_QUEUE_EOF:;
+ // Don't forward EOF until we've held the last frame for the
+ // desired ZOH hold duration
+ double last_pts = p->queue.elem[p->queue.num - 1]->pts;
+ if (last_pts && params->pts >= last_pts + p->fps.estimate)
+ return ret;
+ ret = PL_QUEUE_OK;
+ goto done;
+ case PL_QUEUE_OK:
+ continue;
+ }
+ }
+
+ if (!entry_complete(p->queue.elem[p->queue.num - 1])) {
+ switch ((ret = get_frame(p, params))) {
+ case PL_QUEUE_MORE:
+ case PL_QUEUE_OK:
+ break;
+ case PL_QUEUE_ERR:
+ case PL_QUEUE_EOF:
+ return ret;
+ }
+ }
+
+done: ;
+
+ if (!mix)
+ return PL_QUEUE_OK;
+
+ // Construct a mix object representing the current queue state, starting at
+ // the last frame before `min_pts` to make sure there's a fallback frame
+ // available for ZOH semantics.
+ p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0;
+ for (int i = 0; i < p->queue.num; i++) {
+ struct entry *entry = p->queue.elem[i];
+ if (entry->pts > max_pts)
+ break;
+ if (!map_entry(p, entry))
+ return PL_QUEUE_ERR;
+ float ts = (entry->pts - params->pts) / p->fps.estimate;
+ PL_ARRAY_APPEND(p, p->tmp_sig, entry->signature);
+ PL_ARRAY_APPEND(p, p->tmp_frame, &entry->frame);
+ PL_ARRAY_APPEND(p, p->tmp_ts, ts);
+ }
+
+ *mix = (struct pl_frame_mix) {
+ .num_frames = p->tmp_frame.num,
+ .frames = p->tmp_frame.elem,
+ .signatures = p->tmp_sig.elem,
+ .timestamps = p->tmp_ts.elem,
+ .vsync_duration = p->vps.estimate / p->fps.estimate,
+ };
+
+ PL_TRACE(p, "Showing mix of %d frames for target PTS %f:",
+ mix->num_frames, params->pts);
+ for (int i = 0; i < mix->num_frames; i++)
+ PL_TRACE(p, " id %"PRIu64" ts %f", mix->signatures[i], mix->timestamps[i]);
+
+ report_estimates(p);
+ return ret;
+}
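+
+// Worked example (illustrative) of the mixing window above: with
+// params->radius = 2, a source frame duration fps.estimate = 1/24 s and a
+// vsync duration vps.estimate = 1/60 s, the duration ratio vps/fps = 0.4, so
+// the radius stays at 2 frames and the mix covers pts +/- 2 * (1/24 s), i.e.
+// roughly 83 ms in either direction. Conversely, near-identical rates (e.g.
+// 59.94 fps content on a 60 Hz display, ratio ~ 0.001) fall below a typical
+// interpolation threshold and are point-sampled via nearest() instead.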
+
+static bool prefill(pl_queue p, const struct pl_queue_params *params)
+{
+ int min_frames = 2 * ceilf(params->radius);
+ if (p->fps.estimate && p->vps.estimate && p->vps.estimate <= 1.0f / MIN_FPS)
+ min_frames *= ceilf(p->vps.estimate / p->fps.estimate);
+ min_frames = PL_MAX(min_frames, PREFETCH_FRAMES);
+
+ while (p->queue.num < min_frames) {
+ switch (get_frame(p, params)) {
+ case PL_QUEUE_ERR:
+ return false;
+ case PL_QUEUE_EOF:
+ case PL_QUEUE_MORE:
+ return true;
+ case PL_QUEUE_OK:
+ continue;
+ }
+ }
+
+ // In the most likely case, the first few frames will all be required. So
+ // force-map them all to initialize GPU state on initial rendering. This is
+ // better than the alternative of missing the cache later, when timing is
+ // more relevant.
+ for (int i = 0; i < min_frames; i++) {
+ if (!map_entry(p, p->queue.elem[i]))
+ return false;
+ }
+
+ return true;
+}
+
+enum pl_queue_status pl_queue_update(pl_queue p, struct pl_frame_mix *out_mix,
+ const struct pl_queue_params *params)
+{
+ pl_mutex_lock(&p->lock_strong);
+ pl_mutex_lock(&p->lock_weak);
+ default_estimate(&p->vps, params->vsync_duration);
+
+ float delta = params->pts - p->prev_pts;
+ if (delta < 0.0f) {
+
+ // This is a backwards PTS jump. This is something we can handle
+ // semi-gracefully, but only if we haven't culled past the current
+ // frame yet.
+ if (p->queue.num && p->queue.elem[0]->pts > params->pts) {
+ PL_ERR(p, "Requested PTS %f is lower than the oldest frame "
+ "PTS %f. This is not supported, PTS must be monotonically "
+ "increasing! Please use `pl_queue_reset` to reset the frame "
+ "queue on discontinuous PTS jumps.",
+ params->pts, p->queue.elem[0]->pts);
+ pl_mutex_unlock(&p->lock_weak);
+ pl_mutex_unlock(&p->lock_strong);
+ return PL_QUEUE_ERR;
+ }
+
+ } else if (delta > 1.0f) {
+
+        // A jump of more than a second is probably the result of a
+        // discontinuous jump after a suspend. To prevent this from exploding
+        // the FPS estimate, treat it like a first frame and ignore the delta.
+ PL_TRACE(p, "Discontinuous target PTS jump %f -> %f, ignoring...",
+ p->prev_pts, params->pts);
+
+ } else if (delta > 0) {
+
+ update_estimate(&p->vps, params->pts - p->prev_pts);
+
+ }
+
+ p->prev_pts = params->pts;
+
+ // As a special case, prefill the queue if this is the first frame
+ if (!params->pts && !p->queue.num) {
+ if (!prefill(p, params)) {
+ pl_mutex_unlock(&p->lock_weak);
+ pl_mutex_unlock(&p->lock_strong);
+ return PL_QUEUE_ERR;
+ }
+ }
+
+ // Ignore unrealistically high or low FPS, common near start of playback
+ static const float max_vsync = 1.0 / MIN_FPS;
+ static const float min_vsync = 1.0 / MAX_FPS;
+ bool estimation_ok = p->vps.estimate > min_vsync && p->vps.estimate < max_vsync;
+ enum pl_queue_status ret;
+
+ if (estimation_ok || params->vsync_duration > 0) {
+ // We know the vsync duration, so construct an interpolation mix
+ ret = interpolate(p, out_mix, params);
+ } else {
+ // We don't know the vsync duration (yet), so just point-sample
+ ret = nearest(p, out_mix, params);
+ }
+
+ pl_cond_signal(&p->wakeup);
+ pl_mutex_unlock(&p->lock_weak);
+ pl_mutex_unlock(&p->lock_strong);
+ return ret;
+}
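+
+// Minimal per-vsync usage sketch (illustrative; render() and the vsync
+// timing values are hypothetical, field names are taken from the params
+// referenced above):
+//
+//     struct pl_frame_mix mix;
+//     enum pl_queue_status st;
+//     st = pl_queue_update(queue, &mix, &(struct pl_queue_params) {
+//         .pts            = target_pts,      // PTS of the upcoming vsync
+//         .vsync_duration = vsync_duration,  // may be 0.0 to auto-estimate
+//         .radius         = 2,
+//         .get_frame      = my_get_frame,    // optional pull callback
+//         .timeout        = 0,               // don't block on new frames
+//     });
+//     if (st == PL_QUEUE_OK || st == PL_QUEUE_MORE)
+//         render(&mix);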
+
+float pl_queue_estimate_fps(pl_queue p)
+{
+ pl_mutex_lock(&p->lock_weak);
+ float estimate = p->fps.estimate;
+ pl_mutex_unlock(&p->lock_weak);
+ return estimate ? 1.0f / estimate : 0.0f;
+}
+
+float pl_queue_estimate_vps(pl_queue p)
+{
+ pl_mutex_lock(&p->lock_weak);
+ float estimate = p->vps.estimate;
+ pl_mutex_unlock(&p->lock_weak);
+ return estimate ? 1.0f / estimate : 0.0f;
+}
+
+int pl_queue_num_frames(pl_queue p)
+{
+ pl_mutex_lock(&p->lock_weak);
+ int count = p->queue.num;
+ pl_mutex_unlock(&p->lock_weak);
+ return count;
+}
+
+bool pl_queue_peek(pl_queue p, int idx, struct pl_source_frame *out)
+{
+ pl_mutex_lock(&p->lock_weak);
+ bool ok = idx >= 0 && idx < p->queue.num;
+ if (ok)
+ *out = p->queue.elem[idx]->src;
+ pl_mutex_unlock(&p->lock_weak);
+ return ok;
+}
diff --git a/src/utils/upload.c b/src/utils/upload.c
new file mode 100644
index 0000000..75bd4bb
--- /dev/null
+++ b/src/utils/upload.c
@@ -0,0 +1,382 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "log.h"
+#include "common.h"
+#include "gpu.h"
+
+#include <libplacebo/utils/upload.h>
+
+#define MAX_COMPS 4
+
+struct comp {
+ int order; // e.g. 0, 1, 2, 3 for RGBA
+ int size; // size in bits
+ int shift; // bit-shift / offset in bits
+};
+
+static int compare_comp(const void *pa, const void *pb)
+{
+ const struct comp *a = pa, *b = pb;
+
+ // Move all of the components with a size of 0 to the end, so they can
+ // be ignored outright
+ if (a->size && !b->size)
+ return -1;
+ if (b->size && !a->size)
+ return 1;
+
+ // Otherwise, just compare based on the shift
+ return PL_CMP(a->shift, b->shift);
+}
+
+void pl_plane_data_from_comps(struct pl_plane_data *data, int size[4],
+ int shift[4])
+{
+ struct comp comps[MAX_COMPS];
+ for (int i = 0; i < PL_ARRAY_SIZE(comps); i++) {
+ comps[i].order = i;
+ comps[i].size = size[i];
+ comps[i].shift = shift[i];
+ }
+
+ // Sort the components by shift
+ qsort(comps, MAX_COMPS, sizeof(struct comp), compare_comp);
+
+ // Generate the resulting component size/pad/map
+ int offset = 0;
+ for (int i = 0; i < MAX_COMPS; i++) {
+ if (comps[i].size) {
+ assert(comps[i].shift >= offset);
+ data->component_size[i] = comps[i].size;
+ data->component_pad[i] = comps[i].shift - offset;
+ data->component_map[i] = comps[i].order;
+ offset += data->component_size[i] + data->component_pad[i];
+ } else {
+ // Clear the superfluous entries for sanity
+ data->component_size[i] = 0;
+ data->component_pad[i] = 0;
+ data->component_map[i] = 0;
+ }
+ }
+}
+
+void pl_plane_data_from_mask(struct pl_plane_data *data, uint64_t mask[4])
+{
+ int size[4];
+ int shift[4];
+
+ for (int i = 0; i < PL_ARRAY_SIZE(size); i++) {
+ size[i] = __builtin_popcountll(mask[i]);
+ shift[i] = PL_MAX(0, __builtin_ffsll(mask[i]) - 1);
+
+ // Sanity checking
+ uint64_t mask_reconstructed = (1LLU << size[i]) - 1;
+ mask_reconstructed <<= shift[i];
+ pl_assert(mask_reconstructed == mask[i]);
+ }
+
+ pl_plane_data_from_comps(data, size, shift);
+}
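+
+// Worked example (illustrative): packed RGB565, with masks
+// { 0xF800, 0x07E0, 0x001F, 0 } for R, G and B respectively. The
+// popcount/ffs step yields size = {5, 6, 5, 0} and shift = {11, 5, 0, 0};
+// after sorting by shift, the resulting plane data is
+// component_size = {5, 6, 5, 0}, component_pad = {0, 0, 0, 0} and
+// component_map = {2, 1, 0, 0}, i.e. B, G, R in memory order from the
+// least significant bit upwards.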
+
+bool pl_plane_data_align(struct pl_plane_data *data, struct pl_bit_encoding *out_bits)
+{
+ struct pl_plane_data aligned = *data;
+ struct pl_bit_encoding bits = {0};
+
+ int offset = 0;
+
+#define SET_TEST(var, value) \
+ do { \
+ if (offset == 0) { \
+ (var) = (value); \
+ } else if ((var) != (value)) { \
+ goto misaligned; \
+ } \
+ } while (0)
+
+ for (int i = 0; i < MAX_COMPS; i++) {
+ if (!aligned.component_size[i])
+ break;
+
+        // Can't meaningfully align the alpha channel, so just skip it. This
+        // limitation stems from the fact that `pl_bit_encoding` only applies
+        // to the main color channels, and changing that would be nontrivial.
+ if (aligned.component_map[i] == PL_CHANNEL_A)
+ continue;
+
+ // Color depth is the original component size, before alignment
+ SET_TEST(bits.color_depth, aligned.component_size[i]);
+
+ // Try consuming padding of the current component to align down. This
+ // corresponds to an extra bit shift to the left.
+ int comp_start = offset + aligned.component_pad[i];
+ int left_delta = comp_start - PL_ALIGN2(comp_start - 7, 8);
+ left_delta = PL_MIN(left_delta, aligned.component_pad[i]);
+ aligned.component_pad[i] -= left_delta;
+ aligned.component_size[i] += left_delta;
+ SET_TEST(bits.bit_shift, left_delta);
+
+ // Try consuming padding of the next component to align up. This
+ // corresponds to simply ignoring some extra 0s on the end.
+ int comp_end = comp_start + aligned.component_size[i] - left_delta;
+ int right_delta = PL_ALIGN2(comp_end, 8) - comp_end;
+ if (i+1 == MAX_COMPS || !aligned.component_size[i+1]) {
+ // This is the last component, so we can be greedy
+ aligned.component_size[i] += right_delta;
+ } else {
+ right_delta = PL_MIN(right_delta, aligned.component_pad[i+1]);
+ aligned.component_pad[i+1] -= right_delta;
+ aligned.component_size[i] += right_delta;
+ }
+
+ // Sample depth is the new total component size, including padding
+ SET_TEST(bits.sample_depth, aligned.component_size[i]);
+
+ offset += aligned.component_pad[i] + aligned.component_size[i];
+ }
+
+ // Easy sanity check, to make sure that we don't exceed the known stride
+ if (aligned.pixel_stride && offset > aligned.pixel_stride * 8)
+ goto misaligned;
+
+ *data = aligned;
+ if (out_bits)
+ *out_bits = bits;
+ return true;
+
+misaligned:
+ // Can't properly align anything, so just do a no-op
+ if (out_bits)
+ *out_bits = (struct pl_bit_encoding) {0};
+ return false;
+}
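+
+// Worked example (illustrative): a single-component plane containing 10-bit
+// samples MSB-aligned inside 16-bit words, i.e. component_size = {10} and
+// component_pad = {6}. Consuming the 6 padding bits (a bit shift of 6)
+// grows the component to a byte-aligned 16 bits, yielding
+// bit_encoding = { .sample_depth = 16, .color_depth = 10, .bit_shift = 6 }.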
+
+pl_fmt pl_plane_find_fmt(pl_gpu gpu, int out_map[4], const struct pl_plane_data *data)
+{
+ int dummy[4] = {0};
+ out_map = PL_DEF(out_map, dummy);
+
+ // Endian swapping requires compute shaders (currently)
+ if (data->swapped && !gpu->limits.max_ssbo_size)
+ return NULL;
+
+ // Count the number of components and initialize out_map
+ int num = 0;
+ for (int i = 0; i < PL_ARRAY_SIZE(data->component_size); i++) {
+ out_map[i] = -1;
+ if (data->component_size[i])
+ num = i+1;
+ }
+
+ for (int n = 0; n < gpu->num_formats; n++) {
+ pl_fmt fmt = gpu->formats[n];
+ if (fmt->opaque || fmt->num_components < num)
+ continue;
+ if (fmt->type != data->type || fmt->texel_size != data->pixel_stride)
+ continue;
+ if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE))
+ continue;
+
+ int idx = 0;
+
+ // Try mapping all pl_plane_data components to texture components
+ for (int i = 0; i < num; i++) {
+ // If there's padding we have to map it to an unused physical
+ // component first
+ int pad = data->component_pad[i];
+ if (pad && (idx >= 4 || fmt->host_bits[idx++] != pad))
+ goto next_fmt;
+
+ // Otherwise, try and match this component
+ int size = data->component_size[i];
+ if (size && (idx >= 4 || fmt->host_bits[idx] != size))
+ goto next_fmt;
+ out_map[idx++] = data->component_map[i];
+ }
+
+        // Reject misaligned formats. This is checked last so that the
+        // warning is only logged when misalignment is the sole reason a
+        // format cannot be used, since that usually indicates an API usage
+        // bug.
+ if (data->row_stride % fmt->texel_align) {
+ PL_WARN(gpu, "Rejecting texture format '%s' due to misalignment: "
+ "Row stride %zu is not a clean multiple of texel size %zu! "
+ "This is likely an API usage bug.",
+ fmt->name, data->row_stride, fmt->texel_align);
+ continue;
+ }
+
+ return fmt;
+
+next_fmt: ; // acts as `continue`
+ }
+
+ return NULL;
+}
+
+bool pl_upload_plane(pl_gpu gpu, struct pl_plane *out_plane,
+ pl_tex *tex, const struct pl_plane_data *data)
+{
+ pl_assert(!data->buf ^ !data->pixels); // exactly one
+
+ int out_map[4];
+ pl_fmt fmt = pl_plane_find_fmt(gpu, out_map, data);
+ if (!fmt) {
+ PL_ERR(gpu, "Failed picking any compatible texture format for a plane!");
+ return false;
+
+        // TODO: try soft-converting to a supported format using e.g. zimg?
+ }
+
+ bool ok = pl_tex_recreate(gpu, tex, pl_tex_params(
+ .w = data->width,
+ .h = data->height,
+ .format = fmt,
+ .sampleable = true,
+ .host_writable = true,
+ .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE,
+ ));
+
+ if (!ok) {
+ PL_ERR(gpu, "Failed initializing plane texture!");
+ return false;
+ }
+
+ if (out_plane) {
+ out_plane->texture = *tex;
+ out_plane->components = 0;
+ for (int i = 0; i < PL_ARRAY_SIZE(out_map); i++) {
+ out_plane->component_mapping[i] = out_map[i];
+ if (out_map[i] >= 0)
+ out_plane->components = i+1;
+ }
+ }
+
+ struct pl_tex_transfer_params params = {
+ .tex = *tex,
+ .rc.x1 = data->width, // set these for `pl_tex_transfer_size`
+ .rc.y1 = data->height,
+ .rc.z1 = 1,
+ .row_pitch = PL_DEF(data->row_stride, data->width * fmt->texel_size),
+ .ptr = (void *) data->pixels,
+ .buf = data->buf,
+ .buf_offset = data->buf_offset,
+ .callback = data->callback,
+ .priv = data->priv,
+ };
+
+ pl_buf swapbuf = NULL;
+ if (data->swapped) {
+ const size_t aligned = PL_ALIGN2(pl_tex_transfer_size(&params), 4);
+ swapbuf = pl_buf_create(gpu, pl_buf_params(
+ .size = aligned,
+ .storable = true,
+ .initial_data = params.ptr,
+
+            // Note: This may over-read a few bytes past the end of `ptr` if
+            // the transfer size is not a multiple of the word size, but the
+            // extra texels are ignored by `pl_tex_upload`, so this should be
+            // a non-issue in practice.
+ ));
+ if (!swapbuf) {
+ PL_ERR(gpu, "Failed creating endian swapping buffer!");
+ return false;
+ }
+
+ struct pl_buf_copy_swap_params swap_params = {
+ .src = swapbuf,
+ .dst = swapbuf,
+ .size = aligned,
+ .wordsize = fmt->texel_size / fmt->num_components,
+ };
+
+ bool can_reuse = params.buf && params.buf->params.storable &&
+ params.buf_offset % 4 == 0 &&
+ params.buf_offset + aligned <= params.buf->params.size;
+
+ if (params.ptr) {
+ // Data is already uploaded (no-op), can swap in-place
+ } else if (can_reuse) {
+ // We can sample directly from the source buffer
+ swap_params.src = params.buf;
+ swap_params.src_offset = params.buf_offset;
+ } else {
+ // We sadly need to do a second memcpy
+ assert(params.buf);
+ PL_TRACE(gpu, "Double-slow path! pl_buf_copy -> pl_buf_copy_swap...");
+ pl_buf_copy(gpu, swapbuf, 0, params.buf, params.buf_offset,
+ PL_MIN(aligned, params.buf->params.size - params.buf_offset));
+ }
+
+ if (!pl_buf_copy_swap(gpu, &swap_params)) {
+ PL_ERR(gpu, "Failed swapping endianness!");
+ pl_buf_destroy(gpu, &swapbuf);
+ return false;
+ }
+
+ params.ptr = NULL;
+ params.buf = swapbuf;
+ params.buf_offset = 0;
+ }
+
+ ok = pl_tex_upload(gpu, &params);
+ pl_buf_destroy(gpu, &swapbuf);
+ return ok;
+}
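+
+// Minimal usage sketch (illustrative): uploading a tightly packed 8-bit RGBA
+// plane from host memory. rgba_pixels, width and height are hypothetical.
+//
+//     struct pl_plane plane;
+//     pl_tex tex = NULL;
+//     bool ok = pl_upload_plane(gpu, &plane, &tex, &(struct pl_plane_data) {
+//         .type           = PL_FMT_UNORM,
+//         .width          = width,
+//         .height         = height,
+//         .pixel_stride   = 4,
+//         .component_size = {8, 8, 8, 8},
+//         .component_map  = {0, 1, 2, 3}, // R, G, B, A
+//         .pixels         = rgba_pixels,
+//     });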
+
+bool pl_recreate_plane(pl_gpu gpu, struct pl_plane *out_plane,
+ pl_tex *tex, const struct pl_plane_data *data)
+{
+ if (data->swapped) {
+ PL_ERR(gpu, "Cannot call pl_recreate_plane on non-native endian plane "
+ "data, this is only supported for `pl_upload_plane`!");
+ return false;
+ }
+
+ int out_map[4];
+ pl_fmt fmt = pl_plane_find_fmt(gpu, out_map, data);
+ if (!fmt) {
+ PL_ERR(gpu, "Failed picking any compatible texture format for a plane!");
+ return false;
+ }
+
+ bool ok = pl_tex_recreate(gpu, tex, pl_tex_params(
+ .w = data->width,
+ .h = data->height,
+ .format = fmt,
+ .renderable = true,
+ .host_readable = fmt->caps & PL_FMT_CAP_HOST_READABLE,
+ .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE,
+ .storable = fmt->caps & PL_FMT_CAP_STORABLE,
+ ));
+
+ if (!ok) {
+ PL_ERR(gpu, "Failed initializing plane texture!");
+ return false;
+ }
+
+ if (out_plane) {
+ out_plane->texture = *tex;
+ out_plane->components = 0;
+ for (int i = 0; i < PL_ARRAY_SIZE(out_map); i++) {
+ out_plane->component_mapping[i] = out_map[i];
+ if (out_map[i] >= 0)
+ out_plane->components = i+1;
+ }
+ }
+
+ return true;
+}
diff --git a/src/version.h.in b/src/version.h.in
new file mode 100644
index 0000000..22bdee8
--- /dev/null
+++ b/src/version.h.in
@@ -0,0 +1 @@
+#define BUILD_VERSION "@buildver@"
diff --git a/src/vulkan/command.c b/src/vulkan/command.c
new file mode 100644
index 0000000..5020aff
--- /dev/null
+++ b/src/vulkan/command.c
@@ -0,0 +1,571 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "command.h"
+#include "utils.h"
+
+// returns VK_SUCCESS (completed), VK_TIMEOUT (not yet completed) or an error
+static VkResult vk_cmd_poll(struct vk_cmd *cmd, uint64_t timeout)
+{
+ struct vk_ctx *vk = cmd->pool->vk;
+ return vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &cmd->sync.sem,
+ .pValues = &cmd->sync.value,
+ }, timeout);
+}
+
+static void flush_callbacks(struct vk_ctx *vk)
+{
+ while (vk->num_pending_callbacks) {
+ const struct vk_callback *cb = vk->pending_callbacks++;
+ vk->num_pending_callbacks--;
+ cb->run(cb->priv, cb->arg);
+ }
+}
+
+static void vk_cmd_reset(struct vk_cmd *cmd)
+{
+ struct vk_ctx *vk = cmd->pool->vk;
+
+    // Flush any callbacks left over from a previous command that is still in
+    // the process of being reset, and whose callback in turn triggered the
+    // reset of this command.
+ vk->pending_callbacks = cmd->callbacks.elem;
+ vk->num_pending_callbacks = cmd->callbacks.num;
+ flush_callbacks(vk);
+
+ cmd->callbacks.num = 0;
+ cmd->deps.num = 0;
+ cmd->sigs.num = 0;
+}
+
+static void vk_cmd_destroy(struct vk_cmd *cmd)
+{
+ if (!cmd)
+ return;
+
+ struct vk_ctx *vk = cmd->pool->vk;
+ vk_cmd_poll(cmd, UINT64_MAX);
+ vk_cmd_reset(cmd);
+ vk->DestroySemaphore(vk->dev, cmd->sync.sem, PL_VK_ALLOC);
+ vk->FreeCommandBuffers(vk->dev, cmd->pool->pool, 1, &cmd->buf);
+
+ pl_free(cmd);
+}
+
+static struct vk_cmd *vk_cmd_create(struct vk_cmdpool *pool)
+{
+ struct vk_ctx *vk = pool->vk;
+ struct vk_cmd *cmd = pl_zalloc_ptr(NULL, cmd);
+ cmd->pool = pool;
+
+ VkCommandBufferAllocateInfo ainfo = {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+ .commandPool = pool->pool,
+ .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+ .commandBufferCount = 1,
+ };
+
+ VK(vk->AllocateCommandBuffers(vk->dev, &ainfo, &cmd->buf));
+
+ static const VkSemaphoreTypeCreateInfo stinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+ .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
+ .initialValue = 0,
+ };
+
+ static const VkSemaphoreCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ .pNext = &stinfo,
+ };
+
+ VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &cmd->sync.sem));
+ PL_VK_NAME(SEMAPHORE, cmd->sync.sem, "cmd");
+
+ return cmd;
+
+error:
+ vk_cmd_destroy(cmd);
+ vk->failed = true;
+ return NULL;
+}
+
+void vk_dev_callback(struct vk_ctx *vk, vk_cb callback,
+ const void *priv, const void *arg)
+{
+ pl_mutex_lock(&vk->lock);
+ if (vk->cmds_pending.num > 0) {
+ struct vk_cmd *last_cmd = vk->cmds_pending.elem[vk->cmds_pending.num - 1];
+ vk_cmd_callback(last_cmd, callback, priv, arg);
+ } else {
+ // The device was already idle, so we can just immediately call it
+ callback((void *) priv, (void *) arg);
+ }
+ pl_mutex_unlock(&vk->lock);
+}
+
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback,
+ const void *priv, const void *arg)
+{
+ PL_ARRAY_APPEND(cmd, cmd->callbacks, (struct vk_callback) {
+ .run = callback,
+ .priv = (void *) priv,
+ .arg = (void *) arg,
+ });
+}
+
+void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep)
+{
+ PL_ARRAY_APPEND(cmd, cmd->deps, (VkSemaphoreSubmitInfo) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+ .semaphore = dep.sem,
+ .value = dep.value,
+ .stageMask = stage,
+ });
+}
+
+void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig)
+{
+ VkSemaphoreSubmitInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+ .semaphore = sig.sem,
+ .value = sig.value,
+ .stageMask = stage,
+ };
+
+ // Try updating existing semaphore signal operations in-place
+ for (int i = 0; i < cmd->sigs.num; i++) {
+ if (cmd->sigs.elem[i].semaphore == sig.sem) {
+ pl_assert(sig.value > cmd->sigs.elem[i].value);
+ cmd->sigs.elem[i] = sinfo;
+ return;
+ }
+ }
+
+ PL_ARRAY_APPEND(cmd, cmd->sigs, sinfo);
+}
+
+#define SET(FLAG, CHECK) \
+ if (flags2 & (CHECK)) \
+ flags |= FLAG
+
+static VkAccessFlags lower_access2(VkAccessFlags2 flags2)
+{
+ VkAccessFlags flags = flags2 & VK_ACCESS_FLAG_BITS_MAX_ENUM;
+ SET(VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_READ_BIT);
+ SET(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT);
+ return flags;
+}
+
+static VkPipelineStageFlags lower_stage2(VkPipelineStageFlags2 flags2)
+{
+ VkPipelineStageFlags flags = flags2 & VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM;
+ SET(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_2_COPY_BIT |
+ VK_PIPELINE_STAGE_2_RESOLVE_BIT |
+ VK_PIPELINE_STAGE_2_BLIT_BIT |
+ VK_PIPELINE_STAGE_2_CLEAR_BIT);
+ SET(VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
+ VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT);
+ return flags;
+}
+
+#undef SET
+
+void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info)
+{
+ struct vk_ctx *vk = cmd->pool->vk;
+ if (vk->CmdPipelineBarrier2KHR) {
+ vk->CmdPipelineBarrier2KHR(cmd->buf, info);
+ return;
+ }
+
+ pl_assert(!info->pNext);
+ pl_assert(info->memoryBarrierCount == 0);
+ pl_assert(info->bufferMemoryBarrierCount + info->imageMemoryBarrierCount == 1);
+
+ if (info->bufferMemoryBarrierCount) {
+
+ const VkBufferMemoryBarrier2 *barr2 = info->pBufferMemoryBarriers;
+ const VkBufferMemoryBarrier barr = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+ .pNext = barr2->pNext,
+ .srcAccessMask = lower_access2(barr2->srcAccessMask),
+ .dstAccessMask = lower_access2(barr2->dstAccessMask),
+ .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex,
+ .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex,
+ .buffer = barr2->buffer,
+ .offset = barr2->offset,
+ .size = barr2->size,
+ };
+
+ vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask),
+ lower_stage2(barr2->dstStageMask),
+ info->dependencyFlags,
+ 0, NULL, 1, &barr, 0, NULL);
+
+ } else {
+
+ const VkImageMemoryBarrier2 *barr2 = info->pImageMemoryBarriers;
+ const VkImageMemoryBarrier barr = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+ .pNext = barr2->pNext,
+ .srcAccessMask = lower_access2(barr2->srcAccessMask),
+ .dstAccessMask = lower_access2(barr2->dstAccessMask),
+ .oldLayout = barr2->oldLayout,
+ .newLayout = barr2->newLayout,
+ .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex,
+ .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex,
+ .image = barr2->image,
+ .subresourceRange = barr2->subresourceRange,
+ };
+
+ vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask),
+ lower_stage2(barr2->dstStageMask),
+ info->dependencyFlags,
+ 0, NULL, 0, NULL, 1, &barr);
+ }
+}
+
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem,
+ VkPipelineStageFlags2 stage,
+ VkAccessFlags2 access, bool is_trans)
+{
+ bool is_write = (access & vk_access_write) || is_trans;
+
+ // Writes need to be synchronized against the last *read* (which is
+ // transitively synchronized against the last write), reads only
+ // need to be synchronized against the last write.
+ struct vk_sync_scope last = sem->write;
+ if (is_write && sem->read.access)
+ last = sem->read;
+
+ if (last.queue != cmd->queue) {
+ if (!is_write && sem->read.queue == cmd->queue) {
+ // No semaphore needed in this case because the implicit submission
+ // order execution dependencies already transitively imply a wait
+ // for the previous write
+ } else if (last.sync.sem) {
+ // Image barrier still needs to depend on this stage for implicit
+ // ordering guarantees to apply properly
+ vk_cmd_dep(cmd, stage, last.sync);
+ last.stage = stage;
+ }
+
+ // Last access is on different queue, so no pipeline barrier needed
+ last.access = 0;
+ }
+
+ if (!is_write && sem->read.queue == cmd->queue &&
+ (sem->read.stage & stage) == stage &&
+ (sem->read.access & access) == access)
+ {
+ // A past pipeline barrier already covers this access transitively, so
+ // we don't need to emit another pipeline barrier at all
+ last.access = 0;
+ }
+
+ if (is_write) {
+ sem->write = (struct vk_sync_scope) {
+ .sync = cmd->sync,
+ .queue = cmd->queue,
+ .stage = stage,
+ .access = access,
+ };
+
+ sem->read = (struct vk_sync_scope) {
+ .sync = cmd->sync,
+ .queue = cmd->queue,
+ // no stage or access scope, because no reads happened yet
+ };
+ } else if (sem->read.queue == cmd->queue) {
+ // Coalesce multiple same-queue reads into a single access scope
+ sem->read.sync = cmd->sync;
+ sem->read.stage |= stage;
+ sem->read.access |= access;
+ } else {
+ sem->read = (struct vk_sync_scope) {
+ .sync = cmd->sync,
+ .queue = cmd->queue,
+ .stage = stage,
+ .access = access,
+ };
+ }
+
+ // We never need to include pipeline barriers for reads, only writes
+ last.access &= vk_access_write;
+ return last;
+}
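+
+// Illustrative sequence for the access tracking above: a storage (compute)
+// write records itself in sem->write. A subsequent sampled read on the same
+// queue gets that write back as the previous scope (so the caller emits a
+// pipeline barrier) and records itself in sem->read. A second read with the
+// same stage and access bits is then already covered by sem->read, so an
+// empty scope is returned and no further barrier is emitted.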
+
+struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum,
+ VkQueueFamilyProperties props)
+{
+ struct vk_cmdpool *pool = pl_alloc_ptr(NULL, pool);
+ *pool = (struct vk_cmdpool) {
+ .vk = vk,
+ .props = props,
+ .qf = qf,
+ .queues = pl_calloc(pool, qnum, sizeof(VkQueue)),
+ .num_queues = qnum,
+ };
+
+ for (int n = 0; n < qnum; n++)
+ vk->GetDeviceQueue(vk->dev, qf, n, &pool->queues[n]);
+
+ VkCommandPoolCreateInfo cinfo = {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+ .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
+ VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+ .queueFamilyIndex = qf,
+ };
+
+ VK(vk->CreateCommandPool(vk->dev, &cinfo, PL_VK_ALLOC, &pool->pool));
+ return pool;
+
+error:
+ vk_cmdpool_destroy(pool);
+ vk->failed = true;
+ return NULL;
+}
+
+void vk_cmdpool_destroy(struct vk_cmdpool *pool)
+{
+ if (!pool)
+ return;
+
+ for (int i = 0; i < pool->cmds.num; i++)
+ vk_cmd_destroy(pool->cmds.elem[i]);
+
+ struct vk_ctx *vk = pool->vk;
+ vk->DestroyCommandPool(vk->dev, pool->pool, PL_VK_ALLOC);
+ pl_free(pool);
+}
+
+struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag)
+{
+ struct vk_ctx *vk = pool->vk;
+
+ // Garbage collect the cmdpool first, to increase the chances of getting
+ // an already-available command buffer.
+ vk_poll_commands(vk, 0);
+
+ struct vk_cmd *cmd = NULL;
+ pl_mutex_lock(&vk->lock);
+ if (!PL_ARRAY_POP(pool->cmds, &cmd)) {
+ cmd = vk_cmd_create(pool);
+ if (!cmd) {
+ pl_mutex_unlock(&vk->lock);
+ goto error;
+ }
+ }
+
+ cmd->qindex = pool->idx_queues;
+ cmd->queue = pool->queues[cmd->qindex];
+ pl_mutex_unlock(&vk->lock);
+
+ VkCommandBufferBeginInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+ .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+ };
+
+ VK(vk->BeginCommandBuffer(cmd->buf, &binfo));
+
+ debug_tag = PL_DEF(debug_tag, "vk_cmd");
+ PL_VK_NAME_HANDLE(COMMAND_BUFFER, cmd->buf, debug_tag);
+ PL_VK_NAME(SEMAPHORE, cmd->sync.sem, debug_tag);
+
+ cmd->sync.value++;
+ vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, cmd->sync);
+ return cmd;
+
+error:
+ // Something has to be seriously messed up if we get to this point
+ vk_cmd_destroy(cmd);
+ vk->failed = true;
+ return NULL;
+}
+
+static VkResult vk_queue_submit2(struct vk_ctx *vk, VkQueue queue,
+ const VkSubmitInfo2 *info2, VkFence fence)
+{
+ if (vk->QueueSubmit2KHR)
+ return vk->QueueSubmit2KHR(queue, 1, info2, fence);
+
+ const uint32_t num_deps = info2->waitSemaphoreInfoCount;
+ const uint32_t num_sigs = info2->signalSemaphoreInfoCount;
+ const uint32_t num_cmds = info2->commandBufferInfoCount;
+
+ void *tmp = pl_tmp(NULL);
+ VkSemaphore *deps = pl_calloc_ptr(tmp, num_deps, deps);
+ VkPipelineStageFlags *masks = pl_calloc_ptr(tmp, num_deps, masks);
+ uint64_t *depvals = pl_calloc_ptr(tmp, num_deps, depvals);
+ VkSemaphore *sigs = pl_calloc_ptr(tmp, num_sigs, sigs);
+ uint64_t *sigvals = pl_calloc_ptr(tmp, num_sigs, sigvals);
+ VkCommandBuffer *cmds = pl_calloc_ptr(tmp, num_cmds, cmds);
+
+ for (int i = 0; i < num_deps; i++) {
+ deps[i] = info2->pWaitSemaphoreInfos[i].semaphore;
+ masks[i] = info2->pWaitSemaphoreInfos[i].stageMask;
+ depvals[i] = info2->pWaitSemaphoreInfos[i].value;
+ }
+ for (int i = 0; i < num_sigs; i++) {
+ sigs[i] = info2->pSignalSemaphoreInfos[i].semaphore;
+ sigvals[i] = info2->pSignalSemaphoreInfos[i].value;
+ }
+ for (int i = 0; i < num_cmds; i++)
+ cmds[i] = info2->pCommandBufferInfos[i].commandBuffer;
+
+ const VkTimelineSemaphoreSubmitInfo tinfo = {
+ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
+ .pNext = info2->pNext,
+ .waitSemaphoreValueCount = num_deps,
+ .pWaitSemaphoreValues = depvals,
+ .signalSemaphoreValueCount = num_sigs,
+ .pSignalSemaphoreValues = sigvals,
+ };
+
+ const VkSubmitInfo info = {
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .pNext = &tinfo,
+ .waitSemaphoreCount = num_deps,
+ .pWaitSemaphores = deps,
+ .pWaitDstStageMask = masks,
+ .commandBufferCount = num_cmds,
+ .pCommandBuffers = cmds,
+ .signalSemaphoreCount = num_sigs,
+ .pSignalSemaphores = sigs,
+ };
+
+ VkResult res = vk->QueueSubmit(queue, 1, &info, fence);
+ pl_free(tmp);
+ return res;
+}
+
+bool vk_cmd_submit(struct vk_cmd **pcmd)
+{
+ struct vk_cmd *cmd = *pcmd;
+ if (!cmd)
+ return true;
+
+ *pcmd = NULL;
+ struct vk_cmdpool *pool = cmd->pool;
+ struct vk_ctx *vk = pool->vk;
+
+ VK(vk->EndCommandBuffer(cmd->buf));
+
+ VkSubmitInfo2 sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+ .waitSemaphoreInfoCount = cmd->deps.num,
+ .pWaitSemaphoreInfos = cmd->deps.elem,
+ .signalSemaphoreInfoCount = cmd->sigs.num,
+ .pSignalSemaphoreInfos = cmd->sigs.elem,
+ .commandBufferInfoCount = 1,
+ .pCommandBufferInfos = &(VkCommandBufferSubmitInfo) {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+ .commandBuffer = cmd->buf,
+ },
+ };
+
+ if (pl_msg_test(vk->log, PL_LOG_TRACE)) {
+ PL_TRACE(vk, "Submitting command %p on queue %p (QF %d):",
+ (void *) cmd->buf, (void *) cmd->queue, pool->qf);
+ for (int n = 0; n < cmd->deps.num; n++) {
+ PL_TRACE(vk, " waits on semaphore 0x%"PRIx64" = %"PRIu64,
+ (uint64_t) cmd->deps.elem[n].semaphore, cmd->deps.elem[n].value);
+ }
+ for (int n = 0; n < cmd->sigs.num; n++) {
+ PL_TRACE(vk, " signals semaphore 0x%"PRIx64" = %"PRIu64,
+ (uint64_t) cmd->sigs.elem[n].semaphore, cmd->sigs.elem[n].value);
+ }
+ if (cmd->callbacks.num)
+ PL_TRACE(vk, " signals %d callbacks", cmd->callbacks.num);
+ }
+
+ vk->lock_queue(vk->queue_ctx, pool->qf, cmd->qindex);
+ VkResult res = vk_queue_submit2(vk, cmd->queue, &sinfo, VK_NULL_HANDLE);
+ vk->unlock_queue(vk->queue_ctx, pool->qf, cmd->qindex);
+ PL_VK_ASSERT(res, "vkQueueSubmit2");
+
+ pl_mutex_lock(&vk->lock);
+ PL_ARRAY_APPEND(vk->alloc, vk->cmds_pending, cmd);
+ pl_mutex_unlock(&vk->lock);
+ return true;
+
+error:
+ vk_cmd_reset(cmd);
+ pl_mutex_lock(&vk->lock);
+ PL_ARRAY_APPEND(pool, pool->cmds, cmd);
+ pl_mutex_unlock(&vk->lock);
+ vk->failed = true;
+ return false;
+}
+
+bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout)
+{
+ bool ret = false;
+ pl_mutex_lock(&vk->lock);
+
+ while (vk->cmds_pending.num) {
+ struct vk_cmd *cmd = vk->cmds_pending.elem[0];
+ struct vk_cmdpool *pool = cmd->pool;
+ pl_mutex_unlock(&vk->lock); // don't hold mutex while blocking
+ if (vk_cmd_poll(cmd, timeout) == VK_TIMEOUT)
+ return ret;
+ pl_mutex_lock(&vk->lock);
+ if (!vk->cmds_pending.num || vk->cmds_pending.elem[0] != cmd)
+ continue; // another thread modified this state while blocking
+
+ PL_TRACE(vk, "VkSemaphore signalled: 0x%"PRIx64" = %"PRIu64,
+ (uint64_t) cmd->sync.sem, cmd->sync.value);
+ PL_ARRAY_REMOVE_AT(vk->cmds_pending, 0); // remove before callbacks
+ vk_cmd_reset(cmd);
+ PL_ARRAY_APPEND(pool, pool->cmds, cmd);
+ ret = true;
+
+        // If we've successfully spent some time waiting for at least one
+        // command, disable the timeout. This serves a dual purpose: it avoids
+        // over-waiting due to the timeout being applied repeatedly, and it
+        // ensures we don't block on future commands if we've already spent
+        // time waiting for one.
+ timeout = 0;
+ }
+
+ pl_mutex_unlock(&vk->lock);
+ return ret;
+}
+
+void vk_rotate_queues(struct vk_ctx *vk)
+{
+ pl_mutex_lock(&vk->lock);
+
+ // Rotate the queues to ensure good parallelism across frames
+ for (int i = 0; i < vk->pools.num; i++) {
+ struct vk_cmdpool *pool = vk->pools.elem[i];
+ pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues;
+ PL_TRACE(vk, "QF %d: %d/%d", pool->qf, pool->idx_queues, pool->num_queues);
+ }
+
+ pl_mutex_unlock(&vk->lock);
+}
+
+void vk_wait_idle(struct vk_ctx *vk)
+{
+ while (vk_poll_commands(vk, UINT64_MAX)) ;
+}
diff --git a/src/vulkan/command.h b/src/vulkan/command.h
new file mode 100644
index 0000000..4c70482
--- /dev/null
+++ b/src/vulkan/command.h
@@ -0,0 +1,142 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include "common.h"
+
+// Since lots of vulkan operations need to be done lazily once the affected
+// resources are no longer in use, provide an abstraction for tracking these.
+// In practice, these are only checked and run when submitting new commands, so
+// the actual execution may be delayed by a frame.
+typedef void (*vk_cb)(void *p, void *arg);
+
+struct vk_callback {
+ vk_cb run;
+ void *priv;
+ void *arg;
+};
+
+// Associate a callback with the completion of all currently pending commands.
+// This will essentially run once the device is completely idle.
+void vk_dev_callback(struct vk_ctx *vk, vk_cb callback,
+ const void *priv, const void *arg);
+
+// Helper wrapper around command buffers that also track dependencies,
+// callbacks and synchronization primitives
+//
+// Thread-safety: Unsafe
+struct vk_cmd {
+ struct vk_cmdpool *pool; // pool it was allocated from
+ pl_vulkan_sem sync; // pending execution, tied to lifetime of device
+ VkQueue queue; // the submission queue (for recording/pending)
+ int qindex; // the index of `queue` in `pool`
+ VkCommandBuffer buf; // the command buffer itself
+ // Command dependencies and signals. Not owned by the vk_cmd.
+ PL_ARRAY(VkSemaphoreSubmitInfo) deps;
+ PL_ARRAY(VkSemaphoreSubmitInfo) sigs;
+ // "Callbacks" to fire once a command completes. These are used for
+ // multiple purposes, ranging from resource deallocation to fencing.
+ PL_ARRAY(struct vk_callback) callbacks;
+};
+
+// Associate a callback with the completion of the current command. This
+// function will be run once the command completes, or shortly thereafter.
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback,
+ const void *priv, const void *arg);
+
+// Associate a raw dependency for the current command. This semaphore must
+// signal by the corresponding stage before the command may execute.
+void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep);
+
+// Associate a raw signal with the current command. This semaphore will signal
+// after the given stage completes.
+void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig);
+
+// Compatibility wrapper around vkCmdPipelineBarrier2 (also works with pre-1.3)
+void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info);
+
+// Synchronization scope
+struct vk_sync_scope {
+ pl_vulkan_sem sync; // semaphore of last access
+ VkQueue queue; // source queue of last access
+ VkPipelineStageFlags2 stage;// stage bitmask of last access
+ VkAccessFlags2 access; // access type bitmask
+};
+
+// Synchronization primitive
+struct vk_sem {
+ struct vk_sync_scope read, write;
+};
+
+// Updates the `vk_sem` state for a given access. If `is_trans` is set, this
+// access is treated as a write (since it alters the resource's state).
+//
+// Returns a struct describing the previous access to a resource. A pipeline
+// barrier is only required if the previous access scope is nonzero.
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem,
+ VkPipelineStageFlags2 stage,
+ VkAccessFlags2 access, bool is_trans);
+
+// Command pool / queue family hybrid abstraction
+struct vk_cmdpool {
+ struct vk_ctx *vk;
+ VkQueueFamilyProperties props;
+ int qf; // queue family index
+ VkCommandPool pool;
+ VkQueue *queues;
+ int num_queues;
+ int idx_queues;
+ // Command buffers associated with this queue. These are available for
+ // re-recording
+ PL_ARRAY(struct vk_cmd *) cmds;
+};
+
+// Set up a vk_cmdpool corresponding to a queue family. `qnum` may be less than
+// `props.queueCount`, to restrict the number of queues in this queue family.
+struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum,
+ VkQueueFamilyProperties props);
+
+void vk_cmdpool_destroy(struct vk_cmdpool *pool);
+
+// Fetch a command buffer from a command pool and begin recording to it.
+// Returns NULL on failure.
+struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag);
+
+// Finish recording a command buffer and submit it for execution. This function
+// takes over ownership of **cmd, and sets *cmd to NULL in doing so.
+bool vk_cmd_submit(struct vk_cmd **cmd);
+
+// Block until some commands complete executing. This is the only function that
+// actually processes the callbacks. Will wait at most `timeout` nanoseconds
+// for the completion of any command. The timeout may also be passed as 0, in
+// which case this function will not block, but only poll for completed
+// commands. Returns whether any forward progress was made.
+//
+// This does *not* flush any queued commands; forgetting to do so may result
+// in infinite loops when waiting for the completion of callbacks attached to
+// commands that were never flushed!
+bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout);
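+
+// Typical lifecycle (illustrative sketch; record_work, free_resources and
+// res are hypothetical):
+//
+//     struct vk_cmd *cmd = vk_cmd_begin(pool, NULL);
+//     if (cmd) {
+//         record_work(cmd->buf);                      // record into cmd->buf
+//         vk_cmd_callback(cmd, free_resources, res, NULL); // run on completion
+//         vk_cmd_submit(&cmd);                        // sets cmd to NULL
+//     }
+//     vk_poll_commands(vk, 0); // non-blocking: fire any completed callbacks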
+
+// Rotate through queues in each command pool. Call this once per frame, after
+// submitting all of the command buffers for that frame. Calling this more
+// often than that is possible but bad for performance.
+void vk_rotate_queues(struct vk_ctx *vk);
+
+// Wait until all commands are complete, i.e. the device is idle. This is
+// basically equivalent to calling `vk_poll_commands` with a timeout of
+// UINT64_MAX until it returns `false`.
+void vk_wait_idle(struct vk_ctx *vk);
diff --git a/src/vulkan/common.h b/src/vulkan/common.h
new file mode 100644
index 0000000..31b309e
--- /dev/null
+++ b/src/vulkan/common.h
@@ -0,0 +1,234 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#define VK_NO_PROTOTYPES
+#define VK_ENABLE_BETA_EXTENSIONS // for VK_KHR_portability_subset
+#define VK_USE_PLATFORM_METAL_EXT
+
+#include "../common.h"
+#include "../log.h"
+#include "../pl_thread.h"
+
+#include <libplacebo/vulkan.h>
+
+#ifdef PL_HAVE_WIN32
+#include <windows.h>
+#include <vulkan/vulkan_win32.h>
+#endif
+
+// Vulkan allows the optional use of a custom allocator. We don't need one,
+// but give this parameter a better name in case we ever decide to change
+// this in the future (and to make the code more readable).
+#define PL_VK_ALLOC NULL
+
+// Type of a vulkan function that needs to be loaded
+#define PL_VK_FUN(name) PFN_vk##name name
+
+// Load a vulkan instance-level extension function directly (on the stack)
+#define PL_VK_LOAD_FUN(inst, name, get_addr) \
+ PL_VK_FUN(name) = (PFN_vk##name) get_addr(inst, "vk" #name);
+
+#ifndef VK_VENDOR_ID_NVIDIA
+#define VK_VENDOR_ID_NVIDIA 0x10DE
+#endif
+
+// Shared struct used to hold vulkan context information
+struct vk_ctx {
+ pl_mutex lock;
+ pl_vulkan vulkan;
+ void *alloc; // host allocations bound to the lifetime of this vk_ctx
+ struct vk_malloc *ma; // VRAM malloc layer
+ pl_vk_inst internal_instance;
+ pl_log log;
+ VkInstance inst;
+ VkPhysicalDevice physd;
+ VkPhysicalDeviceProperties props;
+ VkPhysicalDeviceFeatures2 features;
+ uint32_t api_ver; // device API version
+ VkDevice dev;
+ bool imported; // device was not created by us
+
+ // Generic error flag for catching "failed" devices
+ bool failed;
+
+ // Enabled extensions
+ PL_ARRAY(const char *) exts;
+
+ // Command pools (one per queue family)
+ PL_ARRAY(struct vk_cmdpool *) pools;
+
+ // Pointers into `pools` (always set)
+ struct vk_cmdpool *pool_graphics;
+ struct vk_cmdpool *pool_compute;
+ struct vk_cmdpool *pool_transfer;
+
+ // Queue locking functions
+ PL_ARRAY(PL_ARRAY(pl_mutex)) queue_locks;
+ void (*lock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx);
+ void (*unlock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx);
+ void *queue_ctx;
+
+    // Pending commands. These are shared for the entire vk_ctx to ensure
+    // that submission and callbacks are FIFO
+ PL_ARRAY(struct vk_cmd *) cmds_pending; // submitted but not completed
+
+ // Pending callbacks that still need to be drained before processing
+ // callbacks for the next command (in case commands are recursively being
+ // polled from another callback)
+ const struct vk_callback *pending_callbacks;
+ int num_pending_callbacks;
+
+ // Instance-level function pointers
+ PL_VK_FUN(CreateDevice);
+ PL_VK_FUN(EnumerateDeviceExtensionProperties);
+ PL_VK_FUN(GetDeviceProcAddr);
+ PL_VK_FUN(GetInstanceProcAddr);
+ PL_VK_FUN(GetPhysicalDeviceExternalBufferProperties);
+ PL_VK_FUN(GetPhysicalDeviceExternalSemaphoreProperties);
+ PL_VK_FUN(GetPhysicalDeviceFeatures2KHR);
+ PL_VK_FUN(GetPhysicalDeviceFormatProperties);
+ PL_VK_FUN(GetPhysicalDeviceFormatProperties2KHR);
+ PL_VK_FUN(GetPhysicalDeviceImageFormatProperties2KHR);
+ PL_VK_FUN(GetPhysicalDeviceMemoryProperties);
+ PL_VK_FUN(GetPhysicalDeviceProperties);
+ PL_VK_FUN(GetPhysicalDeviceProperties2);
+ PL_VK_FUN(GetPhysicalDeviceQueueFamilyProperties);
+ PL_VK_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR);
+ PL_VK_FUN(GetPhysicalDeviceSurfaceFormatsKHR);
+ PL_VK_FUN(GetPhysicalDeviceSurfacePresentModesKHR);
+ PL_VK_FUN(GetPhysicalDeviceSurfaceSupportKHR);
+
+ // Device-level function pointers
+ PL_VK_FUN(AcquireNextImageKHR);
+ PL_VK_FUN(AllocateCommandBuffers);
+ PL_VK_FUN(AllocateDescriptorSets);
+ PL_VK_FUN(AllocateMemory);
+ PL_VK_FUN(BeginCommandBuffer);
+ PL_VK_FUN(BindBufferMemory);
+ PL_VK_FUN(BindImageMemory);
+ PL_VK_FUN(CmdBeginDebugUtilsLabelEXT);
+ PL_VK_FUN(CmdBeginRenderPass);
+ PL_VK_FUN(CmdBindDescriptorSets);
+ PL_VK_FUN(CmdBindIndexBuffer);
+ PL_VK_FUN(CmdBindPipeline);
+ PL_VK_FUN(CmdBindVertexBuffers);
+ PL_VK_FUN(CmdBlitImage);
+ PL_VK_FUN(CmdClearColorImage);
+ PL_VK_FUN(CmdCopyBuffer);
+ PL_VK_FUN(CmdCopyBufferToImage);
+ PL_VK_FUN(CmdCopyImage);
+ PL_VK_FUN(CmdCopyImageToBuffer);
+ PL_VK_FUN(CmdDispatch);
+ PL_VK_FUN(CmdDraw);
+ PL_VK_FUN(CmdDrawIndexed);
+ PL_VK_FUN(CmdEndDebugUtilsLabelEXT);
+ PL_VK_FUN(CmdEndRenderPass);
+ PL_VK_FUN(CmdPipelineBarrier);
+ PL_VK_FUN(CmdPipelineBarrier2KHR);
+ PL_VK_FUN(CmdPushConstants);
+ PL_VK_FUN(CmdPushDescriptorSetKHR);
+ PL_VK_FUN(CmdResetQueryPool);
+ PL_VK_FUN(CmdSetScissor);
+ PL_VK_FUN(CmdSetViewport);
+ PL_VK_FUN(CmdUpdateBuffer);
+ PL_VK_FUN(CmdWriteTimestamp);
+ PL_VK_FUN(CreateBuffer);
+ PL_VK_FUN(CreateBufferView);
+ PL_VK_FUN(CreateCommandPool);
+ PL_VK_FUN(CreateComputePipelines);
+ PL_VK_FUN(CreateDebugReportCallbackEXT);
+ PL_VK_FUN(CreateDescriptorPool);
+ PL_VK_FUN(CreateDescriptorSetLayout);
+ PL_VK_FUN(CreateFence);
+ PL_VK_FUN(CreateFramebuffer);
+ PL_VK_FUN(CreateGraphicsPipelines);
+ PL_VK_FUN(CreateImage);
+ PL_VK_FUN(CreateImageView);
+ PL_VK_FUN(CreatePipelineCache);
+ PL_VK_FUN(CreatePipelineLayout);
+ PL_VK_FUN(CreateQueryPool);
+ PL_VK_FUN(CreateRenderPass);
+ PL_VK_FUN(CreateSampler);
+ PL_VK_FUN(CreateSemaphore);
+ PL_VK_FUN(CreateShaderModule);
+ PL_VK_FUN(CreateSwapchainKHR);
+ PL_VK_FUN(DestroyBuffer);
+ PL_VK_FUN(DestroyBufferView);
+ PL_VK_FUN(DestroyCommandPool);
+ PL_VK_FUN(DestroyDebugReportCallbackEXT);
+ PL_VK_FUN(DestroyDescriptorPool);
+ PL_VK_FUN(DestroyDescriptorSetLayout);
+ PL_VK_FUN(DestroyDevice);
+ PL_VK_FUN(DestroyFence);
+ PL_VK_FUN(DestroyFramebuffer);
+ PL_VK_FUN(DestroyImage);
+ PL_VK_FUN(DestroyImageView);
+ PL_VK_FUN(DestroyInstance);
+ PL_VK_FUN(DestroyPipeline);
+ PL_VK_FUN(DestroyPipelineCache);
+ PL_VK_FUN(DestroyPipelineLayout);
+ PL_VK_FUN(DestroyQueryPool);
+ PL_VK_FUN(DestroyRenderPass);
+ PL_VK_FUN(DestroySampler);
+ PL_VK_FUN(DestroySemaphore);
+ PL_VK_FUN(DestroyShaderModule);
+ PL_VK_FUN(DestroySwapchainKHR);
+ PL_VK_FUN(DeviceWaitIdle);
+ PL_VK_FUN(EndCommandBuffer);
+ PL_VK_FUN(FlushMappedMemoryRanges);
+ PL_VK_FUN(FreeCommandBuffers);
+ PL_VK_FUN(FreeMemory);
+ PL_VK_FUN(GetBufferMemoryRequirements);
+ PL_VK_FUN(GetDeviceQueue);
+ PL_VK_FUN(GetImageDrmFormatModifierPropertiesEXT);
+ PL_VK_FUN(GetImageMemoryRequirements2);
+ PL_VK_FUN(GetImageSubresourceLayout);
+ PL_VK_FUN(GetMemoryFdKHR);
+ PL_VK_FUN(GetMemoryFdPropertiesKHR);
+ PL_VK_FUN(GetMemoryHostPointerPropertiesEXT);
+ PL_VK_FUN(GetPipelineCacheData);
+ PL_VK_FUN(GetQueryPoolResults);
+ PL_VK_FUN(GetSemaphoreFdKHR);
+ PL_VK_FUN(GetSwapchainImagesKHR);
+ PL_VK_FUN(InvalidateMappedMemoryRanges);
+ PL_VK_FUN(MapMemory);
+ PL_VK_FUN(QueuePresentKHR);
+ PL_VK_FUN(QueueSubmit);
+ PL_VK_FUN(QueueSubmit2KHR);
+ PL_VK_FUN(QueueWaitIdle);
+ PL_VK_FUN(ResetFences);
+ PL_VK_FUN(ResetQueryPool);
+ PL_VK_FUN(SetDebugUtilsObjectNameEXT);
+ PL_VK_FUN(SetHdrMetadataEXT);
+ PL_VK_FUN(UpdateDescriptorSets);
+ PL_VK_FUN(WaitForFences);
+ PL_VK_FUN(WaitSemaphores);
+
+#ifdef PL_HAVE_WIN32
+ PL_VK_FUN(GetMemoryWin32HandleKHR);
+ PL_VK_FUN(GetSemaphoreWin32HandleKHR);
+#endif
+
+#ifdef VK_EXT_metal_objects
+ PL_VK_FUN(ExportMetalObjectsEXT);
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+ PL_VK_FUN(AcquireFullScreenExclusiveModeEXT);
+#endif
+};
diff --git a/src/vulkan/context.c b/src/vulkan/context.c
new file mode 100644
index 0000000..ad8a859
--- /dev/null
+++ b/src/vulkan/context.c
@@ -0,0 +1,1704 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "command.h"
+#include "utils.h"
+#include "gpu.h"
+
+#ifdef PL_HAVE_VK_PROC_ADDR
+VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr(
+ VkInstance instance,
+ const char* pName);
+#endif
+
+const struct pl_vk_inst_params pl_vk_inst_default_params = {0};
+
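+// Table entry describing a Vulkan function pointer: its name and the offset
+// of the corresponding PL_VK_FUN slot inside struct vk_ctx, plus whether it
+// must be loaded with vkGetDeviceProcAddr instead of vkGetInstanceProcAddr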
+struct vk_fun {
+ const char *name;
+ size_t offset;
+ bool device_level;
+};
+
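+// Optional device extension, together with the functions it provides (if any)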
+struct vk_ext {
+ const char *name;
+ const struct vk_fun *funs;
+};
+
+#define PL_VK_INST_FUN(N) \
+ { .name = "vk" #N, \
+ .offset = offsetof(struct vk_ctx, N), \
+ }
+
+#define PL_VK_DEV_FUN(N) \
+ { .name = "vk" #N, \
+ .offset = offsetof(struct vk_ctx, N), \
+ .device_level = true, \
+ }
+
+// Table of optional vulkan instance extensions
+static const char *vk_instance_extensions[] = {
+ VK_KHR_SURFACE_EXTENSION_NAME,
+ VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
+ VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME,
+};
+
+// List of mandatory instance-level function pointers, including functions
+// associated with mandatory instance extensions
+static const struct vk_fun vk_inst_funs[] = {
+ PL_VK_INST_FUN(CreateDevice),
+ PL_VK_INST_FUN(EnumerateDeviceExtensionProperties),
+ PL_VK_INST_FUN(GetDeviceProcAddr),
+ PL_VK_INST_FUN(GetPhysicalDeviceExternalBufferProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceExternalSemaphoreProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceFeatures2KHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties2KHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceImageFormatProperties2KHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceMemoryProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceProperties2),
+ PL_VK_INST_FUN(GetPhysicalDeviceQueueFamilyProperties),
+
+ // These are not actually mandatory, but they're universal enough that we
+ // just load them unconditionally (since we lack proper support for loading
+ // arbitrary instance extensions). Their use is generally guarded behind
+ // various VkSurfaceKHR values already being provided by the API user
+ // (implying the corresponding surface extension is loaded).
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceFormatsKHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfacePresentModesKHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceSupportKHR),
+};
+
+// Table of vulkan device extensions and functions they load, including
+// functions exported by dependent instance-level extensions
+static const struct vk_ext vk_device_extensions[] = {
+ {
+ .name = VK_KHR_SWAPCHAIN_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(AcquireNextImageKHR),
+ PL_VK_DEV_FUN(CreateSwapchainKHR),
+ PL_VK_DEV_FUN(DestroySwapchainKHR),
+ PL_VK_DEV_FUN(GetSwapchainImagesKHR),
+ PL_VK_DEV_FUN(QueuePresentKHR),
+ {0}
+ },
+ }, {
+ .name = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(CmdPushDescriptorSetKHR),
+ {0}
+ },
+ }, {
+ .name = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetMemoryFdKHR),
+ {0}
+ },
+ }, {
+ .name = VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetMemoryFdPropertiesKHR),
+ {0}
+ },
+#ifdef PL_HAVE_WIN32
+ }, {
+ .name = VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetMemoryWin32HandleKHR),
+ {0}
+ },
+#endif
+ }, {
+ .name = VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetMemoryHostPointerPropertiesEXT),
+ {0}
+ },
+ }, {
+ .name = VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetSemaphoreFdKHR),
+ {0}
+ },
+#ifdef PL_HAVE_WIN32
+ }, {
+ .name = VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetSemaphoreWin32HandleKHR),
+ {0}
+ },
+#endif
+ }, {
+ .name = VK_EXT_PCI_BUS_INFO_EXTENSION_NAME,
+ }, {
+ .name = VK_EXT_HDR_METADATA_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(SetHdrMetadataEXT),
+ {0}
+ },
+ }, {
+ .name = VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetImageDrmFormatModifierPropertiesEXT),
+ {0}
+ },
+#ifdef VK_KHR_portability_subset
+ }, {
+ .name = VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME,
+#endif
+#ifdef VK_EXT_metal_objects
+ }, {
+ .name = VK_EXT_METAL_OBJECTS_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(ExportMetalObjectsEXT),
+ {0}
+ },
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+ }, {
+ .name = VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(AcquireFullScreenExclusiveModeEXT),
+ {0}
+ },
+#endif
+ }, {
+ .name = VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(CmdPipelineBarrier2KHR),
+ PL_VK_DEV_FUN(QueueSubmit2KHR),
+ {0}
+ },
+ },
+};
+
+// Make sure to keep this in sync with the above!
+const char * const pl_vulkan_recommended_extensions[] = {
+ VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+ VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME,
+ VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+#ifdef PL_HAVE_WIN32
+ VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+#endif
+ VK_EXT_PCI_BUS_INFO_EXTENSION_NAME,
+ VK_EXT_HDR_METADATA_EXTENSION_NAME,
+ VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME,
+#ifdef VK_KHR_portability_subset
+ VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME,
+#endif
+#ifdef VK_EXT_metal_objects
+ VK_EXT_METAL_OBJECTS_EXTENSION_NAME,
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+ VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME,
+#endif
+ VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
+};
+
+const int pl_vulkan_num_recommended_extensions =
+ PL_ARRAY_SIZE(pl_vulkan_recommended_extensions);
+
+// +1 because VK_KHR_swapchain is not part of the recommended extensions; it
+// is only enabled explicitly when a surface is provided
+static_assert(PL_ARRAY_SIZE(pl_vulkan_recommended_extensions) + 1 ==
+ PL_ARRAY_SIZE(vk_device_extensions),
+ "pl_vulkan_recommended_extensions out of sync with "
+ "vk_device_extensions?");
+
+// Recommended features; keep in sync with libavutil vulkan hwcontext
+static const VkPhysicalDeviceVulkan13Features recommended_vk13 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES,
+ .computeFullSubgroups = true,
+ .maintenance4 = true,
+ .shaderZeroInitializeWorkgroupMemory = true,
+ .synchronization2 = true,
+};
+
+static const VkPhysicalDeviceVulkan12Features recommended_vk12 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
+ .pNext = (void *) &recommended_vk13,
+ .bufferDeviceAddress = true,
+ .storagePushConstant8 = true,
+ .shaderInt8 = true,
+ .shaderFloat16 = true,
+ .shaderSharedInt64Atomics = true,
+ .storageBuffer8BitAccess = true,
+ .uniformAndStorageBuffer8BitAccess = true,
+ .vulkanMemoryModel = true,
+ .vulkanMemoryModelDeviceScope = true,
+};
+
+static const VkPhysicalDeviceVulkan11Features recommended_vk11 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
+ .pNext = (void *) &recommended_vk12,
+ .samplerYcbcrConversion = true,
+ .storagePushConstant16 = true,
+};
+
+const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+ .pNext = (void *) &recommended_vk11,
+ .features = {
+ .shaderImageGatherExtended = true,
+ .shaderStorageImageReadWithoutFormat = true,
+ .shaderStorageImageWriteWithoutFormat = true,
+
+ // Needed for GPU-assisted validation, but not harmful to enable
+ .fragmentStoresAndAtomics = true,
+ .vertexPipelineStoresAndAtomics = true,
+ .shaderInt64 = true,
+ }
+};
+
+// Required features
+static const VkPhysicalDeviceVulkan12Features required_vk12 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
+ .hostQueryReset = true,
+ .timelineSemaphore = true,
+};
+
+static const VkPhysicalDeviceVulkan11Features required_vk11 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
+ .pNext = (void *) &required_vk12,
+};
+
+const VkPhysicalDeviceFeatures2 pl_vulkan_required_features = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+ .pNext = (void *) &required_vk11,
+};
+
+static bool check_required_features(struct vk_ctx *vk)
+{
+ #define CHECK_FEATURE(maj, min, feat) do { \
+ const VkPhysicalDeviceVulkan##maj##min##Features *f; \
+ f = vk_find_struct(&vk->features, \
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_##maj##_##min##_FEATURES); \
+ if (!f || !f->feat) { \
+ PL_ERR(vk, "Missing device feature: " #feat); \
+ return false; \
+ } \
+ } while (0)
+
+ CHECK_FEATURE(1, 2, hostQueryReset);
+ CHECK_FEATURE(1, 2, timelineSemaphore);
+
+ #undef CHECK_FEATURE
+ return true;
+}
+
+
+// List of mandatory device-level functions
+//
+// Note: Also includes VK_EXT_debug_utils functions, even though they aren't
+// mandatory, simply because we load that extension in a special way.
+static const struct vk_fun vk_dev_funs[] = {
+ PL_VK_DEV_FUN(AllocateCommandBuffers),
+ PL_VK_DEV_FUN(AllocateDescriptorSets),
+ PL_VK_DEV_FUN(AllocateMemory),
+ PL_VK_DEV_FUN(BeginCommandBuffer),
+ PL_VK_DEV_FUN(BindBufferMemory),
+ PL_VK_DEV_FUN(BindImageMemory),
+ PL_VK_DEV_FUN(CmdBeginDebugUtilsLabelEXT),
+ PL_VK_DEV_FUN(CmdBeginRenderPass),
+ PL_VK_DEV_FUN(CmdBindDescriptorSets),
+ PL_VK_DEV_FUN(CmdBindIndexBuffer),
+ PL_VK_DEV_FUN(CmdBindPipeline),
+ PL_VK_DEV_FUN(CmdBindVertexBuffers),
+ PL_VK_DEV_FUN(CmdBlitImage),
+ PL_VK_DEV_FUN(CmdClearColorImage),
+ PL_VK_DEV_FUN(CmdCopyBuffer),
+ PL_VK_DEV_FUN(CmdCopyBufferToImage),
+ PL_VK_DEV_FUN(CmdCopyImage),
+ PL_VK_DEV_FUN(CmdCopyImageToBuffer),
+ PL_VK_DEV_FUN(CmdDispatch),
+ PL_VK_DEV_FUN(CmdDraw),
+ PL_VK_DEV_FUN(CmdDrawIndexed),
+ PL_VK_DEV_FUN(CmdEndDebugUtilsLabelEXT),
+ PL_VK_DEV_FUN(CmdEndRenderPass),
+ PL_VK_DEV_FUN(CmdPipelineBarrier),
+ PL_VK_DEV_FUN(CmdPushConstants),
+ PL_VK_DEV_FUN(CmdResetQueryPool),
+ PL_VK_DEV_FUN(CmdSetScissor),
+ PL_VK_DEV_FUN(CmdSetViewport),
+ PL_VK_DEV_FUN(CmdUpdateBuffer),
+ PL_VK_DEV_FUN(CmdWriteTimestamp),
+ PL_VK_DEV_FUN(CreateBuffer),
+ PL_VK_DEV_FUN(CreateBufferView),
+ PL_VK_DEV_FUN(CreateCommandPool),
+ PL_VK_DEV_FUN(CreateComputePipelines),
+ PL_VK_DEV_FUN(CreateDescriptorPool),
+ PL_VK_DEV_FUN(CreateDescriptorSetLayout),
+ PL_VK_DEV_FUN(CreateFence),
+ PL_VK_DEV_FUN(CreateFramebuffer),
+ PL_VK_DEV_FUN(CreateGraphicsPipelines),
+ PL_VK_DEV_FUN(CreateImage),
+ PL_VK_DEV_FUN(CreateImageView),
+ PL_VK_DEV_FUN(CreatePipelineCache),
+ PL_VK_DEV_FUN(CreatePipelineLayout),
+ PL_VK_DEV_FUN(CreateQueryPool),
+ PL_VK_DEV_FUN(CreateRenderPass),
+ PL_VK_DEV_FUN(CreateSampler),
+ PL_VK_DEV_FUN(CreateSemaphore),
+ PL_VK_DEV_FUN(CreateShaderModule),
+ PL_VK_DEV_FUN(DestroyBuffer),
+ PL_VK_DEV_FUN(DestroyBufferView),
+ PL_VK_DEV_FUN(DestroyCommandPool),
+ PL_VK_DEV_FUN(DestroyDescriptorPool),
+ PL_VK_DEV_FUN(DestroyDescriptorSetLayout),
+ PL_VK_DEV_FUN(DestroyDevice),
+ PL_VK_DEV_FUN(DestroyFence),
+ PL_VK_DEV_FUN(DestroyFramebuffer),
+ PL_VK_DEV_FUN(DestroyImage),
+ PL_VK_DEV_FUN(DestroyImageView),
+ PL_VK_DEV_FUN(DestroyInstance),
+ PL_VK_DEV_FUN(DestroyPipeline),
+ PL_VK_DEV_FUN(DestroyPipelineCache),
+ PL_VK_DEV_FUN(DestroyPipelineLayout),
+ PL_VK_DEV_FUN(DestroyQueryPool),
+ PL_VK_DEV_FUN(DestroyRenderPass),
+ PL_VK_DEV_FUN(DestroySampler),
+ PL_VK_DEV_FUN(DestroySemaphore),
+ PL_VK_DEV_FUN(DestroyShaderModule),
+ PL_VK_DEV_FUN(DeviceWaitIdle),
+ PL_VK_DEV_FUN(EndCommandBuffer),
+ PL_VK_DEV_FUN(FlushMappedMemoryRanges),
+ PL_VK_DEV_FUN(FreeCommandBuffers),
+ PL_VK_DEV_FUN(FreeMemory),
+ PL_VK_DEV_FUN(GetBufferMemoryRequirements),
+ PL_VK_DEV_FUN(GetDeviceQueue),
+ PL_VK_DEV_FUN(GetImageMemoryRequirements2),
+ PL_VK_DEV_FUN(GetImageSubresourceLayout),
+ PL_VK_DEV_FUN(GetPipelineCacheData),
+ PL_VK_DEV_FUN(GetQueryPoolResults),
+ PL_VK_DEV_FUN(InvalidateMappedMemoryRanges),
+ PL_VK_DEV_FUN(MapMemory),
+ PL_VK_DEV_FUN(QueueSubmit),
+ PL_VK_DEV_FUN(QueueWaitIdle),
+ PL_VK_DEV_FUN(ResetFences),
+ PL_VK_DEV_FUN(ResetQueryPool),
+ PL_VK_DEV_FUN(SetDebugUtilsObjectNameEXT),
+ PL_VK_DEV_FUN(UpdateDescriptorSets),
+ PL_VK_DEV_FUN(WaitForFences),
+ PL_VK_DEV_FUN(WaitSemaphores),
+};
+
+static void load_vk_fun(struct vk_ctx *vk, const struct vk_fun *fun)
+{
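+ // Locate the destination slot inside `vk` using the offset recorded by
+ // PL_VK_INST_FUN / PL_VK_DEV_FUN, then load it with the matching loader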
+ PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + (ptrdiff_t) fun->offset);
+
+ if (fun->device_level) {
+ *pfn = vk->GetDeviceProcAddr(vk->dev, fun->name);
+ } else {
+ *pfn = vk->GetInstanceProcAddr(vk->inst, fun->name);
+ }
+
+ if (!*pfn) {
+ // Some functions get their extension suffix stripped when promoted
+ // to core. As a very simple work-around to this, try loading the
+ // function a second time with the reserved suffixes stripped.
+ static const char *ext_suffixes[] = { "KHR", "EXT" };
+ pl_str fun_name = pl_str0(fun->name);
+ char buf[64];
+
+ for (int i = 0; i < PL_ARRAY_SIZE(ext_suffixes); i++) {
+ if (!pl_str_eatend0(&fun_name, ext_suffixes[i]))
+ continue;
+
+ pl_assert(sizeof(buf) > fun_name.len);
+ snprintf(buf, sizeof(buf), "%.*s", PL_STR_FMT(fun_name));
+ if (fun->device_level) {
+ *pfn = vk->GetDeviceProcAddr(vk->dev, buf);
+ } else {
+ *pfn = vk->GetInstanceProcAddr(vk->inst, buf);
+ }
+ return;
+ }
+ }
+}
+
+// Private struct for pl_vk_inst
+struct priv {
+ VkDebugUtilsMessengerEXT debug_utils_cb;
+};
+
+void pl_vk_inst_destroy(pl_vk_inst *inst_ptr)
+{
+ pl_vk_inst inst = *inst_ptr;
+ if (!inst)
+ return;
+
+ struct priv *p = PL_PRIV(inst);
+ if (p->debug_utils_cb) {
+ PL_VK_LOAD_FUN(inst->instance, DestroyDebugUtilsMessengerEXT, inst->get_proc_addr);
+ DestroyDebugUtilsMessengerEXT(inst->instance, p->debug_utils_cb, PL_VK_ALLOC);
+ }
+
+ PL_VK_LOAD_FUN(inst->instance, DestroyInstance, inst->get_proc_addr);
+ DestroyInstance(inst->instance, PL_VK_ALLOC);
+ pl_free_ptr((void **) inst_ptr);
+}
+
+static VkBool32 VKAPI_PTR vk_dbg_utils_cb(VkDebugUtilsMessageSeverityFlagBitsEXT sev,
+ VkDebugUtilsMessageTypeFlagsEXT msgType,
+ const VkDebugUtilsMessengerCallbackDataEXT *data,
+ void *priv)
+{
+ pl_log log = priv;
+
+ // Ignore errors for messages that we consider false positives
+ switch (data->messageIdNumber) {
+ case 0x7cd0911d: // VUID-VkSwapchainCreateInfoKHR-imageExtent-01274
+ case 0x8928392f: // UNASSIGNED-BestPractices-NonSuccess-Result
+ case 0xdc18ad6b: // UNASSIGNED-BestPractices-vkAllocateMemory-small-allocation
+ case 0xb3d4346b: // UNASSIGNED-BestPractices-vkBindMemory-small-dedicated-allocation
+ case 0x6cfe18a5: // UNASSIGNED-BestPractices-SemaphoreCount
+ case 0x48a09f6c: // UNASSIGNED-BestPractices-pipeline-stage-flags
+ // profile chain expectations
+ case 0x30f4ac70: // VUID-VkImageCreateInfo-pNext-06811
+ return false;
+
+ case 0x5f379b89: // UNASSIGNED-BestPractices-Error-Result
+ if (strstr(data->pMessage, "VK_ERROR_FORMAT_NOT_SUPPORTED"))
+ return false;
+ break;
+
+ case 0xf6a37cfa: // VUID-vkGetImageSubresourceLayout-format-04461
+ // Work around https://github.com/KhronosGroup/Vulkan-Docs/issues/2109
+ return false;
+ }
+
+ enum pl_log_level lev;
+ switch (sev) {
+ case VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT: lev = PL_LOG_ERR; break;
+ case VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT: lev = PL_LOG_WARN; break;
+ case VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT: lev = PL_LOG_DEBUG; break;
+ case VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT: lev = PL_LOG_TRACE; break;
+ default: lev = PL_LOG_INFO; break;
+ }
+
+ pl_msg(log, lev, "vk %s", data->pMessage);
+
+ for (int i = 0; i < data->queueLabelCount; i++)
+ pl_msg(log, lev, " during %s", data->pQueueLabels[i].pLabelName);
+ for (int i = 0; i < data->cmdBufLabelCount; i++)
+ pl_msg(log, lev, " inside %s", data->pCmdBufLabels[i].pLabelName);
+ for (int i = 0; i < data->objectCount; i++) {
+ const VkDebugUtilsObjectNameInfoEXT *obj = &data->pObjects[i];
+ pl_msg(log, lev, " using %s: %s (0x%llx)",
+ vk_obj_type(obj->objectType),
+ obj->pObjectName ? obj->pObjectName : "anon",
+ (unsigned long long) obj->objectHandle);
+ }
+
+ // The return value of this function determines whether the call will
+ // be explicitly aborted (to prevent GPU errors) or not. In this case,
+ // we generally want this to be on for the validation errors, but nothing
+ // else (e.g. performance warnings)
+ bool is_error = (sev & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) &&
+ (msgType & VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT);
+
+ if (is_error) {
+ pl_log_stack_trace(log, lev);
+ pl_debug_abort();
+ return true;
+ }
+
+ return false;
+}
+
+static PFN_vkGetInstanceProcAddr get_proc_addr_fallback(pl_log log,
+ PFN_vkGetInstanceProcAddr get_proc_addr)
+{
+ if (get_proc_addr)
+ return get_proc_addr;
+
+#ifdef PL_HAVE_VK_PROC_ADDR
+ return vkGetInstanceProcAddr;
+#else
+ pl_fatal(log, "No `vkGetInstanceProcAddr` function provided, and "
+ "libplacebo built without linking against this function!");
+ return NULL;
+#endif
+}
+
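+// Expands a packed Vulkan version into major/minor/patch for "%d.%d.%d"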
+#define PRINTF_VER(ver) \
+ (int) VK_API_VERSION_MAJOR(ver), \
+ (int) VK_API_VERSION_MINOR(ver), \
+ (int) VK_API_VERSION_PATCH(ver)
+
+pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params)
+{
+ void *tmp = pl_tmp(NULL);
+ params = PL_DEF(params, &pl_vk_inst_default_params);
+ VkInstance inst = NULL;
+ pl_clock_t start;
+
+ PL_ARRAY(const char *) exts = {0};
+
+ PFN_vkGetInstanceProcAddr get_addr;
+ if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr)))
+ goto error;
+
+ // Query instance version support
+ uint32_t api_ver = VK_API_VERSION_1_0;
+ PL_VK_LOAD_FUN(NULL, EnumerateInstanceVersion, get_addr);
+ if (EnumerateInstanceVersion && EnumerateInstanceVersion(&api_ver) != VK_SUCCESS)
+ goto error;
+
+ pl_debug(log, "Available instance version: %d.%d.%d", PRINTF_VER(api_ver));
+
+ if (params->max_api_version) {
+ api_ver = PL_MIN(api_ver, params->max_api_version);
+ pl_info(log, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+ PRINTF_VER(params->max_api_version), PRINTF_VER(api_ver));
+ }
+
+ if (api_ver < PL_VK_MIN_VERSION) {
+ pl_fatal(log, "Instance API version %d.%d.%d is lower than the minimum "
+ "required version of %d.%d.%d, cannot proceed!",
+ PRINTF_VER(api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+ goto error;
+ }
+
+ VkInstanceCreateInfo info = {
+ .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+ .pApplicationInfo = &(VkApplicationInfo) {
+ .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+ .apiVersion = api_ver,
+ },
+ };
+
+ // Enumerate all supported layers
+ start = pl_clock_now();
+ PL_VK_LOAD_FUN(NULL, EnumerateInstanceLayerProperties, get_addr);
+ uint32_t num_layers_avail = 0;
+ EnumerateInstanceLayerProperties(&num_layers_avail, NULL);
+ VkLayerProperties *layers_avail = pl_calloc_ptr(tmp, num_layers_avail, layers_avail);
+ EnumerateInstanceLayerProperties(&num_layers_avail, layers_avail);
+ pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance layers");
+
+ pl_debug(log, "Available layers:");
+ for (int i = 0; i < num_layers_avail; i++) {
+ pl_debug(log, " %s (v%d.%d.%d)", layers_avail[i].layerName,
+ PRINTF_VER(layers_avail[i].specVersion));
+ }
+
+ PL_ARRAY(const char *) layers = {0};
+
+ // Sorted by priority
+ static const char *debug_layers[] = {
+ "VK_LAYER_KHRONOS_validation",
+ "VK_LAYER_LUNARG_standard_validation",
+ };
+
+ // This layer has to be initialized first, otherwise all sorts of weirdness
+ // happens (random segfaults, yum)
+ bool debug = params->debug;
+ uint32_t debug_layer = 0; // layer idx of debug layer
+ uint32_t debug_layer_version = 0;
+ if (debug) {
+ for (int i = 0; i < PL_ARRAY_SIZE(debug_layers); i++) {
+ for (int n = 0; n < num_layers_avail; n++) {
+ if (strcmp(debug_layers[i], layers_avail[n].layerName) != 0)
+ continue;
+
+ debug_layer = n;
+ debug_layer_version = layers_avail[n].specVersion;
+ pl_info(log, "Enabling debug meta layer: %s (v%d.%d.%d)",
+ debug_layers[i], PRINTF_VER(debug_layer_version));
+ PL_ARRAY_APPEND(tmp, layers, debug_layers[i]);
+ goto debug_layers_done;
+ }
+ }
+
+ // No layer found..
+ pl_warn(log, "API debugging requested but no debug meta layers present... ignoring");
+ debug = false;
+ }
+
+debug_layers_done: ;
+
+ for (int i = 0; i < params->num_layers; i++)
+ PL_ARRAY_APPEND(tmp, layers, params->layers[i]);
+
+ for (int i = 0; i < params->num_opt_layers; i++) {
+ const char *layer = params->opt_layers[i];
+ for (int n = 0; n < num_layers_avail; n++) {
+ if (strcmp(layer, layers_avail[n].layerName) == 0) {
+ PL_ARRAY_APPEND(tmp, layers, layer);
+ break;
+ }
+ }
+ }
+
+ // Enumerate all supported extensions
+ start = pl_clock_now();
+ PL_VK_LOAD_FUN(NULL, EnumerateInstanceExtensionProperties, get_addr);
+ uint32_t num_exts_avail = 0;
+ EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, NULL);
+ VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+ EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, exts_avail);
+
+ struct {
+ VkExtensionProperties *exts;
+ uint32_t num_exts;
+ } *layer_exts = pl_calloc_ptr(tmp, num_layers_avail, layer_exts);
+
+ // Enumerate extensions from layers
+ for (int i = 0; i < num_layers_avail; i++) {
+ VkExtensionProperties **lexts = &layer_exts[i].exts;
+ uint32_t *num = &layer_exts[i].num_exts;
+
+ EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, NULL);
+ *lexts = pl_calloc_ptr(tmp, *num, *lexts);
+ EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, *lexts);
+
+ // Replace all extensions that are already available globally by {0}
+ for (int j = 0; j < *num; j++) {
+ for (int k = 0; k < num_exts_avail; k++) {
+ if (strcmp((*lexts)[j].extensionName, exts_avail[k].extensionName) == 0)
+ (*lexts)[j] = (VkExtensionProperties) {0};
+ }
+ }
+ }
+
+ pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance extensions");
+ pl_debug(log, "Available instance extensions:");
+ for (int i = 0; i < num_exts_avail; i++)
+ pl_debug(log, " %s", exts_avail[i].extensionName);
+ for (int i = 0; i < num_layers_avail; i++) {
+ for (int j = 0; j < layer_exts[i].num_exts; j++) {
+ if (!layer_exts[i].exts[j].extensionName[0])
+ continue;
+
+ pl_debug(log, " %s (via %s)",
+ layer_exts[i].exts[j].extensionName,
+ layers_avail[i].layerName);
+ }
+ }
+
+ // Add mandatory extensions
+ PL_ARRAY_APPEND(tmp, exts, VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+
+ // Add optional extensions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_instance_extensions); i++) {
+ const char *ext = vk_instance_extensions[i];
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ break;
+ }
+ }
+ }
+
+#ifdef VK_KHR_portability_enumeration
+ // Required for macOS (MoltenVK) compatibility
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME);
+ info.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
+ break;
+ }
+ }
+#endif
+
+ // Add extra user extensions
+ for (int i = 0; i < params->num_extensions; i++) {
+ const char *ext = params->extensions[i];
+ PL_ARRAY_APPEND(tmp, exts, ext);
+
+ // Enable any additional layers that are required for this extension
+ for (int n = 0; n < num_layers_avail; n++) {
+ for (int j = 0; j < layer_exts[n].num_exts; j++) {
+ if (!layer_exts[n].exts[j].extensionName[0])
+ continue;
+ if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName);
+ goto next_user_ext;
+ }
+ }
+ }
+
+next_user_ext: ;
+ }
+
+ // Add extra optional user extensions
+ for (int i = 0; i < params->num_opt_extensions; i++) {
+ const char *ext = params->opt_extensions[i];
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ goto next_opt_user_ext;
+ }
+ }
+
+ for (int n = 0; n < num_layers_avail; n++) {
+ for (int j = 0; j < layer_exts[n].num_exts; j++) {
+ if (!layer_exts[n].exts[j].extensionName[0])
+ continue;
+ if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName);
+ goto next_opt_user_ext;
+ }
+ }
+ }
+
+next_opt_user_ext: ;
+ }
+
+ // If debugging is enabled, load the necessary debug utils extension
+ if (debug) {
+ const char * const ext = VK_EXT_DEBUG_UTILS_EXTENSION_NAME;
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ goto debug_ext_done;
+ }
+ }
+
+ for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) {
+ if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ goto debug_ext_done;
+ }
+ }
+
+ // No extension found
+ pl_warn(log, "API debug layers enabled but no debug report extension "
+ "found... ignoring. Debug messages may be spilling to "
+ "stdout/stderr!");
+ debug = false;
+ }
+
+debug_ext_done: ;
+
+ // Limit this to 1.3.259+ because of bugs in older versions.
+ if (debug && params->debug_extra &&
+ debug_layer_version >= VK_MAKE_API_VERSION(0, 1, 3, 259))
+ {
+ // Try enabling as many validation features as possible
+ static const VkValidationFeatureEnableEXT validation_features[] = {
+ VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT,
+ VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT,
+ VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT,
+ VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT,
+ };
+
+ static const VkValidationFeaturesEXT vinfo = {
+ .sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT,
+ .pEnabledValidationFeatures = validation_features,
+ .enabledValidationFeatureCount = PL_ARRAY_SIZE(validation_features),
+ };
+
+ const char * const ext = VK_EXT_VALIDATION_FEATURES_EXTENSION_NAME;
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ vk_link_struct(&info, &vinfo);
+ goto debug_extra_ext_done;
+ }
+ }
+
+ for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) {
+ if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ vk_link_struct(&info, &vinfo);
+ goto debug_extra_ext_done;
+ }
+ }
+
+ pl_warn(log, "GPU-assisted validation enabled but not supported by "
+ "instance, disabling...");
+ }
+
+debug_extra_ext_done: ;
+
+ info.ppEnabledExtensionNames = exts.elem;
+ info.enabledExtensionCount = exts.num;
+ info.ppEnabledLayerNames = layers.elem;
+ info.enabledLayerCount = layers.num;
+
+ pl_info(log, "Creating vulkan instance%s", exts.num ? " with extensions:" : "");
+ for (int i = 0; i < exts.num; i++)
+ pl_info(log, " %s", exts.elem[i]);
+
+ if (layers.num) {
+ pl_info(log, " and layers:");
+ for (int i = 0; i < layers.num; i++)
+ pl_info(log, " %s", layers.elem[i]);
+ }
+
+ start = pl_clock_now();
+ PL_VK_LOAD_FUN(NULL, CreateInstance, get_addr);
+ VkResult res = CreateInstance(&info, PL_VK_ALLOC, &inst);
+ pl_log_cpu_time(log, start, pl_clock_now(), "creating vulkan instance");
+ if (res != VK_SUCCESS) {
+ pl_fatal(log, "Failed creating instance: %s", vk_res_str(res));
+ goto error;
+ }
+
+ struct pl_vk_inst_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct priv);
+ struct priv *p = PL_PRIV(pl_vk);
+ *pl_vk = (struct pl_vk_inst_t) {
+ .instance = inst,
+ .api_version = api_ver,
+ .get_proc_addr = get_addr,
+ .extensions = pl_steal(pl_vk, exts.elem),
+ .num_extensions = exts.num,
+ .layers = pl_steal(pl_vk, layers.elem),
+ .num_layers = layers.num,
+ };
+
+ // Set up a debug callback to catch validation messages
+ if (debug) {
+ VkDebugUtilsMessengerCreateInfoEXT dinfo = {
+ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT,
+ .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT,
+ .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT,
+ .pfnUserCallback = vk_dbg_utils_cb,
+ .pUserData = (void *) log,
+ };
+
+ PL_VK_LOAD_FUN(inst, CreateDebugUtilsMessengerEXT, get_addr);
+ CreateDebugUtilsMessengerEXT(inst, &dinfo, PL_VK_ALLOC, &p->debug_utils_cb);
+ }
+
+ pl_free(tmp);
+ return pl_vk;
+
+error:
+ pl_fatal(log, "Failed initializing vulkan instance");
+ if (inst) {
+ PL_VK_LOAD_FUN(inst, DestroyInstance, get_addr);
+ DestroyInstance(inst, PL_VK_ALLOC);
+ }
+ pl_free(tmp);
+ return NULL;
+}
+
+const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS };
+
+void pl_vulkan_destroy(pl_vulkan *pl_vk)
+{
+ if (!*pl_vk)
+ return;
+
+ struct vk_ctx *vk = PL_PRIV(*pl_vk);
+ if (vk->dev) {
+ if ((*pl_vk)->gpu) {
+ PL_DEBUG(vk, "Waiting for remaining commands...");
+ pl_gpu_finish((*pl_vk)->gpu);
+ pl_assert(vk->cmds_pending.num == 0);
+
+ pl_gpu_destroy((*pl_vk)->gpu);
+ }
+ vk_malloc_destroy(&vk->ma);
+ for (int i = 0; i < vk->pools.num; i++)
+ vk_cmdpool_destroy(vk->pools.elem[i]);
+
+ if (!vk->imported)
+ vk->DestroyDevice(vk->dev, PL_VK_ALLOC);
+ }
+
+ for (int i = 0; i < vk->queue_locks.num; i++) {
+ for (int n = 0; n < vk->queue_locks.elem[i].num; n++)
+ pl_mutex_destroy(&vk->queue_locks.elem[i].elem[n]);
+ }
+
+ pl_vk_inst_destroy(&vk->internal_instance);
+ pl_mutex_destroy(&vk->lock);
+ pl_free_ptr((void **) pl_vk);
+}
+
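+// Returns true if any queue family of `physd` can present to `surf`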
+static bool supports_surf(pl_log log, VkInstance inst,
+ PFN_vkGetInstanceProcAddr get_addr,
+ VkPhysicalDevice physd, VkSurfaceKHR surf)
+{
+ // Hack for the VK macro's logging to work
+ struct { pl_log log; } *vk = (void *) &log;
+
+ PL_VK_LOAD_FUN(inst, GetPhysicalDeviceQueueFamilyProperties, get_addr);
+ PL_VK_LOAD_FUN(inst, GetPhysicalDeviceSurfaceSupportKHR, get_addr);
+ uint32_t qfnum = 0;
+ GetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL);
+
+ for (int i = 0; i < qfnum; i++) {
+ VkBool32 sup = false;
+ VK(GetPhysicalDeviceSurfaceSupportKHR(physd, i, surf, &sup));
+ if (sup)
+ return true;
+ }
+
+error:
+ return false;
+}
+
+VkPhysicalDevice pl_vulkan_choose_device(pl_log log,
+ const struct pl_vulkan_device_params *params)
+{
+ // Hack for the VK macro's logging to work
+ struct { pl_log log; } *vk = (void *) &log;
+ PL_INFO(vk, "Probing for vulkan devices:");
+
+ pl_assert(params->instance);
+ VkInstance inst = params->instance;
+ VkPhysicalDevice dev = VK_NULL_HANDLE;
+
+ PFN_vkGetInstanceProcAddr get_addr;
+ if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr)))
+ return NULL;
+
+ PL_VK_LOAD_FUN(inst, EnumeratePhysicalDevices, get_addr);
+ PL_VK_LOAD_FUN(inst, GetPhysicalDeviceProperties2, get_addr);
+ pl_assert(GetPhysicalDeviceProperties2);
+
+ pl_clock_t start = pl_clock_now();
+ VkPhysicalDevice *devices = NULL;
+ uint32_t num = 0;
+ VK(EnumeratePhysicalDevices(inst, &num, NULL));
+ devices = pl_calloc_ptr(NULL, num, devices);
+ VK(EnumeratePhysicalDevices(inst, &num, devices));
+ pl_log_cpu_time(log, start, pl_clock_now(), "enumerating physical devices");
+
+ static const struct { const char *name; int priority; } types[] = {
+ [VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU] = {"discrete", 5},
+ [VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU] = {"integrated", 4},
+ [VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU] = {"virtual", 3},
+ [VK_PHYSICAL_DEVICE_TYPE_CPU] = {"software", 2},
+ [VK_PHYSICAL_DEVICE_TYPE_OTHER] = {"other", 1},
+ };
+
+ static const uint8_t nil[VK_UUID_SIZE] = {0};
+ bool uuid_set = memcmp(params->device_uuid, nil, VK_UUID_SIZE) != 0;
+
+ int best = -1;
+ for (int i = 0; i < num; i++) {
+ VkPhysicalDeviceIDPropertiesKHR id_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+ };
+
+ VkPhysicalDeviceProperties2 prop = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+ .pNext = &id_props,
+ };
+
+ GetPhysicalDeviceProperties2(devices[i], &prop);
+ VkPhysicalDeviceType t = prop.properties.deviceType;
+ const char *dtype = t < PL_ARRAY_SIZE(types) ? types[t].name : "unknown?";
+ PL_INFO(vk, " GPU %d: %s v%d.%d.%d (%s)", i, prop.properties.deviceName,
+ PRINTF_VER(prop.properties.apiVersion), dtype);
+ PL_INFO(vk, " uuid: %s", PRINT_UUID(id_props.deviceUUID));
+
+ if (params->surface) {
+ if (!supports_surf(log, inst, get_addr, devices[i], params->surface)) {
+ PL_DEBUG(vk, " -> excluding due to lack of surface support");
+ continue;
+ }
+ }
+
+ if (uuid_set) {
+ if (memcmp(id_props.deviceUUID, params->device_uuid, VK_UUID_SIZE) == 0) {
+ dev = devices[i];
+ continue;
+ } else {
+ PL_DEBUG(vk, " -> excluding due to UUID mismatch");
+ continue;
+ }
+ } else if (params->device_name && params->device_name[0] != '\0') {
+ if (strcmp(params->device_name, prop.properties.deviceName) == 0) {
+ dev = devices[i];
+ continue;
+ } else {
+ PL_DEBUG(vk, " -> excluding due to name mismatch");
+ continue;
+ }
+ }
+
+ if (!params->allow_software && t == VK_PHYSICAL_DEVICE_TYPE_CPU) {
+ PL_DEBUG(vk, " -> excluding due to !params->allow_software");
+ continue;
+ }
+
+ if (prop.properties.apiVersion < PL_VK_MIN_VERSION) {
+ PL_DEBUG(vk, " -> excluding due to too low API version");
+ continue;
+ }
+
+ int priority = t < PL_ARRAY_SIZE(types) ? types[t].priority : 0;
+ if (priority > best) {
+ dev = devices[i];
+ best = priority;
+ }
+ }
+
+error:
+ pl_free(devices);
+ return dev;
+}
+
+static void lock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = priv;
+ pl_mutex_lock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void unlock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = priv;
+ pl_mutex_unlock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
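+// Create one mutex per queue of every queue family, used by the default
+// lock_queue/unlock_queue callbacks to serialize access to each VkQueue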
+static void init_queue_locks(struct vk_ctx *vk, uint32_t qfnum,
+ const VkQueueFamilyProperties *qfs)
+{
+ vk->queue_locks.elem = pl_calloc_ptr(vk->alloc, qfnum, vk->queue_locks.elem);
+ vk->queue_locks.num = qfnum;
+ for (int i = 0; i < qfnum; i++) {
+ const uint32_t qnum = qfs[i].queueCount;
+ vk->queue_locks.elem[i].elem = pl_calloc(vk->alloc, qnum, sizeof(pl_mutex));
+ vk->queue_locks.elem[i].num = qnum;
+ for (int n = 0; n < qnum; n++)
+ pl_mutex_init(&vk->queue_locks.elem[i].elem[n]);
+ }
+
+ vk->lock_queue = lock_queue_internal;
+ vk->unlock_queue = unlock_queue_internal;
+ vk->queue_ctx = vk;
+}
+
+// Find the most specialized queue family supporting a combination of flags. In cases
+// where there are multiple queue families at the same specialization level,
+// this finds the one with the most queues. Returns -1 if no queue was found.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
+{
+ int idx = -1;
+ for (int i = 0; i < qfnum; i++) {
+ if ((qfs[i].queueFlags & flags) != flags)
+ continue;
+
+ // QF is more specialized. Since we don't care about other bits like
+ // SPARSE_BIT, mask the ones we're interested in
+ const VkQueueFlags mask = VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_TRANSFER_BIT |
+ VK_QUEUE_COMPUTE_BIT;
+
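+ // A numerically smaller masked value means fewer extra capabilities,
+ // i.e. a more specialized queue family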
+ if (idx < 0 || (qfs[i].queueFlags & mask) < (qfs[idx].queueFlags & mask))
+ idx = i;
+
+ // QF has more queues (at the same specialization level)
+ if (qfs[i].queueFlags == qfs[idx].queueFlags &&
+ qfs[i].queueCount > qfs[idx].queueCount)
+ idx = i;
+ }
+
+ return idx;
+}
+
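+// Picks suitable queue families, creates the logical VkDevice with all
+// required/recommended features and extensions, and sets up command pools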
+static bool device_init(struct vk_ctx *vk, const struct pl_vulkan_params *params)
+{
+ pl_assert(vk->physd);
+ void *tmp = pl_tmp(NULL);
+
+ // Enumerate the queue families and find suitable families for each task
+ uint32_t qfnum = 0;
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+ VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+ init_queue_locks(vk, qfnum, qfs);
+
+ PL_DEBUG(vk, "Queue families supported by device:");
+ for (int i = 0; i < qfnum; i++) {
+ PL_DEBUG(vk, " %d: flags 0x%"PRIx32" num %"PRIu32, i,
+ qfs[i].queueFlags, qfs[i].queueCount);
+ }
+
+ VkQueueFlagBits gfx_flags = VK_QUEUE_GRAPHICS_BIT;
+ if (!params->async_compute)
+ gfx_flags |= VK_QUEUE_COMPUTE_BIT;
+
+ int idx_gfx = find_qf(qfs, qfnum, gfx_flags);
+ int idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT);
+ int idx_tf = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
+ if (idx_tf < 0)
+ idx_tf = idx_comp;
+
+ if (!params->async_compute)
+ idx_comp = idx_gfx;
+ if (!params->async_transfer)
+ idx_tf = idx_gfx;
+
+ PL_DEBUG(vk, "Using graphics queue %d", idx_gfx);
+ if (idx_tf != idx_gfx)
+ PL_INFO(vk, "Using async transfer (queue %d)", idx_tf);
+ if (idx_comp != idx_gfx)
+ PL_INFO(vk, "Using async compute (queue %d)", idx_comp);
+
+ // Vulkan requires at least one GRAPHICS+COMPUTE queue, so if this fails
+ // something is horribly wrong.
+ pl_assert(idx_gfx >= 0 && idx_comp >= 0 && idx_tf >= 0);
+
+ // If needed, ensure we can actually present to the surface using this queue
+ if (params->surface) {
+ VkBool32 sup = false;
+ VK(vk->GetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx,
+ params->surface, &sup));
+ if (!sup) {
+ PL_FATAL(vk, "Queue family does not support surface presentation!");
+ goto error;
+ }
+ }
+
+ // Enumerate all supported extensions
+ pl_clock_t start = pl_clock_now();
+ uint32_t num_exts_avail = 0;
+ VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, NULL));
+ VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+ VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, exts_avail));
+ pl_log_cpu_time(vk->log, start, pl_clock_now(), "enumerating device extensions");
+
+ PL_DEBUG(vk, "Available device extensions:");
+ for (int i = 0; i < num_exts_avail; i++)
+ PL_DEBUG(vk, " %s", exts_avail[i].extensionName);
+
+ // Add all extensions we need
+ if (params->surface)
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+
+ // Keep track of all optional function pointers associated with extensions
+ PL_ARRAY(const struct vk_fun *) ext_funs = {0};
+
+ // Add all optional device-level extensions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+ const struct vk_ext *ext = &vk_device_extensions[i];
+ uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+ if (core_ver && vk->api_ver >= core_ver) {
+ // Extension is already implicitly enabled by the API version
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ PL_ARRAY_APPEND(tmp, ext_funs, f);
+ continue;
+ }
+
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext->name, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, ext->name);
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ PL_ARRAY_APPEND(tmp, ext_funs, f);
+ break;
+ }
+ }
+ }
+
+ // Add extra user extensions
+ for (int i = 0; i < params->num_extensions; i++)
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, params->extensions[i]);
+
+ // Add optional extra user extensions
+ for (int i = 0; i < params->num_opt_extensions; i++) {
+ const char *ext = params->opt_extensions[i];
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, ext);
+ break;
+ }
+ }
+ }
+
+ VkPhysicalDeviceFeatures2 features = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR
+ };
+
+ vk_features_normalize(tmp, &pl_vulkan_required_features, vk->api_ver, &features);
+ vk_features_normalize(tmp, &pl_vulkan_recommended_features, vk->api_ver, &features);
+ vk_features_normalize(tmp, params->features, vk->api_ver, &features);
+
+ // Explicitly clear the features struct before querying feature support
+ // from the driver. This way, we don't mistakenly mark as supported
+ // features coming from structs the driver doesn't have support for.
+ VkPhysicalDeviceFeatures2 *features_sup = vk_chain_memdup(tmp, &features);
+ for (VkBaseOutStructure *out = (void *) features_sup; out; out = out->pNext) {
+ const size_t size = vk_struct_size(out->sType);
+ memset(&out[1], 0, size - sizeof(out[0]));
+ }
+
+ vk->GetPhysicalDeviceFeatures2KHR(vk->physd, features_sup);
+
+ // Filter out unsupported features
+ for (VkBaseOutStructure *f = (VkBaseOutStructure *) &features; f; f = f->pNext) {
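+ // The feature flags (VkBool32) start immediately after the sType/pNext
+ // header in each feature struct, so they can be masked element-wise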
+ const VkBaseInStructure *sup = vk_find_struct(features_sup, f->sType);
+ VkBool32 *flags = (VkBool32 *) &f[1];
+ const VkBool32 *flags_sup = (const VkBool32 *) &sup[1];
+ const size_t size = vk_struct_size(f->sType) - sizeof(VkBaseOutStructure);
+ for (int i = 0; i < size / sizeof(VkBool32); i++)
+ flags[i] &= flags_sup[i];
+ }
+
+ // Construct normalized output chain
+ vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+ vk_features_normalize(vk->alloc, &features, 0, &vk->features);
+ if (!check_required_features(vk)) {
+ PL_FATAL(vk, "Vulkan device does not support all required features!");
+ goto error;
+ }
+
+ // Enable all queues at device creation time, to maximize compatibility
+ // with other API users (e.g. FFmpeg)
+ PL_ARRAY(VkDeviceQueueCreateInfo) qinfos = {0};
+ for (int i = 0; i < qfnum; i++) {
+ bool use_qf = i == idx_gfx || i == idx_comp || i == idx_tf;
+ use_qf |= qfs[i].queueFlags & params->extra_queues;
+ if (!use_qf)
+ continue;
+ PL_ARRAY_APPEND(tmp, qinfos, (VkDeviceQueueCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+ .queueFamilyIndex = i,
+ .queueCount = qfs[i].queueCount,
+ .pQueuePriorities = pl_calloc(tmp, qfs[i].queueCount, sizeof(float)),
+ });
+ }
+
+ VkDeviceCreateInfo dinfo = {
+ .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+ .pNext = &features,
+ .pQueueCreateInfos = qinfos.elem,
+ .queueCreateInfoCount = qinfos.num,
+ .ppEnabledExtensionNames = vk->exts.elem,
+ .enabledExtensionCount = vk->exts.num,
+ };
+
+ PL_INFO(vk, "Creating vulkan device%s", vk->exts.num ? " with extensions:" : "");
+ for (int i = 0; i < vk->exts.num; i++)
+ PL_INFO(vk, " %s", vk->exts.elem[i]);
+
+ start = pl_clock_now();
+ VK(vk->CreateDevice(vk->physd, &dinfo, PL_VK_ALLOC, &vk->dev));
+ pl_log_cpu_time(vk->log, start, pl_clock_now(), "creating vulkan device");
+
+ // Load all mandatory device-level functions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++)
+ load_vk_fun(vk, &vk_dev_funs[i]);
+
+ // Load all of the optional functions from the extensions we enabled
+ for (int i = 0; i < ext_funs.num; i++)
+ load_vk_fun(vk, ext_funs.elem[i]);
+
+ // Create the command pools for the queues we care about
+ const uint32_t qmax = PL_DEF(params->queue_count, UINT32_MAX);
+ for (int i = 0; i < qfnum; i++) {
+ if (i != idx_gfx && i != idx_tf && i != idx_comp)
+ continue; // ignore QFs not used internally
+
+ int qnum = qfs[i].queueCount;
+ if (qmax < qnum) {
+ PL_DEBUG(vk, "Restricting QF %d from %d queues to %d", i, qnum, qmax);
+ qnum = qmax;
+ }
+
+ struct vk_cmdpool *pool = vk_cmdpool_create(vk, i, qnum, qfs[i]);
+ if (!pool)
+ goto error;
+ PL_ARRAY_APPEND(vk->alloc, vk->pools, pool);
+
+ // Update the pool_* pointers based on the corresponding index
+ const char *qf_name = NULL;
+ if (i == idx_tf) {
+ vk->pool_transfer = pool;
+ qf_name = "transfer";
+ }
+ if (i == idx_comp) {
+ vk->pool_compute = pool;
+ qf_name = "compute";
+ }
+ if (i == idx_gfx) {
+ vk->pool_graphics = pool;
+ qf_name = "graphics";
+ }
+
+ for (int n = 0; n < pool->num_queues; n++)
+ PL_VK_NAME_HANDLE(QUEUE, pool->queues[n], qf_name);
+ }
+
+ pl_free(tmp);
+ return true;
+
+error:
+ PL_FATAL(vk, "Failed creating logical device!");
+ pl_free(tmp);
+ vk->failed = true;
+ return false;
+}
+
+static void lock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ vk->lock_queue(vk->queue_ctx, qf, qidx);
+}
+
+static void unlock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ vk->unlock_queue(vk->queue_ctx, qf, qidx);
+}
+
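+// Shared tail of pl_vulkan_create/pl_vulkan_import: creates the VRAM
+// allocator and pl_gpu, then exposes the resulting queues and device info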
+static bool finalize_context(struct pl_vulkan_t *pl_vk, int max_glsl_version)
+{
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+
+ pl_assert(vk->pool_graphics);
+ pl_assert(vk->pool_compute);
+ pl_assert(vk->pool_transfer);
+
+ vk->ma = vk_malloc_create(vk);
+ if (!vk->ma)
+ return false;
+
+ pl_vk->gpu = pl_gpu_create_vk(vk);
+ if (!pl_vk->gpu)
+ return false;
+
+ // Blacklist / restrict features
+ if (max_glsl_version) {
+ struct pl_glsl_version *glsl = (struct pl_glsl_version *) &pl_vk->gpu->glsl;
+ glsl->version = PL_MIN(glsl->version, max_glsl_version);
+ glsl->version = PL_MAX(glsl->version, 140); // required for GL_KHR_vulkan_glsl
+ PL_INFO(vk, "Restricting GLSL version to %d... new version is %d",
+ max_glsl_version, glsl->version);
+ }
+
+ // Expose the resulting vulkan objects
+ pl_vk->instance = vk->inst;
+ pl_vk->phys_device = vk->physd;
+ pl_vk->device = vk->dev;
+ pl_vk->get_proc_addr = vk->GetInstanceProcAddr;
+ pl_vk->api_version = vk->api_ver;
+ pl_vk->extensions = vk->exts.elem;
+ pl_vk->num_extensions = vk->exts.num;
+ pl_vk->features = &vk->features;
+ pl_vk->num_queues = vk->pools.num;
+ pl_vk->queues = pl_calloc_ptr(vk->alloc, vk->pools.num, pl_vk->queues);
+ pl_vk->lock_queue = lock_queue;
+ pl_vk->unlock_queue = unlock_queue;
+
+ for (int i = 0; i < vk->pools.num; i++) {
+ struct pl_vulkan_queue *queues = (struct pl_vulkan_queue *) pl_vk->queues;
+ queues[i] = (struct pl_vulkan_queue) {
+ .index = vk->pools.elem[i]->qf,
+ .count = vk->pools.elem[i]->num_queues,
+ };
+
+ if (vk->pools.elem[i] == vk->pool_graphics)
+ pl_vk->queue_graphics = queues[i];
+ if (vk->pools.elem[i] == vk->pool_compute)
+ pl_vk->queue_compute = queues[i];
+ if (vk->pools.elem[i] == vk->pool_transfer)
+ pl_vk->queue_transfer = queues[i];
+ }
+
+ pl_assert(vk->lock_queue);
+ pl_assert(vk->unlock_queue);
+ return true;
+}
+
+pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params)
+{
+ params = PL_DEF(params, &pl_vulkan_default_params);
+ struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx);
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ *vk = (struct vk_ctx) {
+ .vulkan = pl_vk,
+ .alloc = pl_vk,
+ .log = log,
+ .inst = params->instance,
+ .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr),
+ };
+
+ pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE);
+ if (!vk->GetInstanceProcAddr)
+ goto error;
+
+ if (!vk->inst) {
+ pl_assert(!params->surface);
+ pl_assert(!params->device);
+ PL_DEBUG(vk, "No VkInstance provided, creating one...");
+
+ // Mirror the instance params here to set `get_proc_addr` correctly
+ struct pl_vk_inst_params iparams;
+ iparams = *PL_DEF(params->instance_params, &pl_vk_inst_default_params);
+ iparams.get_proc_addr = params->get_proc_addr;
+ vk->internal_instance = pl_vk_inst_create(log, &iparams);
+ if (!vk->internal_instance)
+ goto error;
+ vk->inst = vk->internal_instance->instance;
+ }
+
+ // Directly load all mandatory instance-level function pointers, since
+ // these will be required for all further device creation logic
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++)
+ load_vk_fun(vk, &vk_inst_funs[i]);
+
+ // Choose the physical device
+ if (params->device) {
+ PL_DEBUG(vk, "Using specified VkPhysicalDevice");
+ vk->physd = params->device;
+ } else {
+ struct pl_vulkan_device_params dparams = {
+ .instance = vk->inst,
+ .get_proc_addr = params->get_proc_addr,
+ .surface = params->surface,
+ .device_name = params->device_name,
+ .allow_software = params->allow_software,
+ };
+ memcpy(dparams.device_uuid, params->device_uuid, VK_UUID_SIZE);
+
+ vk->physd = pl_vulkan_choose_device(log, &dparams);
+ if (!vk->physd) {
+ PL_FATAL(vk, "Found no suitable device, giving up.");
+ goto error;
+ }
+ }
+
+ VkPhysicalDeviceIDPropertiesKHR id_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+ };
+
+ VkPhysicalDeviceProperties2KHR prop = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+ .pNext = &id_props,
+ };
+
+ vk->GetPhysicalDeviceProperties2(vk->physd, &prop);
+ vk->props = prop.properties;
+
+ PL_INFO(vk, "Vulkan device properties:");
+ PL_INFO(vk, " Device Name: %s", prop.properties.deviceName);
+ PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID,
+ prop.properties.deviceID);
+ PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID));
+ PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion);
+ PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion));
+
+ // Needed by device_init
+ vk->api_ver = prop.properties.apiVersion;
+ if (params->max_api_version) {
+ vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version);
+ PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+ PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver));
+ }
+
+ if (vk->api_ver < PL_VK_MIN_VERSION) {
+ PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum "
+ "required version of %d.%d.%d, cannot proceed!",
+ PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+ goto error;
+ }
+
+ // Finally, initialize the logical device and the rest of the vk_ctx
+ if (!device_init(vk, params))
+ goto error;
+
+ if (!finalize_context(pl_vk, params->max_glsl_version))
+ goto error;
+
+ return pl_vk;
+
+error:
+ PL_FATAL(vk, "Failed initializing vulkan device");
+ pl_vulkan_destroy((pl_vulkan *) &pl_vk);
+ return NULL;
+}
+
+pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params)
+{
+ void *tmp = pl_tmp(NULL);
+
+ struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx);
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ *vk = (struct vk_ctx) {
+ .vulkan = pl_vk,
+ .alloc = pl_vk,
+ .log = log,
+ .imported = true,
+ .inst = params->instance,
+ .physd = params->phys_device,
+ .dev = params->device,
+ .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr),
+ .lock_queue = params->lock_queue,
+ .unlock_queue = params->unlock_queue,
+ .queue_ctx = params->queue_ctx,
+ };
+
+ pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE);
+ if (!vk->GetInstanceProcAddr)
+ goto error;
+
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++)
+ load_vk_fun(vk, &vk_inst_funs[i]);
+
+ VkPhysicalDeviceIDPropertiesKHR id_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+ };
+
+ VkPhysicalDeviceProperties2KHR prop = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+ .pNext = &id_props,
+ };
+
+ pl_assert(vk->GetPhysicalDeviceProperties2);
+ vk->GetPhysicalDeviceProperties2(vk->physd, &prop);
+ vk->props = prop.properties;
+
+ PL_INFO(vk, "Imported vulkan device properties:");
+ PL_INFO(vk, " Device Name: %s", prop.properties.deviceName);
+ PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID,
+ prop.properties.deviceID);
+ PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID));
+ PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion);
+ PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion));
+
+ vk->api_ver = prop.properties.apiVersion;
+ if (params->max_api_version) {
+ vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version);
+ PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+ PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver));
+ }
+
+ if (vk->api_ver < PL_VK_MIN_VERSION) {
+ PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum "
+ "required version of %d.%d.%d, cannot proceed!",
+ PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+ goto error;
+ }
+
+ vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+ vk_features_normalize(vk->alloc, params->features, 0, &vk->features);
+ if (!check_required_features(vk)) {
+ PL_FATAL(vk, "Imported Vulkan device was not created with all required "
+ "features!");
+ goto error;
+ }
+
+ // Load all mandatory device-level functions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++)
+ load_vk_fun(vk, &vk_dev_funs[i]);
+
+ // Load all of the optional functions from the extensions enabled
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+ const struct vk_ext *ext = &vk_device_extensions[i];
+ uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+ if (core_ver && vk->api_ver >= core_ver) {
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ load_vk_fun(vk, f);
+ continue;
+ }
+ for (int n = 0; n < params->num_extensions; n++) {
+ if (strcmp(ext->name, params->extensions[n]) == 0) {
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ load_vk_fun(vk, f);
+ break;
+ }
+ }
+ }
+
+ uint32_t qfnum = 0;
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+ VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+ if (!params->lock_queue)
+ init_queue_locks(vk, qfnum, qfs);
+
+ // Create the command pools for each unique qf that exists
+ struct {
+ const struct pl_vulkan_queue *info;
+ struct vk_cmdpool **pool;
+ VkQueueFlagBits flags; // *any* of these flags provide the cap
+ } qinfos[] = {
+ {
+ .info = &params->queue_graphics,
+ .pool = &vk->pool_graphics,
+ .flags = VK_QUEUE_GRAPHICS_BIT,
+ }, {
+ .info = &params->queue_compute,
+ .pool = &vk->pool_compute,
+ .flags = VK_QUEUE_COMPUTE_BIT,
+ }, {
+ .info = &params->queue_transfer,
+ .pool = &vk->pool_transfer,
+ .flags = VK_QUEUE_TRANSFER_BIT |
+ VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT,
+ }
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(qinfos); i++) {
+ int qf = qinfos[i].info->index;
+ struct vk_cmdpool **pool = qinfos[i].pool;
+ if (!qinfos[i].info->count)
+ continue;
+
+ // API sanity check
+ pl_assert(qfs[qf].queueFlags & qinfos[i].flags);
+
+ // See if we already created a pool for this queue family
+ for (int j = 0; j < i; j++) {
+ if (qinfos[j].info->count && qinfos[j].info->index == qf) {
+ *pool = *qinfos[j].pool;
+ goto next_qf;
+ }
+ }
+
+ *pool = vk_cmdpool_create(vk, qf, qinfos[i].info->count, qfs[qf]);
+ if (!*pool)
+ goto error;
+ PL_ARRAY_APPEND(vk->alloc, vk->pools, *pool);
+
+ // Pre-emptively set "lower priority" pools as well
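+        // (e.g. the transfer pool falls back to the graphics pool here, and
+        // is only replaced if a separate transfer queue family is provided)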
+ for (int j = i+1; j < PL_ARRAY_SIZE(qinfos); j++) {
+ if (qfs[qf].queueFlags & qinfos[j].flags)
+ *qinfos[j].pool = *pool;
+ }
+
+next_qf: ;
+ }
+
+ if (!vk->pool_graphics) {
+ PL_ERR(vk, "No valid queues provided?");
+ goto error;
+ }
+
+ if (!finalize_context(pl_vk, params->max_glsl_version))
+ goto error;
+
+ pl_free(tmp);
+ return pl_vk;
+
+error:
+ PL_FATAL(vk, "Failed importing vulkan device");
+ pl_vulkan_destroy((pl_vulkan *) &pl_vk);
+ pl_free(tmp);
+ return NULL;
+}
diff --git a/src/vulkan/formats.c b/src/vulkan/formats.c
new file mode 100644
index 0000000..f0eb0fb
--- /dev/null
+++ b/src/vulkan/formats.c
@@ -0,0 +1,616 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "formats.h"
+
+#define FMT(_name, num, size, ftype, bits, idx) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_##ftype, \
+ .num_components = num, \
+ .component_depth = bits, \
+ .internal_size = size, \
+ .opaque = false, \
+ .texel_size = size, \
+ .texel_align = size, \
+ .host_bits = bits, \
+ .sample_order = idx, \
+ }
+
+#define IDX(...) {__VA_ARGS__}
+#define BITS(...) {__VA_ARGS__}
+
+#define REGFMT(name, num, bits, type) \
+ FMT(name, num, (num) * (bits) / 8, type, \
+ BITS(bits, bits, bits, bits), \
+ IDX(0, 1, 2, 3))
+
+#define EMUFMT(_name, in, en, ib, eb, ftype) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_##ftype, \
+ .num_components = en, \
+ .component_depth = BITS(ib, ib, ib, ib),\
+ .internal_size = (in) * (ib) / 8, \
+ .opaque = false, \
+ .emulated = true, \
+ .texel_size = (en) * (eb) / 8, \
+ .texel_align = (eb) / 8, \
+ .host_bits = BITS(eb, eb, eb, eb),\
+ .sample_order = IDX(0, 1, 2, 3), \
+ }
+
+#define PACKED16FMT(_name, num, b) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_UNORM, \
+ .num_components = num, \
+ .component_depth = BITS(b, b, b, b), \
+ .internal_size = (num) * 2, \
+ .texel_size = (num) * 2, \
+ .texel_align = (num) * 2, \
+ .host_bits = BITS(16, 16, 16, 16),\
+ .sample_order = IDX(0, 1, 2, 3), \
+ }
+
+#define PLANARFMT(_name, planes, size, bits) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_UNORM, \
+ .num_planes = planes, \
+ .num_components = 3, \
+ .component_depth = {bits, bits, bits}, \
+ .internal_size = size, \
+ .opaque = true, \
+ }
+
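+// Emulation fallbacks for the tightly packed 3-component formats: if the
+// native 3-component VkFormat is unusable for texturing, sampling and storage
+// go through the padded 4-component `tfmt`, while host-side transfers keep the
+// tightly packed 3-component layout (via `bfmt`)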
+static const struct vk_format rgb8e = {
+ .tfmt = VK_FORMAT_R8G8B8A8_UNORM,
+ .bfmt = VK_FORMAT_R8G8B8_UNORM,
+ .icomps = 4,
+ .fmt = EMUFMT("rgb8", 4, 3, 8, 8, UNORM),
+};
+
+static const struct vk_format rgb16e = {
+ .tfmt = VK_FORMAT_R16G16B16A16_UNORM,
+ .bfmt = VK_FORMAT_R16G16B16_UNORM,
+ .icomps = 4,
+ .fmt = EMUFMT("rgb16", 4, 3, 16, 16, UNORM),
+};
+
+static const struct vk_format vk_formats[] = {
+ // Regular, byte-aligned integer formats
+ {VK_FORMAT_R8_UNORM, REGFMT("r8", 1, 8, UNORM)},
+ {VK_FORMAT_R8G8_UNORM, REGFMT("rg8", 2, 8, UNORM)},
+ {VK_FORMAT_R8G8B8_UNORM, REGFMT("rgb8", 3, 8, UNORM), .emufmt = &rgb8e},
+ {VK_FORMAT_R8G8B8A8_UNORM, REGFMT("rgba8", 4, 8, UNORM)},
+ {VK_FORMAT_R16_UNORM, REGFMT("r16", 1, 16, UNORM)},
+ {VK_FORMAT_R16G16_UNORM, REGFMT("rg16", 2, 16, UNORM)},
+ {VK_FORMAT_R16G16B16_UNORM, REGFMT("rgb16", 3, 16, UNORM), .emufmt = &rgb16e},
+ {VK_FORMAT_R16G16B16A16_UNORM, REGFMT("rgba16", 4, 16, UNORM)},
+
+ {VK_FORMAT_R8_SNORM, REGFMT("r8s", 1, 8, SNORM)},
+ {VK_FORMAT_R8G8_SNORM, REGFMT("rg8s", 2, 8, SNORM)},
+ {VK_FORMAT_R8G8B8_SNORM, REGFMT("rgb8s", 3, 8, SNORM)},
+ {VK_FORMAT_R8G8B8A8_SNORM, REGFMT("rgba8s", 4, 8, SNORM)},
+ {VK_FORMAT_R16_SNORM, REGFMT("r16s", 1, 16, SNORM)},
+ {VK_FORMAT_R16G16_SNORM, REGFMT("rg16s", 2, 16, SNORM)},
+ {VK_FORMAT_R16G16B16_SNORM, REGFMT("rgb16s", 3, 16, SNORM)},
+ {VK_FORMAT_R16G16B16A16_SNORM, REGFMT("rgba16s", 4, 16, SNORM)},
+
+ // Float formats (native formats: hf = half float, df = double float)
+ {VK_FORMAT_R16_SFLOAT, REGFMT("r16hf", 1, 16, FLOAT)},
+ {VK_FORMAT_R16G16_SFLOAT, REGFMT("rg16hf", 2, 16, FLOAT)},
+ {VK_FORMAT_R16G16B16_SFLOAT, REGFMT("rgb16hf", 3, 16, FLOAT)},
+ {VK_FORMAT_R16G16B16A16_SFLOAT, REGFMT("rgba16hf", 4, 16, FLOAT)},
+ {VK_FORMAT_R32_SFLOAT, REGFMT("r32f", 1, 32, FLOAT)},
+ {VK_FORMAT_R32G32_SFLOAT, REGFMT("rg32f", 2, 32, FLOAT)},
+ {VK_FORMAT_R32G32B32_SFLOAT, REGFMT("rgb32f", 3, 32, FLOAT)},
+ {VK_FORMAT_R32G32B32A32_SFLOAT, REGFMT("rgba32f", 4, 32, FLOAT)},
+
+ // Float formats (emulated upload/download)
+ {VK_FORMAT_R16_SFLOAT, EMUFMT("r16f", 1, 1, 16, 32, FLOAT)},
+ {VK_FORMAT_R16G16_SFLOAT, EMUFMT("rg16f", 2, 2, 16, 32, FLOAT)},
+ {VK_FORMAT_R16G16B16_SFLOAT, EMUFMT("rgb16f", 3, 3, 16, 32, FLOAT)},
+ {VK_FORMAT_R16G16B16A16_SFLOAT, EMUFMT("rgba16f", 4, 4, 16, 32, FLOAT)},
+
+ // Integer-sampled formats
+ {VK_FORMAT_R8_UINT, REGFMT("r8u", 1, 8, UINT)},
+ {VK_FORMAT_R8G8_UINT, REGFMT("rg8u", 2, 8, UINT)},
+ {VK_FORMAT_R8G8B8_UINT, REGFMT("rgb8u", 3, 8, UINT)},
+ {VK_FORMAT_R8G8B8A8_UINT, REGFMT("rgba8u", 4, 8, UINT)},
+ {VK_FORMAT_R16_UINT, REGFMT("r16u", 1, 16, UINT)},
+ {VK_FORMAT_R16G16_UINT, REGFMT("rg16u", 2, 16, UINT)},
+ {VK_FORMAT_R16G16B16_UINT, REGFMT("rgb16u", 3, 16, UINT)},
+ {VK_FORMAT_R16G16B16A16_UINT, REGFMT("rgba16u", 4, 16, UINT)},
+ {VK_FORMAT_R32_UINT, REGFMT("r32u", 1, 32, UINT)},
+ {VK_FORMAT_R32G32_UINT, REGFMT("rg32u", 2, 32, UINT)},
+ {VK_FORMAT_R32G32B32_UINT, REGFMT("rgb32u", 3, 32, UINT)},
+ {VK_FORMAT_R32G32B32A32_UINT, REGFMT("rgba32u", 4, 32, UINT)},
+
+ {VK_FORMAT_R8_SINT, REGFMT("r8i", 1, 8, SINT)},
+ {VK_FORMAT_R8G8_SINT, REGFMT("rg8i", 2, 8, SINT)},
+ {VK_FORMAT_R8G8B8_SINT, REGFMT("rgb8i", 3, 8, SINT)},
+ {VK_FORMAT_R8G8B8A8_SINT, REGFMT("rgba8i", 4, 8, SINT)},
+ {VK_FORMAT_R16_SINT, REGFMT("r16i", 1, 16, SINT)},
+ {VK_FORMAT_R16G16_SINT, REGFMT("rg16i", 2, 16, SINT)},
+ {VK_FORMAT_R16G16B16_SINT, REGFMT("rgb16i", 3, 16, SINT)},
+ {VK_FORMAT_R16G16B16A16_SINT, REGFMT("rgba16i", 4, 16, SINT)},
+ {VK_FORMAT_R32_SINT, REGFMT("r32i", 1, 32, SINT)},
+ {VK_FORMAT_R32G32_SINT, REGFMT("rg32i", 2, 32, SINT)},
+ {VK_FORMAT_R32G32B32_SINT, REGFMT("rgb32i", 3, 32, SINT)},
+ {VK_FORMAT_R32G32B32A32_SINT, REGFMT("rgba32i", 4, 32, SINT)},
+
+ // "Swapped" component order formats
+ {VK_FORMAT_B8G8R8_UNORM, FMT("bgr8", 3, 3, UNORM, BITS(8, 8, 8), IDX(2, 1, 0))},
+ {VK_FORMAT_B8G8R8A8_UNORM, FMT("bgra8", 4, 4, UNORM, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))},
+
+ {VK_FORMAT_B8G8R8_UINT, FMT("bgr8u", 3, 3, UINT, BITS(8, 8, 8), IDX(2, 1, 0))},
+ {VK_FORMAT_B8G8R8A8_UINT, FMT("bgra8u", 4, 4, UINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))},
+
+ {VK_FORMAT_B8G8R8_SINT, FMT("bgr8i", 3, 3, SINT, BITS(8, 8, 8), IDX(2, 1, 0))},
+ {VK_FORMAT_B8G8R8A8_SINT, FMT("bgra8i", 4, 4, SINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))},
+
+ // "Packed" integer formats
+ //
+ // Note: These have the component order reversed from what the vulkan name
+ // implies, because we order our IDX from LSB to MSB (consistent with the
+ // usual ordering from lowest byte to highest byte, on little endian
+ // platforms), but Vulkan names them from MSB to LSB.
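+    //
+    // For example, VK_FORMAT_A2B10G10R10_UNORM_PACK32 stores A in the two most
+    // significant bits; read from LSB to MSB the components are R, G, B, A,
+    // so it is listed below as "rgb10a2" with IDX(0, 1, 2, 3).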
+ {VK_FORMAT_R4G4_UNORM_PACK8, FMT("gr4", 2, 1, UNORM, BITS(4, 4), IDX(1, 0))},
+ {VK_FORMAT_B4G4R4A4_UNORM_PACK16, FMT("argb4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 0, 1, 2))},
+ {VK_FORMAT_R4G4B4A4_UNORM_PACK16, FMT("abgr4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 2, 1, 0))},
+
+ {VK_FORMAT_R5G6B5_UNORM_PACK16, FMT("bgr565", 3, 2, UNORM, BITS(5, 6, 5), IDX(2, 1, 0))},
+ {VK_FORMAT_B5G6R5_UNORM_PACK16, FMT("rgb565", 3, 2, UNORM, BITS(5, 6, 5), IDX(0, 1, 2))},
+
+ {VK_FORMAT_R5G5B5A1_UNORM_PACK16, FMT("a1bgr5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 2, 1, 0))},
+ {VK_FORMAT_B5G5R5A1_UNORM_PACK16, FMT("a1rgb5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 0, 1, 2))},
+ {VK_FORMAT_A1R5G5B5_UNORM_PACK16, FMT("bgr5a1", 4, 2, UNORM, BITS(5, 5, 5, 1), IDX(2, 1, 0, 3))},
+
+ {VK_FORMAT_A2B10G10R10_UNORM_PACK32, FMT("rgb10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+ {VK_FORMAT_A2R10G10B10_UNORM_PACK32, FMT("bgr10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+ {VK_FORMAT_A2B10G10R10_SNORM_PACK32, FMT("rgb10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+ {VK_FORMAT_A2R10G10B10_SNORM_PACK32, FMT("bgr10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+ {VK_FORMAT_A2B10G10R10_UINT_PACK32, FMT("rgb10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+ {VK_FORMAT_A2R10G10B10_UINT_PACK32, FMT("bgr10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+ {VK_FORMAT_A2B10G10R10_SINT_PACK32, FMT("rgb10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+ {VK_FORMAT_A2R10G10B10_SINT_PACK32, FMT("bgr10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+
+
+ // Packed 16 bit formats
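+    // (The significant bits occupy the high bits of each 16-bit word; the
+    // "X6"/"X4" suffix denotes unused padding bits, which is why PACKED16FMT
+    // reports host_bits of 16.)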
+ {VK_FORMAT_R10X6_UNORM_PACK16, PACKED16FMT("rx10", 1, 10)},
+ {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, PACKED16FMT("rxgx10", 2, 10)},
+ {VK_FORMAT_R12X4_UNORM_PACK16, PACKED16FMT("rx12", 1, 12)},
+ {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, PACKED16FMT("rxgx12", 2, 12)},
+
+ // FIXME: enabling these requires VK_EXT_rgba10x6_formats or equivalent
+ // {VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16, PACKED16FMT("rxgxbxax10", 4, 10)},
+ // {VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16, PACKED16FMT("rxgxbxax12", 4, 12)},
+
+ // Planar formats
+ {VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, PLANARFMT("g8_b8_r8_420", 3, 12, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1},
+ {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, PLANARFMT("g8_b8_r8_422", 3, 16, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8_UNORM, .sx = 1},
+ {VK_FORMAT_R8_UNORM, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, PLANARFMT("g8_b8_r8_444", 3, 24, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8_UNORM},
+ },
+ },
+
+ {VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, PLANARFMT("g16_b16_r16_420", 3, 24, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1},
+ {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, PLANARFMT("g16_b16_r16_422", 3, 32, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16_UNORM, .sx = 1},
+ {VK_FORMAT_R16_UNORM, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, PLANARFMT("g16_b16_r16_444", 3, 48, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16_UNORM},
+ },
+ },
+
+ {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_420", 3, 24, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1},
+ {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_422", 3, 32, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1},
+ {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_444", 3, 48, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ },
+ },
+
+ {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_420", 3, 24, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1},
+ {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_422", 3, 32, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1},
+ {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_444", 3, 48, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ },
+ },
+
+ {VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, PLANARFMT("g8_br8_420", 2, 12, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8G8_UNORM, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, PLANARFMT("g8_br8_422", 2, 16, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8G8_UNORM, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G8_B8R8_2PLANE_444_UNORM, PLANARFMT("g8_br8_444", 2, 24, 8),
+ .min_ver = VK_API_VERSION_1_3,
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8G8_UNORM},
+ },
+ },
+
+ {VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, PLANARFMT("g16_br16_420", 2, 24, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16G16_UNORM, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, PLANARFMT("g16_br16_422", 2, 32, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16G16_UNORM, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G16_B16R16_2PLANE_444_UNORM, PLANARFMT("g16_br16_444", 2, 48, 16),
+ .min_ver = VK_API_VERSION_1_3,
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16G16_UNORM},
+ },
+ },
+
+ {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_420", 2, 24, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_422", 2, 32, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_444", 2, 48, 10),
+ .min_ver = VK_API_VERSION_1_3,
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6G10X6_UNORM_2PACK16},
+ },
+ },
+
+ {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_420", 2, 24, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_422", 2, 32, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_444", 2, 48, 12),
+ .min_ver = VK_API_VERSION_1_3,
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4G12X4_UNORM_2PACK16},
+ },
+ },
+
+ {0}
+};
+
+#undef BITS
+#undef IDX
+#undef REGFMT
+#undef FMT
+
+void vk_setup_formats(struct pl_gpu_t *gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ PL_ARRAY(pl_fmt) formats = {0};
+
+    // Texture format emulation requires compute shaders and texel buffer support
+ bool has_emu = gpu->glsl.compute && gpu->limits.max_buffer_texels;
+
+ for (const struct vk_format *pvk_fmt = vk_formats; pvk_fmt->tfmt; pvk_fmt++) {
+ const struct vk_format *vk_fmt = pvk_fmt;
+
+ // Skip formats that require a too new version of Vulkan
+ if (vk_fmt->min_ver > vk->api_ver)
+ continue;
+
+ // Skip formats with innately emulated representation if unsupported
+ if (vk_fmt->fmt.emulated && !has_emu)
+ continue;
+
+ // Suppress some errors/warnings spit out by the format probing code
+ pl_log_level_cap(vk->log, PL_LOG_INFO);
+
+ bool has_drm_mods = vk->GetImageDrmFormatModifierPropertiesEXT;
+ VkDrmFormatModifierPropertiesEXT modifiers[16] = {0};
+ VkDrmFormatModifierPropertiesListEXT drm_props = {
+ .sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT,
+ .drmFormatModifierCount = PL_ARRAY_SIZE(modifiers),
+ .pDrmFormatModifierProperties = modifiers,
+ };
+
+ VkFormatProperties2KHR prop2 = {
+ .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
+ .pNext = has_drm_mods ? &drm_props : NULL,
+ };
+
+ vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2);
+
+ // If wholly unsupported, try falling back to the emulation formats
+ // for texture operations
+ VkFormatProperties *prop = &prop2.formatProperties;
+ while (has_emu && !prop->optimalTilingFeatures && vk_fmt->emufmt) {
+ vk_fmt = vk_fmt->emufmt;
+ vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2);
+ }
+
+ VkFormatFeatureFlags texflags = prop->optimalTilingFeatures;
+ VkFormatFeatureFlags bufflags = prop->bufferFeatures;
+ if (vk_fmt->fmt.emulated) {
+            // Emulated formats might have a different buffer representation
+            // than their texture representation. If no separate buffer format
+            // is given, assume the buffer representation is nonsensical (e.g. r16f)
+ if (vk_fmt->bfmt) {
+ vk->GetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->bfmt, prop);
+ bufflags = prop->bufferFeatures;
+ } else {
+ bufflags = 0;
+ }
+ } else if (vk_fmt->fmt.num_planes) {
+ // Planar textures cannot be used directly
+ texflags = bufflags = 0;
+ }
+
+ pl_log_level_cap(vk->log, PL_LOG_NONE);
+
+ struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct pl_fmt_vk);
+ struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+ *fmt = vk_fmt->fmt;
+ *fmtp = (struct pl_fmt_vk) {
+ .vk_fmt = vk_fmt
+ };
+
+ // Always set the signature to the actual texture format, so we can use
+ // it to guarantee renderpass compatibility.
+ fmt->signature = (uint64_t) vk_fmt->tfmt;
+
+ // For sanity, clear the superfluous fields
+ for (int i = fmt->num_components; i < 4; i++) {
+ fmt->component_depth[i] = 0;
+ fmt->sample_order[i] = 0;
+ fmt->host_bits[i] = 0;
+ }
+
+ // We can set this universally
+ fmt->fourcc = pl_fmt_fourcc(fmt);
+
+ if (has_drm_mods) {
+
+ if (drm_props.drmFormatModifierCount == PL_ARRAY_SIZE(modifiers)) {
+ PL_WARN(gpu, "DRM modifier list for format %s possibly truncated",
+ fmt->name);
+ }
+
+ // Query the list of supported DRM modifiers from the driver
+ PL_ARRAY(uint64_t) modlist = {0};
+ for (int i = 0; i < drm_props.drmFormatModifierCount; i++) {
+ if (modifiers[i].drmFormatModifierPlaneCount > 1) {
+ PL_TRACE(gpu, "Ignoring format modifier %s of "
+ "format %s because its plane count %d > 1",
+ PRINT_DRM_MOD(modifiers[i].drmFormatModifier),
+ fmt->name, modifiers[i].drmFormatModifierPlaneCount);
+ continue;
+ }
+
+ // Only warn about texture format features relevant to us
+ const VkFormatFeatureFlags flag_mask =
+ VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT |
+ VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
+ VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT |
+ VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_BLIT_DST_BIT;
+
+
+ VkFormatFeatureFlags flags = modifiers[i].drmFormatModifierTilingFeatures;
+ if ((flags & flag_mask) != (texflags & flag_mask)) {
+ PL_DEBUG(gpu, "DRM format modifier %s of format %s "
+ "supports fewer caps (0x%"PRIx32") than optimal tiling "
+ "(0x%"PRIx32"), may result in limited capability!",
+ PRINT_DRM_MOD(modifiers[i].drmFormatModifier),
+ fmt->name, flags, texflags);
+ }
+
+ PL_ARRAY_APPEND(fmt, modlist, modifiers[i].drmFormatModifier);
+ }
+
+ fmt->num_modifiers = modlist.num;
+ fmt->modifiers = modlist.elem;
+
+ } else if (gpu->export_caps.tex & PL_HANDLE_DMA_BUF) {
+
+ // Hard-code a list of static mods that we're likely to support
+ static const uint64_t static_mods[2] = {
+ DRM_FORMAT_MOD_INVALID,
+ DRM_FORMAT_MOD_LINEAR,
+ };
+
+ fmt->num_modifiers = PL_ARRAY_SIZE(static_mods);
+ fmt->modifiers = static_mods;
+
+ }
+
+ struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bufbits[] = {
+ {VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT, PL_FMT_CAP_VERTEX},
+ {VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_UNIFORM},
+ {VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_STORAGE},
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(bufbits); i++) {
+ if ((bufflags & bufbits[i].flags) == bufbits[i].flags)
+ fmt->caps |= bufbits[i].caps;
+ }
+
+ if (fmt->caps) {
+ fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, ""));
+ pl_assert(fmt->glsl_type);
+ }
+
+ struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bits[] = {
+ {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT, PL_FMT_CAP_BLENDABLE},
+ {VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, PL_FMT_CAP_LINEAR},
+ {VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT, PL_FMT_CAP_SAMPLEABLE},
+ {VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT, PL_FMT_CAP_STORABLE},
+ {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT, PL_FMT_CAP_RENDERABLE},
+
+ // We don't distinguish between the two blit modes for pl_fmt_caps
+ {VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT,
+ PL_FMT_CAP_BLITTABLE},
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(bits); i++) {
+ if ((texflags & bits[i].flags) == bits[i].flags)
+ fmt->caps |= bits[i].caps;
+ }
+
+ // For blit emulation via compute shaders
+ if (!(fmt->caps & PL_FMT_CAP_BLITTABLE) && (fmt->caps & PL_FMT_CAP_STORABLE)) {
+ fmt->caps |= PL_FMT_CAP_BLITTABLE;
+ fmtp->blit_emulated = true;
+ }
+
+ // This is technically supported for all textures, but the semantics
+ // of pl_gpu require it only be listed for non-opaque ones
+ if (!fmt->opaque)
+ fmt->caps |= PL_FMT_CAP_HOST_READABLE;
+
+ // Vulkan requires a minimum GLSL version that supports textureGather()
+ if (fmt->caps & PL_FMT_CAP_SAMPLEABLE)
+ fmt->gatherable = true;
+
+ // Disable implied capabilities where the dependencies are unavailable
+ enum pl_fmt_caps storable = PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE;
+ if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE))
+ fmt->caps &= ~PL_FMT_CAP_LINEAR;
+ if (!gpu->glsl.compute)
+ fmt->caps &= ~storable;
+
+ bool has_nofmt = vk->features.features.shaderStorageImageReadWithoutFormat &&
+ vk->features.features.shaderStorageImageWriteWithoutFormat;
+
+ if (fmt->caps & storable) {
+ int real_comps = PL_DEF(vk_fmt->icomps, fmt->num_components);
+ fmt->glsl_format = pl_fmt_glsl_format(fmt, real_comps);
+ if (!fmt->glsl_format && !has_nofmt) {
+ PL_DEBUG(gpu, "Storable format '%s' has no matching GLSL "
+ "format qualifier but read/write without format "
+                          "is not supported; disabling", fmt->name);
+ fmt->caps &= ~storable;
+ }
+ }
+
+ if (fmt->caps & storable)
+ fmt->caps |= PL_FMT_CAP_READWRITE;
+
+ // Pick sub-plane formats for planar formats
+ for (int n = 0; n < fmt->num_planes; n++) {
+ for (int i = 0; i < formats.num; i++) {
+ if (formats.elem[i]->signature == vk_fmt->pfmt[n].fmt) {
+ fmt->planes[n].format = formats.elem[i];
+ fmt->planes[n].shift_x = vk_fmt->pfmt[n].sx;
+ fmt->planes[n].shift_y = vk_fmt->pfmt[n].sy;
+ break;
+ }
+ }
+
+ pl_assert(fmt->planes[n].format);
+ }
+
+ PL_ARRAY_APPEND(gpu, formats, fmt);
+ }
+
+ gpu->formats = formats.elem;
+ gpu->num_formats = formats.num;
+}
diff --git a/src/vulkan/formats.h b/src/vulkan/formats.h
new file mode 100644
index 0000000..b1408fd
--- /dev/null
+++ b/src/vulkan/formats.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "gpu.h"
+
+struct vk_format {
+ VkFormat tfmt; // internal vulkan format enum (textures)
+ struct pl_fmt_t fmt;// pl_fmt template (features will be auto-detected)
+ int icomps; // internal component count (or 0 to infer from `fmt`)
+ VkFormat bfmt; // vulkan format for use as buffers (or 0 to use `tfmt`)
+ const struct vk_format *emufmt; // alternate format for emulation
+ uint32_t min_ver; // minimum vulkan API version for this format to exist
+ struct { VkFormat fmt; int sx, sy; } pfmt[4]; // plane formats (for planar textures)
+};
+
+// Add all supported formats to the `pl_gpu` format list
+void vk_setup_formats(struct pl_gpu_t *gpu);
diff --git a/src/vulkan/gpu.c b/src/vulkan/gpu.c
new file mode 100644
index 0000000..69aca67
--- /dev/null
+++ b/src/vulkan/gpu.c
@@ -0,0 +1,924 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "formats.h"
+#include "glsl/spirv.h"
+
+#ifdef PL_HAVE_UNIX
+#include <unistd.h>
+#endif
+
+// Gives us enough queries for 8 results
+#define QUERY_POOL_SIZE 16
+
+struct pl_timer_t {
+ VkQueryPool qpool; // even=start, odd=stop
+ int index_write; // next index to write to
+ int index_read; // next index to read from
+ uint64_t pending; // bitmask of queries that are still running
+};
+
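+// Each result occupies a pair of queries (even = start, odd = stop), so the
+// pool acts as a ring buffer of QUERY_POOL_SIZE / 2 in-flight measurements,
+// with outstanding pairs tracked by the `pending` bitmask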
+static inline uint64_t timer_bit(int index)
+{
+ return 1llu << (index / 2);
+}
+
+static void timer_destroy_cb(pl_gpu gpu, pl_timer timer)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_assert(!timer->pending);
+ vk->DestroyQueryPool(vk->dev, timer->qpool, PL_VK_ALLOC);
+ pl_free(timer);
+}
+
+static pl_timer vk_timer_create(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_timer timer = pl_alloc_ptr(NULL, timer);
+ *timer = (struct pl_timer_t) {0};
+
+ struct VkQueryPoolCreateInfo qinfo = {
+ .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+ .queryType = VK_QUERY_TYPE_TIMESTAMP,
+ .queryCount = QUERY_POOL_SIZE,
+ };
+
+ VK(vk->CreateQueryPool(vk->dev, &qinfo, PL_VK_ALLOC, &timer->qpool));
+ return timer;
+
+error:
+ timer_destroy_cb(gpu, timer);
+ return NULL;
+}
+
+static void vk_timer_destroy(pl_gpu gpu, pl_timer timer)
+{
+ vk_gpu_idle_callback(gpu, (vk_cb) timer_destroy_cb, gpu, timer);
+}
+
+static uint64_t vk_timer_query(pl_gpu gpu, pl_timer timer)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ if (timer->index_read == timer->index_write)
+ return 0; // no more unprocessed results
+
+ vk_poll_commands(vk, 0);
+ if (timer->pending & timer_bit(timer->index_read))
+ return 0; // still waiting for results
+
+ VkResult res;
+ uint64_t ts[2] = {0};
+ res = vk->GetQueryPoolResults(vk->dev, timer->qpool, timer->index_read, 2,
+ sizeof(ts), &ts[0], sizeof(uint64_t),
+ VK_QUERY_RESULT_64_BIT);
+
+ switch (res) {
+ case VK_SUCCESS:
+ timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE;
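+        // timestampPeriod is the duration of one timestamp tick in
+        // nanoseconds, so this yields the elapsed GPU time in ns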
+ return (ts[1] - ts[0]) * vk->props.limits.timestampPeriod;
+ case VK_NOT_READY:
+ return 0;
+ default:
+ PL_VK_ASSERT(res, "Retrieving query pool results");
+ }
+
+error:
+ return 0;
+}
+
+static void timer_begin(pl_gpu gpu, struct vk_cmd *cmd, pl_timer timer)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ if (!timer)
+ return;
+
+ if (!cmd->pool->props.timestampValidBits) {
+ PL_TRACE(gpu, "QF %d does not support timestamp queries", cmd->pool->qf);
+ return;
+ }
+
+ vk_poll_commands(vk, 0);
+ if (timer->pending & timer_bit(timer->index_write))
+ return; // next query is still running, skip this timer
+
+ VkQueueFlags reset_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+ if (cmd->pool->props.queueFlags & reset_flags) {
+ // Use direct command buffer resets
+ vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2);
+ } else {
+ // Use host query reset
+ vk->ResetQueryPool(vk->dev, timer->qpool, timer->index_write, 2);
+ }
+
+ vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+ timer->qpool, timer->index_write);
+
+ p->cmd_timer = timer;
+}
+
+static inline bool supports_marks(struct vk_cmd *cmd) {
+ // Spec says debug markers are only available on graphics/compute queues
+ VkQueueFlags flags = cmd->pool->props.queueFlags;
+ return flags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT);
+}
+
+struct vk_cmd *_begin_cmd(pl_gpu gpu, enum queue_type type, const char *label,
+ pl_timer timer)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ pl_mutex_lock(&p->recording);
+
+ struct vk_cmdpool *pool;
+ switch (type) {
+ case ANY: pool = p->cmd ? p->cmd->pool : vk->pool_graphics; break;
+ case GRAPHICS: pool = vk->pool_graphics; break;
+ case COMPUTE: pool = vk->pool_compute; break;
+ case TRANSFER: pool = vk->pool_transfer; break;
+ default: pl_unreachable();
+ }
+
+ if (!p->cmd || p->cmd->pool != pool) {
+ vk_cmd_submit(&p->cmd);
+ p->cmd = vk_cmd_begin(pool, label);
+ if (!p->cmd) {
+ pl_mutex_unlock(&p->recording);
+ return NULL;
+ }
+ }
+
+ if (vk->CmdBeginDebugUtilsLabelEXT && supports_marks(p->cmd)) {
+ vk->CmdBeginDebugUtilsLabelEXT(p->cmd->buf, &(VkDebugUtilsLabelEXT) {
+ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
+ .pLabelName = label,
+ });
+ }
+
+ timer_begin(gpu, p->cmd, timer);
+ return p->cmd;
+}
+
+static void timer_end_cb(void *ptimer, void *pindex)
+{
+ pl_timer timer = ptimer;
+ int index = (uintptr_t) pindex;
+ timer->pending &= ~timer_bit(index);
+}
+
+bool _end_cmd(pl_gpu gpu, struct vk_cmd **pcmd, bool submit)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ bool ret = true;
+ if (!pcmd) {
+ if (submit) {
+ pl_mutex_lock(&p->recording);
+ ret = vk_cmd_submit(&p->cmd);
+ pl_mutex_unlock(&p->recording);
+ }
+ return ret;
+ }
+
+ struct vk_cmd *cmd = *pcmd;
+ pl_assert(p->cmd == cmd);
+
+ if (p->cmd_timer) {
+ pl_timer timer = p->cmd_timer;
+ vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+ timer->qpool, timer->index_write + 1);
+
+ timer->pending |= timer_bit(timer->index_write);
+ vk_cmd_callback(cmd, (vk_cb) timer_end_cb, timer,
+ (void *) (uintptr_t) timer->index_write);
+
+ timer->index_write = (timer->index_write + 2) % QUERY_POOL_SIZE;
+ if (timer->index_write == timer->index_read) {
+ // forcibly drop the least recent result to make space
+ timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE;
+ }
+
+ p->cmd_timer = NULL;
+ }
+
+ if (vk->CmdEndDebugUtilsLabelEXT && supports_marks(cmd))
+ vk->CmdEndDebugUtilsLabelEXT(cmd->buf);
+
+ if (submit)
+ ret = vk_cmd_submit(&p->cmd);
+
+ pl_mutex_unlock(&p->recording);
+ return ret;
+}
+
+void vk_gpu_idle_callback(pl_gpu gpu, vk_cb cb, const void *priv, const void *arg)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_mutex_lock(&p->recording);
+ if (p->cmd) {
+ vk_cmd_callback(p->cmd, cb, priv, arg);
+ } else {
+ vk_dev_callback(vk, cb, priv, arg);
+ }
+ pl_mutex_unlock(&p->recording);
+}
+
+static void vk_gpu_destroy(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ vk_cmd_submit(&p->cmd);
+ vk_wait_idle(vk);
+
+ for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) {
+ for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++)
+ vk->DestroySampler(vk->dev, p->samplers[s][a], PL_VK_ALLOC);
+ }
+
+ pl_spirv_destroy(&p->spirv);
+ pl_mutex_destroy(&p->recording);
+ pl_free((void *) gpu);
+}
+
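+// Recover the pl_vulkan instance backing a pl_gpu: the dispatch table of a
+// Vulkan-backed GPU points at this backend's vk_gpu_destroy, so any other
+// backend fails the check and yields NULL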
+pl_vulkan pl_vulkan_get(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ if (impl->destroy == vk_gpu_destroy) {
+ struct pl_vk *p = (struct pl_vk *) impl;
+ return p->vk->vulkan;
+ }
+
+ return NULL;
+}
+
+static pl_handle_caps vk_sync_handle_caps(struct vk_ctx *vk)
+{
+ pl_handle_caps caps = 0;
+
+ for (int i = 0; vk_sync_handle_list[i]; i++) {
+ enum pl_handle_type type = vk_sync_handle_list[i];
+
+ VkPhysicalDeviceExternalSemaphoreInfo info = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR,
+ .handleType = vk_sync_handle_type(type),
+ };
+
+ VkExternalSemaphoreProperties props = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR,
+ };
+
+ vk->GetPhysicalDeviceExternalSemaphoreProperties(vk->physd, &info, &props);
+ VkExternalSemaphoreFeatureFlags flags = props.externalSemaphoreFeatures;
+ if ((props.compatibleHandleTypes & info.handleType) &&
+ (flags & VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR))
+ {
+ caps |= type;
+ }
+ }
+
+ return caps;
+}
+
+static pl_handle_caps vk_tex_handle_caps(struct vk_ctx *vk, bool import)
+{
+ pl_handle_caps caps = 0;
+
+ for (int i = 0; vk_mem_handle_list[i]; i++) {
+ enum pl_handle_type handle_type = vk_mem_handle_list[i];
+ if (handle_type == PL_HANDLE_DMA_BUF && !vk->GetImageDrmFormatModifierPropertiesEXT) {
+ PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: no DRM modifiers",
+ vk_handle_name(vk_mem_handle_type(PL_HANDLE_DMA_BUF)),
+ (unsigned int) PL_HANDLE_DMA_BUF);
+ continue;
+ }
+
+ // Query whether creation of a "basic" dummy texture would work
+ VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+ .drmFormatModifier = DRM_FORMAT_MOD_LINEAR,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ };
+
+ VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR,
+ .handleType = vk_mem_handle_type(handle_type),
+ };
+
+ VkPhysicalDeviceImageFormatInfo2KHR pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
+ .pNext = &ext_pinfo,
+ .format = VK_FORMAT_R8_UNORM,
+ .type = VK_IMAGE_TYPE_2D,
+ .tiling = VK_IMAGE_TILING_OPTIMAL,
+ .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+ };
+
+ if (handle_type == PL_HANDLE_DMA_BUF) {
+ vk_link_struct(&pinfo, &drm_pinfo);
+ pinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ }
+
+ VkExternalImageFormatPropertiesKHR ext_props = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
+ };
+
+ VkImageFormatProperties2KHR props = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR,
+ .pNext = &ext_props,
+ };
+
+ VkResult res;
+ res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props);
+ if (res != VK_SUCCESS) {
+ PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: %s",
+ vk_handle_name(ext_pinfo.handleType),
+ (unsigned int) handle_type,
+ vk_res_str(res));
+ continue;
+ }
+
+ if (vk_external_mem_check(vk, &ext_props.externalMemoryProperties,
+ handle_type, import))
+ {
+ caps |= handle_type;
+ }
+ }
+
+#ifdef VK_EXT_metal_objects
+ if (vk->ExportMetalObjectsEXT && import)
+ caps |= PL_HANDLE_MTL_TEX | PL_HANDLE_IOSURFACE;
+#endif
+
+ return caps;
+}
+
+static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = {
+ [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
+ [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR,
+};
+
+static inline struct pl_spirv_version get_spirv_version(const struct vk_ctx *vk)
+{
+ if (vk->api_ver >= VK_API_VERSION_1_3) {
+ const VkPhysicalDeviceMaintenance4Features *device_maintenance4;
+ device_maintenance4 = vk_find_struct(&vk->features,
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES);
+
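+        // SPIR-V 1.6 shaders may declare their workgroup size via LocalSizeId,
+        // which Vulkan only permits with the maintenance4 feature enabled, so
+        // only advertise SPIR-V 1.6 when maintenance4 is present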
+ if (device_maintenance4 && device_maintenance4->maintenance4) {
+ return (struct pl_spirv_version) {
+ .env_version = VK_API_VERSION_1_3,
+ .spv_version = PL_SPV_VERSION(1, 6),
+ };
+ }
+ }
+
+ pl_assert(vk->api_ver >= VK_API_VERSION_1_2);
+ return (struct pl_spirv_version) {
+ .env_version = VK_API_VERSION_1_2,
+ .spv_version = PL_SPV_VERSION(1, 5),
+ };
+}
+
+static const struct pl_gpu_fns pl_fns_vk;
+
+pl_gpu pl_gpu_create_vk(struct vk_ctx *vk)
+{
+ pl_assert(vk->dev);
+
+ struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_vk);
+ gpu->log = vk->log;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ pl_mutex_init(&p->recording);
+ p->vk = vk;
+ p->impl = pl_fns_vk;
+ p->spirv = pl_spirv_create(vk->log, get_spirv_version(vk));
+ if (!p->spirv)
+ goto error;
+
+ // Query all device properties
+ VkPhysicalDevicePCIBusInfoPropertiesEXT pci_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT,
+ };
+
+ VkPhysicalDeviceIDPropertiesKHR id_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+ .pNext = &pci_props,
+ };
+
+ VkPhysicalDevicePushDescriptorPropertiesKHR pushd_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR,
+ .pNext = &id_props,
+ };
+
+ VkPhysicalDeviceSubgroupProperties group_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES,
+ .pNext = &pushd_props,
+ };
+
+ VkPhysicalDeviceExternalMemoryHostPropertiesEXT host_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT,
+ .pNext = &group_props,
+ };
+
+ VkPhysicalDeviceProperties2KHR props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+ .pNext = &host_props,
+ };
+
+ bool is_portability = false;
+
+#ifdef VK_KHR_portability_subset
+ VkPhysicalDevicePortabilitySubsetPropertiesKHR port_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PORTABILITY_SUBSET_PROPERTIES_KHR,
+ .minVertexInputBindingStrideAlignment = 1,
+ };
+
+ for (int i = 0; i < vk->exts.num; i++) {
+ if (!strcmp(vk->exts.elem[i], VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME)) {
+ vk_link_struct(&props, &port_props);
+ is_portability = true;
+ break;
+ }
+ }
+#endif
+
+ vk->GetPhysicalDeviceProperties2(vk->physd, &props);
+ VkPhysicalDeviceLimits limits = props.properties.limits;
+
+ // Determine GLSL features and limits
+ gpu->glsl = (struct pl_glsl_version) {
+ .version = 450,
+ .vulkan = true,
+ .compute = true,
+ .max_shmem_size = limits.maxComputeSharedMemorySize,
+ .max_group_threads = limits.maxComputeWorkGroupInvocations,
+ .max_group_size = {
+ limits.maxComputeWorkGroupSize[0],
+ limits.maxComputeWorkGroupSize[1],
+ limits.maxComputeWorkGroupSize[2],
+ },
+ };
+
+ VkShaderStageFlags req_stages = VK_SHADER_STAGE_FRAGMENT_BIT |
+ VK_SHADER_STAGE_COMPUTE_BIT;
+ VkSubgroupFeatureFlags req_flags = VK_SUBGROUP_FEATURE_BASIC_BIT |
+ VK_SUBGROUP_FEATURE_VOTE_BIT |
+ VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
+ VK_SUBGROUP_FEATURE_BALLOT_BIT |
+ VK_SUBGROUP_FEATURE_SHUFFLE_BIT;
+
+ if ((group_props.supportedStages & req_stages) == req_stages &&
+ (group_props.supportedOperations & req_flags) == req_flags)
+ {
+ gpu->glsl.subgroup_size = group_props.subgroupSize;
+ }
+
+ if (vk->features.features.shaderImageGatherExtended) {
+ gpu->glsl.min_gather_offset = limits.minTexelGatherOffset;
+ gpu->glsl.max_gather_offset = limits.maxTexelGatherOffset;
+ }
+
+ const size_t max_size = vk_malloc_avail(vk->ma, 0);
+ gpu->limits = (struct pl_gpu_limits) {
+ // pl_gpu
+ .thread_safe = true,
+ .callbacks = true,
+ // pl_buf
+ .max_buf_size = max_size,
+ .max_ubo_size = PL_MIN(limits.maxUniformBufferRange, max_size),
+ .max_ssbo_size = PL_MIN(limits.maxStorageBufferRange, max_size),
+ .max_vbo_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT),
+ .max_mapped_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT),
+ .max_buffer_texels = PL_MIN(limits.maxTexelBufferElements, max_size),
+ .align_host_ptr = host_props.minImportedHostPointerAlignment,
+ .host_cached = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_CACHED_BIT),
+ // pl_tex
+ .max_tex_1d_dim = limits.maxImageDimension1D,
+ .max_tex_2d_dim = limits.maxImageDimension2D,
+ .max_tex_3d_dim = limits.maxImageDimension3D,
+ .blittable_1d_3d = true,
+ .buf_transfer = true,
+ .align_tex_xfer_pitch = limits.optimalBufferCopyRowPitchAlignment,
+ .align_tex_xfer_offset = pl_lcm(limits.optimalBufferCopyOffsetAlignment, 4),
+ // pl_pass
+ .max_variable_comps = 0, // vulkan doesn't support these at all
+ .max_constants = SIZE_MAX,
+ .array_size_constants = !is_portability,
+ .max_pushc_size = limits.maxPushConstantsSize,
+#ifdef VK_KHR_portability_subset
+ .align_vertex_stride = port_props.minVertexInputBindingStrideAlignment,
+#else
+ .align_vertex_stride = 1,
+#endif
+ .max_dispatch = {
+ limits.maxComputeWorkGroupCount[0],
+ limits.maxComputeWorkGroupCount[1],
+ limits.maxComputeWorkGroupCount[2],
+ },
+ .fragment_queues = vk->pool_graphics->num_queues,
+ .compute_queues = vk->pool_compute->num_queues,
+ };
+
+ gpu->export_caps.buf = vk_malloc_handle_caps(vk->ma, false);
+ gpu->import_caps.buf = vk_malloc_handle_caps(vk->ma, true);
+ gpu->export_caps.tex = vk_tex_handle_caps(vk, false);
+ gpu->import_caps.tex = vk_tex_handle_caps(vk, true);
+ gpu->export_caps.sync = vk_sync_handle_caps(vk);
+ gpu->import_caps.sync = 0; // Not supported yet
+
+ if (pl_gpu_supports_interop(gpu)) {
+ pl_static_assert(sizeof(gpu->uuid) == VK_UUID_SIZE);
+ memcpy(gpu->uuid, id_props.deviceUUID, sizeof(gpu->uuid));
+
+ gpu->pci.domain = pci_props.pciDomain;
+ gpu->pci.bus = pci_props.pciBus;
+ gpu->pci.device = pci_props.pciDevice;
+ gpu->pci.function = pci_props.pciFunction;
+ }
+
+ if (vk->CmdPushDescriptorSetKHR)
+ p->max_push_descriptors = pushd_props.maxPushDescriptors;
+
+ vk_setup_formats(gpu);
+
+ // Compute the correct minimum texture alignment
+ p->min_texel_alignment = 1;
+ for (int i = 0; i < gpu->num_formats; i++) {
+ if (gpu->formats[i]->emulated || gpu->formats[i]->opaque)
+ continue;
+ size_t texel_size = gpu->formats[i]->texel_size;
+ p->min_texel_alignment = pl_lcm(p->min_texel_alignment, texel_size);
+ }
+ PL_DEBUG(gpu, "Minimum texel alignment: %zu", p->min_texel_alignment);
+
+ // Initialize the samplers
+ for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) {
+ for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) {
+ static const VkSamplerAddressMode modes[PL_TEX_ADDRESS_MODE_COUNT] = {
+ [PL_TEX_ADDRESS_CLAMP] = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
+ [PL_TEX_ADDRESS_REPEAT] = VK_SAMPLER_ADDRESS_MODE_REPEAT,
+ [PL_TEX_ADDRESS_MIRROR] = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT,
+ };
+
+ VkSamplerCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
+ .magFilter = filters[s],
+ .minFilter = filters[s],
+ .addressModeU = modes[a],
+ .addressModeV = modes[a],
+ .addressModeW = modes[a],
+ .maxAnisotropy = 1.0,
+ };
+
+ VK(vk->CreateSampler(vk->dev, &sinfo, PL_VK_ALLOC, &p->samplers[s][a]));
+ }
+ }
+
+ return pl_gpu_finalize(gpu);
+
+error:
+ vk_gpu_destroy(gpu);
+ return NULL;
+}
+
+static void vk_sync_destroy(pl_gpu gpu, pl_sync sync)
+{
+ if (!sync)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+
+#ifdef PL_HAVE_UNIX
+ if (sync->handle_type == PL_HANDLE_FD) {
+ if (sync->wait_handle.fd > -1)
+ close(sync->wait_handle.fd);
+ if (sync->signal_handle.fd > -1)
+ close(sync->signal_handle.fd);
+ }
+#endif
+#ifdef PL_HAVE_WIN32
+ if (sync->handle_type == PL_HANDLE_WIN32) {
+ if (sync->wait_handle.handle != NULL)
+ CloseHandle(sync->wait_handle.handle);
+ if (sync->signal_handle.handle != NULL)
+ CloseHandle(sync->signal_handle.handle);
+ }
+ // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+#endif
+
+ vk->DestroySemaphore(vk->dev, sync_vk->wait, PL_VK_ALLOC);
+ vk->DestroySemaphore(vk->dev, sync_vk->signal, PL_VK_ALLOC);
+
+ pl_free((void *) sync);
+}
+
+void vk_sync_deref(pl_gpu gpu, pl_sync sync)
+{
+ if (!sync)
+ return;
+
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+ if (pl_rc_deref(&sync_vk->rc))
+ vk_sync_destroy(gpu, sync);
+}
+
+static pl_sync vk_sync_create(pl_gpu gpu, enum pl_handle_type handle_type)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ struct pl_sync_t *sync = pl_zalloc_obj(NULL, sync, struct pl_sync_vk);
+ sync->handle_type = handle_type;
+
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+ pl_rc_init(&sync_vk->rc);
+
+ VkExportSemaphoreCreateInfoKHR einfo = {
+ .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR,
+ .handleTypes = vk_sync_handle_type(handle_type),
+ };
+
+ switch (handle_type) {
+ case PL_HANDLE_FD:
+ sync->wait_handle.fd = -1;
+ sync->signal_handle.fd = -1;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ sync->wait_handle.handle = NULL;
+ sync->signal_handle.handle = NULL;
+ break;
+ case PL_HANDLE_DMA_BUF:
+ case PL_HANDLE_HOST_PTR:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ pl_unreachable();
+ }
+
+ const VkSemaphoreCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ .pNext = &einfo,
+ };
+
+ VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->wait));
+ VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->signal));
+ PL_VK_NAME(SEMAPHORE, sync_vk->wait, "sync wait");
+ PL_VK_NAME(SEMAPHORE, sync_vk->signal, "sync signal");
+
+#ifdef PL_HAVE_UNIX
+ if (handle_type == PL_HANDLE_FD) {
+ VkSemaphoreGetFdInfoKHR finfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+ .semaphore = sync_vk->wait,
+ .handleType = einfo.handleTypes,
+ };
+
+ VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->wait_handle.fd));
+
+ finfo.semaphore = sync_vk->signal;
+ VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->signal_handle.fd));
+ }
+#endif
+
+#ifdef PL_HAVE_WIN32
+ if (handle_type == PL_HANDLE_WIN32 ||
+ handle_type == PL_HANDLE_WIN32_KMT)
+ {
+ VkSemaphoreGetWin32HandleInfoKHR handle_info = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
+ .semaphore = sync_vk->wait,
+ .handleType = einfo.handleTypes,
+ };
+
+ VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+ &sync->wait_handle.handle));
+
+ handle_info.semaphore = sync_vk->signal;
+ VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+ &sync->signal_handle.handle));
+ }
+#endif
+
+ return sync;
+
+error:
+ vk_sync_destroy(gpu, sync);
+ return NULL;
+}
+
+void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore)
+{
+ VkSemaphore sem = *semaphore;
+ if (!sem)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC);
+ *semaphore = VK_NULL_HANDLE;
+}
+
+VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_assert(PL_ISPOT(params->export_handle));
+ if ((params->export_handle & gpu->export_caps.sync) != params->export_handle) {
+ PL_ERR(gpu, "Invalid handle type 0x%"PRIx64" specified for "
+ "`pl_vulkan_sem_create`!", (uint64_t) params->export_handle);
+ return VK_NULL_HANDLE;
+ }
+
+ switch (params->export_handle) {
+ case PL_HANDLE_FD:
+ params->out_handle->fd = -1;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ params->out_handle->handle = NULL;
+ break;
+ case PL_HANDLE_DMA_BUF:
+ case PL_HANDLE_HOST_PTR:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ pl_unreachable();
+ }
+
+ const VkExportSemaphoreCreateInfoKHR einfo = {
+ .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR,
+ .handleTypes = vk_sync_handle_type(params->export_handle),
+ };
+
+ const VkSemaphoreTypeCreateInfo stinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+ .pNext = params->export_handle ? &einfo : NULL,
+ .semaphoreType = params->type,
+ .initialValue = params->initial_value,
+ };
+
+ const VkSemaphoreCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ .pNext = &stinfo,
+ };
+
+ VkSemaphore sem = VK_NULL_HANDLE;
+ VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sem));
+ PL_VK_NAME(SEMAPHORE, sem, PL_DEF(params->debug_tag, "pl_vulkan_sem"));
+
+#ifdef PL_HAVE_UNIX
+ if (params->export_handle == PL_HANDLE_FD) {
+ VkSemaphoreGetFdInfoKHR finfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+ .handleType = einfo.handleTypes,
+ .semaphore = sem,
+ };
+
+ VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &params->out_handle->fd));
+ }
+#endif
+
+#ifdef PL_HAVE_WIN32
+ if (params->export_handle == PL_HANDLE_WIN32 ||
+ params->export_handle == PL_HANDLE_WIN32_KMT)
+ {
+ VkSemaphoreGetWin32HandleInfoKHR handle_info = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
+ .handleType = einfo.handleTypes,
+ .semaphore = sem,
+ };
+
+ VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+ &params->out_handle->handle));
+ }
+#endif
+
+ return sem;
+
+error:
+#ifdef PL_HAVE_UNIX
+ if (params->export_handle == PL_HANDLE_FD) {
+ if (params->out_handle->fd > -1)
+ close(params->out_handle->fd);
+ }
+#endif
+#ifdef PL_HAVE_WIN32
+ if (params->export_handle == PL_HANDLE_WIN32) {
+ if (params->out_handle->handle != NULL)
+ CloseHandle(params->out_handle->handle);
+ }
+ // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+#endif
+ vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC);
+ return VK_NULL_HANDLE;
+}
+
+static void vk_gpu_flush(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ CMD_SUBMIT(NULL);
+ vk_rotate_queues(vk);
+ vk_malloc_garbage_collect(vk->ma);
+}
+
+static void vk_gpu_finish(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ CMD_SUBMIT(NULL);
+ vk_wait_idle(vk);
+}
+
+static bool vk_gpu_is_failed(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ return vk->failed;
+}
+
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_mutex_lock(&p->recording);
+ struct vk_cmd *cmd = p->cmd;
+ p->cmd = NULL;
+ pl_mutex_unlock(&p->recording);
+
+ struct vk_cmdpool *pool = vk->pool_graphics;
+ if (!cmd || cmd->pool != pool) {
+ vk_cmd_submit(&cmd);
+ cmd = vk_cmd_begin(pool, NULL);
+ }
+
+ return cmd;
+}
+
+void pl_vk_print_heap(pl_gpu gpu, enum pl_log_level lev)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ vk_malloc_print_stats(vk->ma, lev);
+}
+
+static const struct pl_gpu_fns pl_fns_vk = {
+ .destroy = vk_gpu_destroy,
+ .tex_create = vk_tex_create,
+ .tex_destroy = vk_tex_deref,
+ .tex_invalidate = vk_tex_invalidate,
+ .tex_clear_ex = vk_tex_clear_ex,
+ .tex_blit = vk_tex_blit,
+ .tex_upload = vk_tex_upload,
+ .tex_download = vk_tex_download,
+ .tex_poll = vk_tex_poll,
+ .tex_export = vk_tex_export,
+ .buf_create = vk_buf_create,
+ .buf_destroy = vk_buf_deref,
+ .buf_write = vk_buf_write,
+ .buf_read = vk_buf_read,
+ .buf_copy = vk_buf_copy,
+ .buf_export = vk_buf_export,
+ .buf_poll = vk_buf_poll,
+ .desc_namespace = vk_desc_namespace,
+ .pass_create = vk_pass_create,
+ .pass_destroy = vk_pass_destroy,
+ .pass_run = vk_pass_run,
+ .sync_create = vk_sync_create,
+ .sync_destroy = vk_sync_deref,
+ .timer_create = vk_timer_create,
+ .timer_destroy = vk_timer_destroy,
+ .timer_query = vk_timer_query,
+ .gpu_flush = vk_gpu_flush,
+ .gpu_finish = vk_gpu_finish,
+ .gpu_is_failed = vk_gpu_is_failed,
+};
diff --git a/src/vulkan/gpu.h b/src/vulkan/gpu.h
new file mode 100644
index 0000000..041de13
--- /dev/null
+++ b/src/vulkan/gpu.h
@@ -0,0 +1,175 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "command.h"
+#include "formats.h"
+#include "malloc.h"
+#include "utils.h"
+
+#include "../gpu.h"
+#include "../glsl/spirv.h"
+#include "../pl_thread.h"
+
+pl_gpu pl_gpu_create_vk(struct vk_ctx *vk);
+
+// This function takes the current graphics command and steals it from the
+// GPU, so the caller can do custom vk_cmd_ calls on it. The caller should
+// submit it as well.
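+//
+// Rough usage sketch (for illustration only):
+//
+//   struct vk_cmd *cmd = pl_vk_steal_cmd(gpu);
+//   /* ... custom vk_cmd_* recording ... */
+//   vk_cmd_submit(&cmd);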
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu);
+
+// Print memory usage statistics
+void pl_vk_print_heap(pl_gpu, enum pl_log_level);
+
+// --- pl_gpu internal structs and helpers
+
+struct pl_fmt_vk {
+ const struct vk_format *vk_fmt;
+ bool blit_emulated;
+};
+
+enum queue_type {
+ GRAPHICS,
+ COMPUTE,
+ TRANSFER,
+ ANY,
+};
+
+struct pl_vk {
+ struct pl_gpu_fns impl;
+ struct vk_ctx *vk;
+ pl_spirv spirv;
+
+ // Some additional cached device limits and features checks
+ uint32_t max_push_descriptors;
+ size_t min_texel_alignment;
+
+ // The "currently recording" command. This will be queued and replaced by
+ // a new command every time we need to "switch" between queue families.
+ pl_mutex recording;
+ struct vk_cmd *cmd;
+ pl_timer cmd_timer;
+
+ // Array of VkSamplers for every combination of sample/address modes
+ VkSampler samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT];
+
+ // To avoid spamming warnings
+ bool warned_modless;
+};
+
+struct vk_cmd *_begin_cmd(pl_gpu, enum queue_type, const char *label, pl_timer);
+bool _end_cmd(pl_gpu, struct vk_cmd **, bool submit);
+
+#define CMD_BEGIN(type) _begin_cmd(gpu, type, __func__, NULL)
+#define CMD_BEGIN_TIMED(type, timer) _begin_cmd(gpu, type, __func__, timer)
+#define CMD_FINISH(cmd) _end_cmd(gpu, cmd, false)
+#define CMD_SUBMIT(cmd) _end_cmd(gpu, cmd, true)
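+//
+// Illustrative sketch of the intended pattern (not taken from any one caller):
+//
+//   struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+//   if (!cmd)
+//       return false;
+//   /* ... record into cmd->buf ... */
+//   return CMD_FINISH(&cmd); // or CMD_SUBMIT(&cmd) to flush immediately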
+
+// Helper to fire a callback the next time the `pl_gpu` is in an idle state
+//
+// Use this instead of `vk_dev_callback` when you need to clean up after
+// resources that might possibly still be in use by the `pl_gpu` at the time of
+// creating the callback.
+void vk_gpu_idle_callback(pl_gpu, vk_cb, const void *priv, const void *arg);
+
+struct pl_tex_vk {
+ pl_rc_t rc;
+ bool external_img;
+ enum queue_type transfer_queue;
+ VkImageType type;
+ VkImage img;
+ VkImageAspectFlags aspect;
+ struct vk_memslice mem;
+ // cached properties
+ VkFormat img_fmt;
+ VkImageUsageFlags usage_flags;
+ // for sampling
+ VkImageView view;
+ // for rendering
+ VkFramebuffer framebuffer;
+ // for vk_tex_upload/download fallback code
+ pl_fmt texel_fmt;
+ // for planar textures (as a convenience)
+ int num_planes;
+ struct pl_tex_vk *planes[4];
+
+ // synchronization and current state (planes only)
+ struct vk_sem sem;
+ VkImageLayout layout;
+    PL_ARRAY(pl_vulkan_sem) ext_deps; // external semaphores, not owned by the pl_tex
+ pl_sync ext_sync; // indicates an exported image
+ uint32_t qf; // last queue family to access this texture (for barriers)
+ bool may_invalidate;
+ bool held;
+};
+
+pl_tex vk_tex_create(pl_gpu, const struct pl_tex_params *);
+void vk_tex_deref(pl_gpu, pl_tex);
+void vk_tex_invalidate(pl_gpu, pl_tex);
+void vk_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color);
+void vk_tex_blit(pl_gpu, const struct pl_tex_blit_params *);
+bool vk_tex_upload(pl_gpu, const struct pl_tex_transfer_params *);
+bool vk_tex_download(pl_gpu, const struct pl_tex_transfer_params *);
+bool vk_tex_poll(pl_gpu, pl_tex, uint64_t timeout);
+bool vk_tex_export(pl_gpu, pl_tex, pl_sync);
+void vk_tex_barrier(pl_gpu, struct vk_cmd *, pl_tex, VkPipelineStageFlags2,
+ VkAccessFlags2, VkImageLayout, uint32_t qf);
+
+struct pl_buf_vk {
+ pl_rc_t rc;
+ struct vk_memslice mem;
+ enum queue_type update_queue;
+ VkBufferView view; // for texel buffers
+
+ // synchronization and current state
+ struct vk_sem sem;
+ bool exported;
+ bool needs_flush;
+};
+
+pl_buf vk_buf_create(pl_gpu, const struct pl_buf_params *);
+void vk_buf_deref(pl_gpu, pl_buf);
+void vk_buf_write(pl_gpu, pl_buf, size_t offset, const void *src, size_t size);
+bool vk_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size);
+void vk_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size);
+bool vk_buf_export(pl_gpu, pl_buf);
+bool vk_buf_poll(pl_gpu, pl_buf, uint64_t timeout);
+
+// Helper to ease buffer barrier creation. (`offset` is relative to pl_buf)
+void vk_buf_barrier(pl_gpu, struct vk_cmd *, pl_buf, VkPipelineStageFlags2,
+ VkAccessFlags2, size_t offset, size_t size, bool export);
+
+// Flush visible writes to a buffer made by the API
+void vk_buf_flush(pl_gpu, struct vk_cmd *, pl_buf, size_t offset, size_t size);
+
+struct pl_pass_vk;
+
+int vk_desc_namespace(pl_gpu, enum pl_desc_type);
+pl_pass vk_pass_create(pl_gpu, const struct pl_pass_params *);
+void vk_pass_destroy(pl_gpu, pl_pass);
+void vk_pass_run(pl_gpu, const struct pl_pass_run_params *);
+
+struct pl_sync_vk {
+ pl_rc_t rc;
+ VkSemaphore wait;
+ VkSemaphore signal;
+};
+
+void vk_sync_deref(pl_gpu, pl_sync);
diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c
new file mode 100644
index 0000000..2f317bc
--- /dev/null
+++ b/src/vulkan/gpu_buf.c
@@ -0,0 +1,470 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+ VkPipelineStageFlags2 stage, VkAccessFlags2 access,
+ size_t offset, size_t size, bool export)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers
+ pl_rc_ref(&buf_vk->rc);
+
+ bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped ||
+ buf->params.import_handle == PL_HANDLE_HOST_PTR;
+ bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent;
+ if (needs_flush && noncoherent) {
+ VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = buf_vk->mem.vkmem,
+ .offset = buf_vk->mem.map_offset,
+ .size = buf_vk->mem.map_size,
+ }));
+
+ // Just ignore errors, not much we can do about them other than
+ // logging them and moving on...
+ error: ;
+ }
+
+ struct vk_sync_scope last;
+ last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export);
+
+ // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE
+ // buffers require transitioning to/from the concrete QF index
+ uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf;
+ uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
+ uint32_t dst_qf = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
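+    // (e.g. exporting an EXCLUSIVE buffer on a single-pool setup transfers
+    // ownership from `cmd->pool->qf` to VK_QUEUE_FAMILY_EXTERNAL_KHR, and
+    // re-acquiring a previously exported buffer does the reverse)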
+
+ if (last.access || src_qf != dst_qf) {
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .bufferMemoryBarrierCount = 1,
+ .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = last.stage,
+ .srcAccessMask = last.access,
+ .dstStageMask = stage,
+ .dstAccessMask = access,
+ .srcQueueFamilyIndex = src_qf,
+ .dstQueueFamilyIndex = dst_qf,
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset + offset,
+ .size = size,
+ },
+ });
+ }
+
+ buf_vk->needs_flush = false;
+ buf_vk->exported = export;
+ vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf);
+}
+
+void vk_buf_deref(pl_gpu gpu, pl_buf buf)
+{
+ if (!buf)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ if (pl_rc_deref(&buf_vk->rc)) {
+ vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC);
+ vk_malloc_free(vk->ma, &buf_vk->mem);
+ pl_free((void *) buf);
+ }
+}
+
+pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk);
+ buf->params = *params;
+ buf->params.initial_data = NULL;
+
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_rc_init(&buf_vk->rc);
+
+ struct vk_malloc_params mparams = {
+ .reqs = {
+ .size = PL_ALIGN2(params->size, 4), // for vk_buf_write
+ .memoryTypeBits = UINT32_MAX,
+ .alignment = 1,
+ },
+ // these are always set, because `vk_buf_copy` can always be used
+ .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+ VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ .export_handle = params->export_handle,
+ .import_handle = params->import_handle,
+ .shared_mem = params->shared_mem,
+ .debug_tag = params->debug_tag,
+ };
+
+ // Mandatory/optimal buffer offset alignment
+ VkDeviceSize *align = &mparams.reqs.alignment;
+ VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment;
+
+ // Try and align all buffers to the minimum texel alignment, to make sure
+ // tex_upload/tex_download always gets aligned buffer copies if possible
+ extra_align = pl_lcm(extra_align, p->min_texel_alignment);
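+    // (Illustrative example: with an optimal copy offset alignment of 4 and
+    // a minimum texel alignment of 16, this combines to lcm(4, 16) = 16.)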
+
+ enum pl_buf_mem_type mem_type = params->memory_type;
+ bool is_texel = false;
+
+ if (params->uniform) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+ *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment);
+ mem_type = PL_BUF_MEM_DEVICE;
+ if (params->format) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
+ is_texel = true;
+ }
+ }
+
+ if (params->storable) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+ *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment);
+ buf_vk->update_queue = COMPUTE;
+ mem_type = PL_BUF_MEM_DEVICE;
+ if (params->format) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
+ is_texel = true;
+ }
+ }
+
+ if (is_texel) {
+ *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment);
+ *align = pl_lcm(*align, params->format->texel_size);
+ }
+
+ if (params->drawable) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
+ VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
+ mem_type = PL_BUF_MEM_DEVICE;
+ }
+
+ if (params->host_writable || params->initial_data) {
+ // Buffers should be written using mapped memory if possible
+ mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+        // Use the transfer queue for updates on very large buffers (> 1 MB)
+ if (params->size > 1024*1024)
+ buf_vk->update_queue = TRANSFER;
+ }
+
+ if (params->host_mapped || params->host_readable) {
+ mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+
+ if (params->size > 1024) {
+            // Prefer cached memory for large buffers (> 1 kB) which may be
+            // read from, because uncached reads are extremely slow
+ mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+ }
+ }
+
+ switch (mem_type) {
+ case PL_BUF_MEM_AUTO:
+ // We generally prefer VRAM since it's faster than RAM, but any number
+ // of other requirements could potentially exclude it, so just mark it
+ // as optimal by default.
+ if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
+ mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+ break;
+ case PL_BUF_MEM_DEVICE:
+ // Force device local memory.
+ mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+ break;
+ case PL_BUF_MEM_HOST:
+ // This isn't a true guarantee, but actually trying to restrict the
+ // device-local bit locks out all memory heaps on iGPUs. Requiring
+ // the memory be host-mapped is the easiest compromise.
+ mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+ break;
+ case PL_BUF_MEM_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ if (params->import_handle) {
+ size_t offset = params->shared_mem.offset;
+ if (PL_ALIGN(offset, *align) != offset) {
+ PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment "
+ "requirement of enabled usage flags (%zu)!",
+ offset, (size_t) *align);
+ goto error;
+ }
+ } else {
+ *align = pl_lcm(*align, extra_align);
+ }
+
+ if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams))
+ goto error;
+
+ if (params->host_mapped)
+ buf->data = buf_vk->mem.data;
+
+ if (params->export_handle) {
+ buf->shared_mem = buf_vk->mem.shared_mem;
+ buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR;
+ buf_vk->exported = true;
+ }
+
+ if (is_texel) {
+ struct pl_fmt_vk *fmtp = PL_PRIV(params->format);
+ VkBufferViewCreateInfo vinfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
+ .buffer = buf_vk->mem.buf,
+ .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt),
+ .offset = buf_vk->mem.offset,
+ .range = buf_vk->mem.size,
+ };
+
+ VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view));
+ PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel"));
+ }
+
+ if (params->initial_data)
+ vk_buf_write(gpu, buf, 0, params->initial_data, params->size);
+
+ return buf;
+
+error:
+ vk_buf_deref(gpu, buf);
+ return NULL;
+}
+
+static void invalidate_buf(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ if (buf_vk->mem.data && !buf_vk->mem.coherent) {
+ VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = buf_vk->mem.vkmem,
+ .offset = buf_vk->mem.map_offset,
+ .size = buf_vk->mem.map_size,
+ }));
+ }
+
+ // Ignore errors (after logging), nothing useful we can do anyway
+error: ;
+ vk_buf_deref(gpu, buf);
+}
+
+void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+ size_t offset, size_t size)
+{
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // We need to perform a flush if the host is capable of reading back from
+ // the buffer, or if we intend to overwrite it using mapped memory
+ bool can_read = buf->params.host_readable;
+ bool can_write = buf_vk->mem.data && buf->params.host_writable;
+ if (buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR)
+ can_read = can_write = true;
+
+ if (!can_read && !can_write)
+ return;
+
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .bufferMemoryBarrierCount = 1,
+ .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = buf_vk->sem.write.stage,
+ .srcAccessMask = buf_vk->sem.write.access,
+ .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
+ .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0)
+ | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0),
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset + offset,
+ .size = size,
+ },
+ });
+
+ // We need to hold on to the buffer until this barrier completes
+ vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf);
+ pl_rc_ref(&buf_vk->rc);
+}
+
+bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // Opportunistically check if we can re-use this buffer without flush
+ vk_poll_commands(vk, 0);
+ if (pl_rc_count(&buf_vk->rc) == 1)
+ return false;
+
+    // Otherwise, we're forced to submit any queued commands so that the
+ // user is guaranteed to see progress eventually, even if they call
+ // this in a tight loop
+ CMD_SUBMIT(NULL);
+ vk_poll_commands(vk, timeout);
+
+ return pl_rc_count(&buf_vk->rc) > 1;
+}
+
+void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
+ const void *data, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // For host-mapped buffers, we can just directly memcpy the buffer contents.
+ // Otherwise, we can update the buffer from the GPU using a command buffer.
+ if (buf_vk->mem.data) {
+ // ensure no queued operations
+ while (vk_buf_poll(gpu, buf, UINT64_MAX))
+ ; // do nothing
+
+ uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset;
+ memcpy((void *) addr, data, size);
+ buf_vk->needs_flush = true;
+ } else {
+ struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed updating buffer!");
+ return;
+ }
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);
+
+ // Vulkan requires `size` to be a multiple of 4, so we need to make
+ // sure to handle the end separately if the original data is not
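+        // (Illustrative example: for size = 10, size_base = 8 bytes go
+        // through the chunked copy loop below, and the trailing 2 bytes are
+        // padded into a zero-filled 4-byte tail.)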
+ const size_t max_transfer = 64 * 1024;
+ size_t size_rem = size % 4;
+ size_t size_base = size - size_rem;
+ VkDeviceSize buf_offset = buf_vk->mem.offset + offset;
+
+ if (size_base > max_transfer) {
+ PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload "
+ "large buffer. Consider using buffer-buffer transfers "
+ "instead!");
+ }
+
+ for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) {
+            vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf,
+                                buf_offset + xfer,
+                                PL_MIN(size_base - xfer, max_transfer),
+                                (void *) ((uint8_t *) data + xfer));
+ }
+
+ if (size_rem) {
+ uint8_t tail[4] = {0};
+            memcpy(tail, (uint8_t *) data + size_base, size_rem);
+ vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base,
+ sizeof(tail), tail);
+ }
+
+ pl_assert(!buf->params.host_readable); // no flush needed due to this
+ CMD_FINISH(&cmd);
+ }
+}
+
+bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_assert(buf_vk->mem.data);
+
+ if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) {
+ // ensure no more queued writes
+ VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &buf_vk->sem.write.sync.sem,
+ .pValues = &buf_vk->sem.write.sync.value,
+ }, UINT64_MAX));
+
+ // process callbacks
+ vk_poll_commands(vk, 0);
+ }
+
+ uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset;
+ memcpy(dest, (void *) addr, size);
+ return true;
+
+error:
+ return false;
+}
+
+void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *dst_vk = PL_PRIV(dst);
+ struct pl_buf_vk *src_vk = PL_PRIV(src);
+
+ struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed copying buffer!");
+ return;
+ }
+
+ vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false);
+ vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false);
+
+ VkBufferCopy region = {
+ .srcOffset = src_vk->mem.offset + src_offset,
+ .dstOffset = dst_vk->mem.offset + dst_offset,
+ .size = size,
+ };
+
+ vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf,
+ 1, &region);
+
+ vk_buf_flush(gpu, cmd, dst, dst_offset, size);
+ CMD_FINISH(&cmd);
+}
+
+bool vk_buf_export(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ if (buf_vk->exported)
+ return true;
+
+ struct vk_cmd *cmd = CMD_BEGIN(ANY);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed exporting buffer!");
+ return false;
+ }
+
+ // For the queue family ownership transfer, we can ignore all pipeline
+    // stages, since the required synchronization happens via
+    // fences/semaphores anyway
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0,
+ buf->params.size, true);
+
+ return CMD_SUBMIT(&cmd);
+}
diff --git a/src/vulkan/gpu_pass.c b/src/vulkan/gpu_pass.c
new file mode 100644
index 0000000..5ffe77d
--- /dev/null
+++ b/src/vulkan/gpu_pass.c
@@ -0,0 +1,964 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "cache.h"
+#include "glsl/spirv.h"
+
+// For pl_pass.priv
+struct pl_pass_vk {
+ // Pipeline / render pass
+ VkPipeline base;
+ VkPipeline pipe;
+ VkPipelineLayout pipeLayout;
+ VkRenderPass renderPass;
+ // Descriptor set (bindings)
+ bool use_pushd;
+ VkDescriptorSetLayout dsLayout;
+ VkDescriptorPool dsPool;
+ // To keep track of which descriptor sets are and aren't available, we
+ // allocate a fixed number and use a bitmask of all available sets.
+ VkDescriptorSet dss[16];
+ uint16_t dmask;
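+    // (Example: bit `i` of `dmask` set means `dss[i]` is free; `vk_pass_run`
+    // clears the bit when it picks a set, and a command callback restores it
+    // once the GPU has finished using that set.)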
+
+ // For recompilation
+ VkVertexInputAttributeDescription *attrs;
+ VkPipelineCache cache;
+ VkShaderModule vert;
+ VkShaderModule shader;
+
+ // For updating
+ VkWriteDescriptorSet *dswrite;
+ VkDescriptorImageInfo *dsiinfo;
+ VkDescriptorBufferInfo *dsbinfo;
+ VkSpecializationInfo specInfo;
+ size_t spec_size;
+};
+
+int vk_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+ return 0;
+}
+
+static void pass_destroy_cb(pl_gpu gpu, pl_pass pass)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+
+ vk->DestroyPipeline(vk->dev, pass_vk->pipe, PL_VK_ALLOC);
+ vk->DestroyPipeline(vk->dev, pass_vk->base, PL_VK_ALLOC);
+ vk->DestroyRenderPass(vk->dev, pass_vk->renderPass, PL_VK_ALLOC);
+ vk->DestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, PL_VK_ALLOC);
+ vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC);
+ vk->DestroyDescriptorPool(vk->dev, pass_vk->dsPool, PL_VK_ALLOC);
+ vk->DestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, PL_VK_ALLOC);
+ vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC);
+ vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC);
+
+ pl_free((void *) pass);
+}
+
+void vk_pass_destroy(pl_gpu gpu, pl_pass pass)
+{
+ vk_gpu_idle_callback(gpu, (vk_cb) pass_destroy_cb, gpu, pass);
+}
+
+static const VkDescriptorType dsType[] = {
+ [PL_DESC_SAMPLED_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+ [PL_DESC_STORAGE_IMG] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ [PL_DESC_BUF_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+ [PL_DESC_BUF_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ [PL_DESC_BUF_TEXEL_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+ [PL_DESC_BUF_TEXEL_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER,
+};
+
+static VkResult vk_compile_glsl(pl_gpu gpu, void *alloc,
+ enum glsl_shader_stage stage,
+ const char *shader,
+ pl_cache_obj *out_spirv)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ pl_cache cache = pl_gpu_cache(gpu);
+ uint64_t key = CACHE_KEY_SPIRV;
+    if (cache) { // skip computing the key if there is no cache
+ pl_hash_merge(&key, p->spirv->signature);
+ pl_hash_merge(&key, pl_str0_hash(shader));
+ out_spirv->key = key;
+ if (pl_cache_get(cache, out_spirv)) {
+ PL_DEBUG(gpu, "Re-using cached SPIR-V object 0x%"PRIx64, key);
+ return VK_SUCCESS;
+ }
+ }
+
+ pl_clock_t start = pl_clock_now();
+ pl_str spirv = pl_spirv_compile_glsl(p->spirv, alloc, gpu->glsl, stage, shader);
+ pl_log_cpu_time(gpu->log, start, pl_clock_now(), "translating SPIR-V");
+ out_spirv->data = spirv.buf;
+ out_spirv->size = spirv.len;
+ out_spirv->free = pl_free;
+ return spirv.len ? VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED;
+}
+
+static const VkShaderStageFlags stageFlags[] = {
+ [PL_PASS_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT |
+ VK_SHADER_STAGE_VERTEX_BIT,
+ [PL_PASS_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT,
+};
+
+static void destroy_pipeline(struct vk_ctx *vk, void *pipeline)
+{
+ vk->DestroyPipeline(vk->dev, vk_unwrap_handle(pipeline), PL_VK_ALLOC);
+}
+
+static VkResult vk_recreate_pipelines(struct vk_ctx *vk, pl_pass pass,
+ bool derivable, VkPipeline base,
+ VkPipeline *out_pipe)
+{
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+ const struct pl_pass_params *params = &pass->params;
+
+ // The old pipeline might still be in use, so we have to destroy it
+ // asynchronously with a device idle callback
+ if (*out_pipe) {
+ // We don't need to use `vk_gpu_idle_callback` because the only command
+ // that can access a VkPipeline, `vk_pass_run`, always flushes `p->cmd`.
+ vk_dev_callback(vk, (vk_cb) destroy_pipeline, vk, vk_wrap_handle(*out_pipe));
+ *out_pipe = VK_NULL_HANDLE;
+ }
+
+ VkPipelineCreateFlags flags = 0;
+ if (derivable)
+ flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT;
+ if (base)
+ flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT;
+
+ const VkSpecializationInfo *specInfo = &pass_vk->specInfo;
+ if (!specInfo->dataSize)
+ specInfo = NULL;
+
+ switch (params->type) {
+ case PL_PASS_RASTER: {
+ static const VkBlendFactor blendFactors[] = {
+ [PL_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO,
+ [PL_BLEND_ONE] = VK_BLEND_FACTOR_ONE,
+ [PL_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA,
+ [PL_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
+ };
+
+ VkPipelineColorBlendAttachmentState blendState = {
+ .colorBlendOp = VK_BLEND_OP_ADD,
+ .alphaBlendOp = VK_BLEND_OP_ADD,
+ .colorWriteMask = VK_COLOR_COMPONENT_R_BIT |
+ VK_COLOR_COMPONENT_G_BIT |
+ VK_COLOR_COMPONENT_B_BIT |
+ VK_COLOR_COMPONENT_A_BIT,
+ };
+
+ const struct pl_blend_params *blend = params->blend_params;
+ if (blend) {
+ blendState.blendEnable = true;
+ blendState.srcColorBlendFactor = blendFactors[blend->src_rgb];
+ blendState.dstColorBlendFactor = blendFactors[blend->dst_rgb];
+ blendState.srcAlphaBlendFactor = blendFactors[blend->src_alpha];
+ blendState.dstAlphaBlendFactor = blendFactors[blend->dst_alpha];
+ }
+
+ static const VkPrimitiveTopology topologies[PL_PRIM_TYPE_COUNT] = {
+ [PL_PRIM_TRIANGLE_LIST] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+ [PL_PRIM_TRIANGLE_STRIP] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
+ };
+
+ VkGraphicsPipelineCreateInfo cinfo = {
+ .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+ .flags = flags,
+ .stageCount = 2,
+ .pStages = (VkPipelineShaderStageCreateInfo[]) {
+ {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_VERTEX_BIT,
+ .module = pass_vk->vert,
+ .pName = "main",
+ }, {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
+ .module = pass_vk->shader,
+ .pName = "main",
+ .pSpecializationInfo = specInfo,
+ }
+ },
+ .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+ .vertexBindingDescriptionCount = 1,
+ .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) {
+ .binding = 0,
+ .stride = params->vertex_stride,
+ .inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
+ },
+ .vertexAttributeDescriptionCount = params->num_vertex_attribs,
+ .pVertexAttributeDescriptions = pass_vk->attrs,
+ },
+ .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+ .topology = topologies[params->vertex_type],
+ },
+ .pViewportState = &(VkPipelineViewportStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+ .viewportCount = 1,
+ .scissorCount = 1,
+ },
+ .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+ .polygonMode = VK_POLYGON_MODE_FILL,
+ .cullMode = VK_CULL_MODE_NONE,
+ .lineWidth = 1.0f,
+ },
+ .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+ .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
+ },
+ .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = &blendState,
+ },
+ .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+ .dynamicStateCount = 2,
+ .pDynamicStates = (VkDynamicState[]){
+ VK_DYNAMIC_STATE_VIEWPORT,
+ VK_DYNAMIC_STATE_SCISSOR,
+ },
+ },
+ .layout = pass_vk->pipeLayout,
+ .renderPass = pass_vk->renderPass,
+ .basePipelineHandle = base,
+ .basePipelineIndex = -1,
+ };
+
+ return vk->CreateGraphicsPipelines(vk->dev, pass_vk->cache, 1, &cinfo,
+ PL_VK_ALLOC, out_pipe);
+ }
+
+ case PL_PASS_COMPUTE: {
+ VkComputePipelineCreateInfo cinfo = {
+ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+ .flags = flags,
+ .stage = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+ .module = pass_vk->shader,
+ .pName = "main",
+ .pSpecializationInfo = specInfo,
+ },
+ .layout = pass_vk->pipeLayout,
+ .basePipelineHandle = base,
+ .basePipelineIndex = -1,
+ };
+
+ return vk->CreateComputePipelines(vk->dev, pass_vk->cache, 1, &cinfo,
+ PL_VK_ALLOC, out_pipe);
+ }
+
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+pl_pass vk_pass_create(pl_gpu gpu, const struct pl_pass_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ bool success = false;
+
+ struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_vk);
+ pass->params = pl_pass_params_copy(pass, params);
+
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+    pass_vk->dmask = -1; // all descriptor sets available
+
+ // temporary allocations
+ void *tmp = pl_tmp(NULL);
+
+ int num_desc = params->num_descriptors;
+ if (!num_desc)
+ goto no_descriptors;
+ if (num_desc > vk->props.limits.maxPerStageResources) {
+ PL_ERR(gpu, "Pass with %d descriptors exceeds the maximum number of "
+ "per-stage resources %" PRIu32"!",
+ num_desc, vk->props.limits.maxPerStageResources);
+ goto error;
+ }
+
+ pass_vk->dswrite = pl_calloc(pass, num_desc, sizeof(VkWriteDescriptorSet));
+ pass_vk->dsiinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorImageInfo));
+ pass_vk->dsbinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorBufferInfo));
+
+#define NUM_DS (PL_ARRAY_SIZE(pass_vk->dss))
+
+ int dsSize[PL_DESC_TYPE_COUNT] = {0};
+ VkDescriptorSetLayoutBinding *bindings = pl_calloc_ptr(tmp, num_desc, bindings);
+
+ uint32_t max_tex = vk->props.limits.maxPerStageDescriptorSampledImages,
+ max_img = vk->props.limits.maxPerStageDescriptorStorageImages,
+ max_ubo = vk->props.limits.maxPerStageDescriptorUniformBuffers,
+ max_ssbo = vk->props.limits.maxPerStageDescriptorStorageBuffers;
+
+ uint32_t *dsLimits[PL_DESC_TYPE_COUNT] = {
+ [PL_DESC_SAMPLED_TEX] = &max_tex,
+ [PL_DESC_STORAGE_IMG] = &max_img,
+ [PL_DESC_BUF_UNIFORM] = &max_ubo,
+ [PL_DESC_BUF_STORAGE] = &max_ssbo,
+ [PL_DESC_BUF_TEXEL_UNIFORM] = &max_tex,
+ [PL_DESC_BUF_TEXEL_STORAGE] = &max_img,
+ };
+
+ for (int i = 0; i < num_desc; i++) {
+ struct pl_desc *desc = &params->descriptors[i];
+ if (!(*dsLimits[desc->type])--) {
+ PL_ERR(gpu, "Pass exceeds the maximum number of per-stage "
+ "descriptors of type %u!", (unsigned) desc->type);
+ goto error;
+ }
+
+ dsSize[desc->type]++;
+ bindings[i] = (VkDescriptorSetLayoutBinding) {
+ .binding = desc->binding,
+ .descriptorType = dsType[desc->type],
+ .descriptorCount = 1,
+ .stageFlags = stageFlags[params->type],
+ };
+ }
+
+ VkDescriptorSetLayoutCreateInfo dinfo = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+ .pBindings = bindings,
+ .bindingCount = num_desc,
+ };
+
+ if (p->max_push_descriptors && num_desc <= p->max_push_descriptors) {
+ dinfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
+ pass_vk->use_pushd = true;
+ } else if (p->max_push_descriptors) {
+ PL_INFO(gpu, "Pass with %d descriptors exceeds the maximum push "
+ "descriptor count (%d). Falling back to descriptor sets!",
+ num_desc, p->max_push_descriptors);
+ }
+
+ VK(vk->CreateDescriptorSetLayout(vk->dev, &dinfo, PL_VK_ALLOC,
+ &pass_vk->dsLayout));
+
+ if (!pass_vk->use_pushd) {
+ PL_ARRAY(VkDescriptorPoolSize) dsPoolSizes = {0};
+
+ for (enum pl_desc_type t = 0; t < PL_DESC_TYPE_COUNT; t++) {
+ if (dsSize[t] > 0) {
+ PL_ARRAY_APPEND(tmp, dsPoolSizes, (VkDescriptorPoolSize) {
+ .type = dsType[t],
+ .descriptorCount = dsSize[t] * NUM_DS,
+ });
+ }
+ }
+
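+        // (Illustrative example: a pass using 3 sampled textures requests
+        // 3 * NUM_DS combined image sampler descriptors here, enough to
+        // populate all NUM_DS sets allocated below.)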
+ if (dsPoolSizes.num) {
+ VkDescriptorPoolCreateInfo pinfo = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+ .maxSets = NUM_DS,
+ .pPoolSizes = dsPoolSizes.elem,
+ .poolSizeCount = dsPoolSizes.num,
+ };
+
+ VK(vk->CreateDescriptorPool(vk->dev, &pinfo, PL_VK_ALLOC, &pass_vk->dsPool));
+
+ VkDescriptorSetLayout layouts[NUM_DS];
+ for (int i = 0; i < NUM_DS; i++)
+ layouts[i] = pass_vk->dsLayout;
+
+ VkDescriptorSetAllocateInfo ainfo = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+ .descriptorPool = pass_vk->dsPool,
+ .descriptorSetCount = NUM_DS,
+ .pSetLayouts = layouts,
+ };
+
+ VK(vk->AllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss));
+ }
+ }
+
+no_descriptors: ;
+
+ bool has_spec = params->num_constants;
+ if (has_spec) {
+ PL_ARRAY(VkSpecializationMapEntry) entries = {0};
+ PL_ARRAY_RESIZE(pass, entries, params->num_constants);
+ size_t spec_size = 0;
+
+ for (int i = 0; i < params->num_constants; i++) {
+ const struct pl_constant *con = &params->constants[i];
+ size_t con_size = pl_var_type_size(con->type);
+ entries.elem[i] = (VkSpecializationMapEntry) {
+ .constantID = con->id,
+ .offset = con->offset,
+ .size = con_size,
+ };
+
+ size_t req_size = con->offset + con_size;
+ spec_size = PL_MAX(spec_size, req_size);
+ }
+
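+        // (Illustrative example: a single 4-byte constant at offset 4 yields
+        // spec_size = 8, i.e. the size of the packed constant data blob
+        // expected in `constant_data`.)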
+ pass_vk->spec_size = spec_size;
+ pass_vk->specInfo = (VkSpecializationInfo) {
+ .mapEntryCount = params->num_constants,
+ .pMapEntries = entries.elem,
+ };
+
+ if (params->constant_data) {
+ pass_vk->specInfo.pData = pl_memdup(pass, params->constant_data, spec_size);
+ pass_vk->specInfo.dataSize = spec_size;
+ }
+ }
+
+ VkPipelineLayoutCreateInfo linfo = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = num_desc ? 1 : 0,
+ .pSetLayouts = &pass_vk->dsLayout,
+ .pushConstantRangeCount = params->push_constants_size ? 1 : 0,
+ .pPushConstantRanges = &(VkPushConstantRange){
+ .stageFlags = stageFlags[params->type],
+ .offset = 0,
+ .size = params->push_constants_size,
+ },
+ };
+
+ VK(vk->CreatePipelineLayout(vk->dev, &linfo, PL_VK_ALLOC,
+ &pass_vk->pipeLayout));
+
+ pl_cache_obj vert = {0}, frag = {0}, comp = {0};
+ switch (params->type) {
+ case PL_PASS_RASTER: ;
+ VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_VERTEX, params->vertex_shader, &vert));
+ VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_FRAGMENT, params->glsl_shader, &frag));
+ break;
+ case PL_PASS_COMPUTE:
+ VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_COMPUTE, params->glsl_shader, &comp));
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ // Use hash of generated SPIR-V as key for pipeline cache
+ const pl_cache cache = pl_gpu_cache(gpu);
+ pl_cache_obj pipecache = {0};
+ if (cache) {
+ pipecache.key = CACHE_KEY_VK_PIPE;
+ pl_hash_merge(&pipecache.key, pl_var_hash(vk->props.pipelineCacheUUID));
+ pl_hash_merge(&pipecache.key, pl_mem_hash(vert.data, vert.size));
+ pl_hash_merge(&pipecache.key, pl_mem_hash(frag.data, frag.size));
+ pl_hash_merge(&pipecache.key, pl_mem_hash(comp.data, comp.size));
+ pl_cache_get(cache, &pipecache);
+ }
+
+ if (cache || has_spec) {
+ // Don't create pipeline cache unless we either plan on caching the
+ // result of this shader to a pl_cache, or if we will possibly re-use
+ // it due to the presence of specialization constants
+ VkPipelineCacheCreateInfo pcinfo = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,
+ .pInitialData = pipecache.data,
+ .initialDataSize = pipecache.size,
+ };
+
+ VK(vk->CreatePipelineCache(vk->dev, &pcinfo, PL_VK_ALLOC, &pass_vk->cache));
+ }
+
+ VkShaderModuleCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+ };
+
+ pl_clock_t start = pl_clock_now();
+ switch (params->type) {
+ case PL_PASS_RASTER: {
+ sinfo.pCode = (uint32_t *) vert.data;
+ sinfo.codeSize = vert.size;
+ VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->vert));
+ PL_VK_NAME(SHADER_MODULE, pass_vk->vert, "vertex");
+
+ sinfo.pCode = (uint32_t *) frag.data;
+ sinfo.codeSize = frag.size;
+ VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader));
+ PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "fragment");
+
+ pass_vk->attrs = pl_calloc_ptr(pass, params->num_vertex_attribs, pass_vk->attrs);
+ for (int i = 0; i < params->num_vertex_attribs; i++) {
+ struct pl_vertex_attrib *va = &params->vertex_attribs[i];
+ const struct vk_format **pfmt_vk = PL_PRIV(va->fmt);
+
+ pass_vk->attrs[i] = (VkVertexInputAttributeDescription) {
+ .binding = 0,
+ .location = va->location,
+ .offset = va->offset,
+ .format = PL_DEF((*pfmt_vk)->bfmt, (*pfmt_vk)->tfmt),
+ };
+ }
+
+ VkRenderPassCreateInfo rinfo = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = &(VkAttachmentDescription) {
+ .format = (VkFormat) params->target_format->signature,
+ .samples = VK_SAMPLE_COUNT_1_BIT,
+ .loadOp = pass->params.load_target
+ ? VK_ATTACHMENT_LOAD_OP_LOAD
+ : VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+ .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+ .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ },
+ .subpassCount = 1,
+ .pSubpasses = &(VkSubpassDescription) {
+ .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+ .colorAttachmentCount = 1,
+ .pColorAttachments = &(VkAttachmentReference) {
+ .attachment = 0,
+ .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ },
+ },
+ };
+
+ VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &pass_vk->renderPass));
+ break;
+ }
+ case PL_PASS_COMPUTE: {
+ sinfo.pCode = (uint32_t *) comp.data;
+ sinfo.codeSize = comp.size;
+ VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader));
+ PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "compute");
+ break;
+ }
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ pl_clock_t after_compilation = pl_clock_now();
+ pl_log_cpu_time(gpu->log, start, after_compilation, "compiling shader");
+
+ // Update cache entries on successful compilation
+ pl_cache_steal(cache, &vert);
+ pl_cache_steal(cache, &frag);
+ pl_cache_steal(cache, &comp);
+
+ // Create the graphics/compute pipeline
+ VkPipeline *pipe = has_spec ? &pass_vk->base : &pass_vk->pipe;
+ VK(vk_recreate_pipelines(vk, pass, has_spec, VK_NULL_HANDLE, pipe));
+ pl_log_cpu_time(gpu->log, after_compilation, pl_clock_now(), "creating pipeline");
+
+ // Update pipeline cache
+ if (cache) {
+ size_t size = 0;
+ VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, NULL));
+ pl_cache_obj_resize(tmp, &pipecache, size);
+ VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, pipecache.data));
+ pl_cache_steal(cache, &pipecache);
+ }
+
+ if (!has_spec) {
+ // We can free these if we no longer need them for specialization
+ pl_free_ptr(&pass_vk->attrs);
+ vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC);
+ vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC);
+ vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC);
+ pass_vk->vert = VK_NULL_HANDLE;
+ pass_vk->shader = VK_NULL_HANDLE;
+ pass_vk->cache = VK_NULL_HANDLE;
+ }
+
+ PL_DEBUG(vk, "Pass statistics: size %zu, SPIR-V: vert %zu frag %zu comp %zu",
+ pipecache.size, vert.size, frag.size, comp.size);
+
+ success = true;
+
+error:
+ if (!success) {
+ pass_destroy_cb(gpu, pass);
+ pass = NULL;
+ }
+
+#undef NUM_DS
+
+ pl_free(tmp);
+ return pass;
+}
+
+static const VkPipelineStageFlags2 shaderStages[] = {
+ [PL_PASS_RASTER] = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
+ [PL_PASS_COMPUTE] = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+};
+
+static void vk_update_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass,
+ struct pl_desc_binding db,
+ VkDescriptorSet ds, int idx)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+ struct pl_desc *desc = &pass->params.descriptors[idx];
+
+ VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx];
+ *wds = (VkWriteDescriptorSet) {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstSet = ds,
+ .dstBinding = desc->binding,
+ .descriptorCount = 1,
+ .descriptorType = dsType[desc->type],
+ };
+
+ static const VkAccessFlags2 storageAccess[PL_DESC_ACCESS_COUNT] = {
+ [PL_DESC_ACCESS_READONLY] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
+ [PL_DESC_ACCESS_WRITEONLY] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+ [PL_DESC_ACCESS_READWRITE] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+ };
+
+ switch (desc->type) {
+ case PL_DESC_SAMPLED_TEX: {
+ pl_tex tex = db.object;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type],
+ VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
+ VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx];
+ *iinfo = (VkDescriptorImageInfo) {
+ .sampler = p->samplers[db.sample_mode][db.address_mode],
+ .imageView = tex_vk->view,
+ .imageLayout = tex_vk->layout,
+ };
+
+ wds->pImageInfo = iinfo;
+ return;
+ }
+ case PL_DESC_STORAGE_IMG: {
+ pl_tex tex = db.object;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type],
+ storageAccess[desc->access], VK_IMAGE_LAYOUT_GENERAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx];
+ *iinfo = (VkDescriptorImageInfo) {
+ .imageView = tex_vk->view,
+ .imageLayout = tex_vk->layout,
+ };
+
+ wds->pImageInfo = iinfo;
+ return;
+ }
+ case PL_DESC_BUF_UNIFORM:
+ case PL_DESC_BUF_STORAGE: {
+ pl_buf buf = db.object;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ VkAccessFlags2 access = VK_ACCESS_2_UNIFORM_READ_BIT;
+ if (desc->type == PL_DESC_BUF_STORAGE)
+ access = storageAccess[desc->access];
+
+ vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type],
+ access, 0, buf->params.size, false);
+
+ VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx];
+ *binfo = (VkDescriptorBufferInfo) {
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset,
+ .range = buf->params.size,
+ };
+
+ wds->pBufferInfo = binfo;
+ return;
+ }
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE: {
+ pl_buf buf = db.object;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ VkAccessFlags2 access = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT;
+ if (desc->type == PL_DESC_BUF_TEXEL_STORAGE)
+ access = storageAccess[desc->access];
+
+ vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type],
+ access, 0, buf->params.size, false);
+
+ wds->pTexelBufferView = &buf_vk->view;
+ return;
+ }
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static void vk_release_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass,
+ struct pl_desc_binding db, int idx)
+{
+ const struct pl_desc *desc = &pass->params.descriptors[idx];
+
+ switch (desc->type) {
+ case PL_DESC_BUF_UNIFORM:
+ case PL_DESC_BUF_STORAGE:
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ if (desc->access != PL_DESC_ACCESS_READONLY) {
+ pl_buf buf = db.object;
+ vk_buf_flush(gpu, cmd, buf, 0, buf->params.size);
+ }
+ return;
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG:
+ return;
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static void set_ds(struct pl_pass_vk *pass_vk, void *dsbit)
+{
+ pass_vk->dmask |= (uintptr_t) dsbit;
+}
+
+static bool need_respec(pl_pass pass, const struct pl_pass_run_params *params)
+{
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+ if (!pass_vk->spec_size || !params->constant_data)
+ return false;
+
+ VkSpecializationInfo *specInfo = &pass_vk->specInfo;
+ size_t size = pass_vk->spec_size;
+ if (!specInfo->pData) {
+ // Shader was never specialized before
+ specInfo->pData = pl_memdup((void *) pass, params->constant_data, size);
+ specInfo->dataSize = size;
+ return true;
+ }
+
+ // Shader is being re-specialized with new values
+ if (memcmp(specInfo->pData, params->constant_data, size) != 0) {
+ memcpy((void *) specInfo->pData, params->constant_data, size);
+ return true;
+ }
+
+ return false;
+}
+
+void vk_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ pl_pass pass = params->pass;
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+
+ if (params->vertex_data || params->index_data)
+ return pl_pass_run_vbo(gpu, params);
+
+ // Check if we need to re-specialize this pipeline
+ if (need_respec(pass, params)) {
+ pl_clock_t start = pl_clock_now();
+ VK(vk_recreate_pipelines(vk, pass, false, pass_vk->base, &pass_vk->pipe));
+ pl_log_cpu_time(gpu->log, start, pl_clock_now(), "re-specializing shader");
+ }
+
+ if (!pass_vk->use_pushd) {
+ // Wait for a free descriptor set
+ while (!pass_vk->dmask) {
+ PL_TRACE(gpu, "No free descriptor sets! ...blocking (slow path)");
+ vk_poll_commands(vk, 10000000); // 10 ms
+ }
+ }
+
+ static const enum queue_type types[] = {
+ [PL_PASS_RASTER] = GRAPHICS,
+ [PL_PASS_COMPUTE] = COMPUTE,
+ };
+
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(types[pass->params.type], params->timer);
+ if (!cmd)
+ goto error;
+
+ // Find a descriptor set to use
+ VkDescriptorSet ds = VK_NULL_HANDLE;
+ if (!pass_vk->use_pushd) {
+ for (int i = 0; i < PL_ARRAY_SIZE(pass_vk->dss); i++) {
+ uint16_t dsbit = 1u << i;
+ if (pass_vk->dmask & dsbit) {
+ ds = pass_vk->dss[i];
+ pass_vk->dmask &= ~dsbit; // unset
+ vk_cmd_callback(cmd, (vk_cb) set_ds, pass_vk,
+ (void *)(uintptr_t) dsbit);
+ break;
+ }
+ }
+ }
+
+ // Update the dswrite structure with all of the new values
+ for (int i = 0; i < pass->params.num_descriptors; i++)
+ vk_update_descriptor(gpu, cmd, pass, params->desc_bindings[i], ds, i);
+
+ if (!pass_vk->use_pushd) {
+ vk->UpdateDescriptorSets(vk->dev, pass->params.num_descriptors,
+ pass_vk->dswrite, 0, NULL);
+ }
+
+ // Bind the pipeline, descriptor set, etc.
+ static const VkPipelineBindPoint bindPoint[] = {
+ [PL_PASS_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS,
+ [PL_PASS_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE,
+ };
+
+ vk->CmdBindPipeline(cmd->buf, bindPoint[pass->params.type],
+ PL_DEF(pass_vk->pipe, pass_vk->base));
+
+ if (ds) {
+ vk->CmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type],
+ pass_vk->pipeLayout, 0, 1, &ds, 0, NULL);
+ }
+
+ if (pass_vk->use_pushd) {
+ vk->CmdPushDescriptorSetKHR(cmd->buf, bindPoint[pass->params.type],
+ pass_vk->pipeLayout, 0,
+ pass->params.num_descriptors,
+ pass_vk->dswrite);
+ }
+
+ if (pass->params.push_constants_size) {
+ vk->CmdPushConstants(cmd->buf, pass_vk->pipeLayout,
+ stageFlags[pass->params.type], 0,
+ pass->params.push_constants_size,
+ params->push_constants);
+ }
+
+ switch (pass->params.type) {
+ case PL_PASS_RASTER: {
+ pl_tex tex = params->target;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ pl_buf vert = params->vertex_buf;
+ struct pl_buf_vk *vert_vk = PL_PRIV(vert);
+ pl_buf index = params->index_buf;
+ struct pl_buf_vk *index_vk = index ? PL_PRIV(index) : NULL;
+ pl_assert(vert);
+
+ // In the edge case that vert = index buffer, we need to synchronize
+ // for both flags simultaneously
+ VkPipelineStageFlags2 vbo_stage = VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT;
+ VkAccessFlags2 vbo_flags = VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT;
+ if (index == vert) {
+ vbo_stage |= VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT;
+ vbo_flags |= VK_ACCESS_2_INDEX_READ_BIT;
+ }
+
+ vk_buf_barrier(gpu, cmd, vert, vbo_stage, vbo_flags, 0, vert->params.size, false);
+
+ VkDeviceSize offset = vert_vk->mem.offset + params->buf_offset;
+ vk->CmdBindVertexBuffers(cmd->buf, 0, 1, &vert_vk->mem.buf, &offset);
+
+ if (index) {
+ if (index != vert) {
+ vk_buf_barrier(gpu, cmd, index, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT,
+ VK_ACCESS_2_INDEX_READ_BIT, 0, index->params.size,
+ false);
+ }
+
+ static const VkIndexType index_fmts[PL_INDEX_FORMAT_COUNT] = {
+ [PL_INDEX_UINT16] = VK_INDEX_TYPE_UINT16,
+ [PL_INDEX_UINT32] = VK_INDEX_TYPE_UINT32,
+ };
+
+ vk->CmdBindIndexBuffer(cmd->buf, index_vk->mem.buf,
+ index_vk->mem.offset + params->index_offset,
+ index_fmts[params->index_fmt]);
+ }
+
+ VkAccessFlags2 fbo_access = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT;
+ if (pass->params.load_target)
+ fbo_access |= VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT;
+
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+ fbo_access, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ VkViewport viewport = {
+ .x = params->viewport.x0,
+ .y = params->viewport.y0,
+ .width = pl_rect_w(params->viewport),
+ .height = pl_rect_h(params->viewport),
+ };
+
+ VkRect2D scissor = {
+ .offset = {params->scissors.x0, params->scissors.y0},
+ .extent = {pl_rect_w(params->scissors), pl_rect_h(params->scissors)},
+ };
+
+ vk->CmdSetViewport(cmd->buf, 0, 1, &viewport);
+ vk->CmdSetScissor(cmd->buf, 0, 1, &scissor);
+
+ VkRenderPassBeginInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+ .renderPass = pass_vk->renderPass,
+ .framebuffer = tex_vk->framebuffer,
+ .renderArea.extent = {tex->params.w, tex->params.h},
+ };
+
+ vk->CmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE);
+
+ if (index) {
+ vk->CmdDrawIndexed(cmd->buf, params->vertex_count, 1, 0, 0, 0);
+ } else {
+ vk->CmdDraw(cmd->buf, params->vertex_count, 1, 0, 0);
+ }
+
+ vk->CmdEndRenderPass(cmd->buf);
+ break;
+ }
+ case PL_PASS_COMPUTE:
+ vk->CmdDispatch(cmd->buf, params->compute_groups[0],
+ params->compute_groups[1],
+ params->compute_groups[2]);
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+    }
+
+ for (int i = 0; i < pass->params.num_descriptors; i++)
+ vk_release_descriptor(gpu, cmd, pass, params->desc_bindings[i], i);
+
+ // submit this command buffer for better intra-frame granularity
+ CMD_SUBMIT(&cmd);
+
+error:
+ return;
+}
diff --git a/src/vulkan/gpu_tex.c b/src/vulkan/gpu_tex.c
new file mode 100644
index 0000000..7ab83b7
--- /dev/null
+++ b/src/vulkan/gpu_tex.c
@@ -0,0 +1,1453 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+void vk_tex_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_tex tex,
+ VkPipelineStageFlags2 stage, VkAccessFlags2 access,
+ VkImageLayout layout, uint32_t qf)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ pl_rc_ref(&tex_vk->rc);
+ pl_assert(!tex_vk->held);
+ pl_assert(!tex_vk->num_planes);
+
+ // CONCURRENT images require transitioning to/from IGNORED, EXCLUSIVE
+ // images require transitioning to/from the concrete QF index
+ if (vk->pools.num == 1) {
+ if (tex_vk->qf == VK_QUEUE_FAMILY_IGNORED)
+ tex_vk->qf = cmd->pool->qf;
+ if (qf == VK_QUEUE_FAMILY_IGNORED)
+ qf = cmd->pool->qf;
+ }
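+    // (e.g. on a single-pool device, both sides of the barrier below then
+    // resolve to the same concrete queue family, so no ownership transfer is
+    // generated unless the caller explicitly requests a different `qf`)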
+
+ struct vk_sync_scope last;
+ bool is_trans = layout != tex_vk->layout, is_xfer = qf != tex_vk->qf;
+ last = vk_sem_barrier(cmd, &tex_vk->sem, stage, access, is_trans || is_xfer);
+
+ VkImageMemoryBarrier2 barr = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+ .srcStageMask = last.stage,
+ .srcAccessMask = last.access,
+ .dstStageMask = stage,
+ .dstAccessMask = access,
+ .oldLayout = tex_vk->layout,
+ .newLayout = layout,
+ .srcQueueFamilyIndex = tex_vk->qf,
+ .dstQueueFamilyIndex = qf,
+ .image = tex_vk->img,
+ .subresourceRange = {
+ .aspectMask = tex_vk->aspect,
+ .levelCount = 1,
+ .layerCount = 1,
+ },
+ };
+
+ if (tex_vk->may_invalidate) {
+ tex_vk->may_invalidate = false;
+ barr.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+ }
+
+ if (last.access || is_trans || is_xfer) {
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .imageMemoryBarrierCount = 1,
+ .pImageMemoryBarriers = &barr,
+ });
+ }
+
+ tex_vk->qf = qf;
+ tex_vk->layout = layout;
+ vk_cmd_callback(cmd, (vk_cb) vk_tex_deref, gpu, tex);
+
+ for (int i = 0; i < tex_vk->ext_deps.num; i++)
+ vk_cmd_dep(cmd, stage, tex_vk->ext_deps.elem[i]);
+ tex_vk->ext_deps.num = 0;
+
+ if (tex_vk->ext_sync) {
+ vk_cmd_callback(cmd, (vk_cb) vk_sync_deref, gpu, tex_vk->ext_sync);
+ tex_vk->ext_sync = NULL;
+ }
+}
+
+static void vk_tex_destroy(pl_gpu gpu, struct pl_tex_t *tex)
+{
+ if (!tex)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ vk_sync_deref(gpu, tex_vk->ext_sync);
+ vk->DestroyFramebuffer(vk->dev, tex_vk->framebuffer, PL_VK_ALLOC);
+ vk->DestroyImageView(vk->dev, tex_vk->view, PL_VK_ALLOC);
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ vk_tex_deref(gpu, tex->planes[i]);
+ if (!tex_vk->external_img) {
+ vk->DestroyImage(vk->dev, tex_vk->img, PL_VK_ALLOC);
+ vk_malloc_free(vk->ma, &tex_vk->mem);
+ }
+
+ pl_free(tex);
+}
+
+void vk_tex_deref(pl_gpu gpu, pl_tex tex)
+{
+ if (!tex)
+ return;
+
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ if (pl_rc_deref(&tex_vk->rc))
+ vk_tex_destroy(gpu, (struct pl_tex_t *) tex);
+}
+
+// Initializes non-VkImage values like the image view, framebuffers, etc.
+static bool vk_init_image(pl_gpu gpu, pl_tex tex, pl_debug_tag debug_tag)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ const struct pl_tex_params *params = &tex->params;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ pl_assert(tex_vk->img);
+ PL_VK_NAME(IMAGE, tex_vk->img, debug_tag);
+ pl_rc_init(&tex_vk->rc);
+ if (tex_vk->num_planes)
+ return true;
+ tex_vk->layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ tex_vk->transfer_queue = GRAPHICS;
+ tex_vk->qf = VK_QUEUE_FAMILY_IGNORED; // will be set on first use, if needed
+
+ // Always use the transfer pool if available, for efficiency
+ if ((params->host_writable || params->host_readable) && vk->pool_transfer)
+ tex_vk->transfer_queue = TRANSFER;
+
+ // For emulated formats: force usage of the compute queue, because we
+ // can't properly track cross-queue dependencies for buffers (yet?)
+ if (params->format->emulated)
+ tex_vk->transfer_queue = COMPUTE;
+
+ bool ret = false;
+ VkRenderPass dummyPass = VK_NULL_HANDLE;
+
+ if (params->sampleable || params->renderable || params->storable) {
+ static const VkImageViewType viewType[] = {
+ [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D,
+ [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D,
+ [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D,
+ };
+
+ const VkImageViewCreateInfo vinfo = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+ .image = tex_vk->img,
+ .viewType = viewType[tex_vk->type],
+ .format = tex_vk->img_fmt,
+ .subresourceRange = {
+ .aspectMask = tex_vk->aspect,
+ .levelCount = 1,
+ .layerCount = 1,
+ },
+ };
+
+ VK(vk->CreateImageView(vk->dev, &vinfo, PL_VK_ALLOC, &tex_vk->view));
+ PL_VK_NAME(IMAGE_VIEW, tex_vk->view, debug_tag);
+ }
+
+ if (params->renderable) {
+ // Framebuffers need to be created against a specific render pass
+ // layout, so we need to temporarily create a skeleton/dummy render
+ // pass for vulkan to figure out the compatibility
+ VkRenderPassCreateInfo rinfo = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = &(VkAttachmentDescription) {
+ .format = tex_vk->img_fmt,
+ .samples = VK_SAMPLE_COUNT_1_BIT,
+ .loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+ .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+ .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+ .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ },
+ .subpassCount = 1,
+ .pSubpasses = &(VkSubpassDescription) {
+ .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+ .colorAttachmentCount = 1,
+ .pColorAttachments = &(VkAttachmentReference) {
+ .attachment = 0,
+ .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ },
+ },
+ };
+
+ VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &dummyPass));
+
+ VkFramebufferCreateInfo finfo = {
+ .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+ .renderPass = dummyPass,
+ .attachmentCount = 1,
+ .pAttachments = &tex_vk->view,
+ .width = tex->params.w,
+ .height = tex->params.h,
+ .layers = 1,
+ };
+
+ if (finfo.width > vk->props.limits.maxFramebufferWidth ||
+ finfo.height > vk->props.limits.maxFramebufferHeight)
+ {
+ PL_ERR(gpu, "Framebuffer of size %dx%d exceeds the maximum allowed "
+ "dimensions: %dx%d", finfo.width, finfo.height,
+ vk->props.limits.maxFramebufferWidth,
+ vk->props.limits.maxFramebufferHeight);
+ goto error;
+ }
+
+ VK(vk->CreateFramebuffer(vk->dev, &finfo, PL_VK_ALLOC,
+ &tex_vk->framebuffer));
+ PL_VK_NAME(FRAMEBUFFER, tex_vk->framebuffer, debug_tag);
+ }
+
+ ret = true;
+
+error:
+ vk->DestroyRenderPass(vk->dev, dummyPass, PL_VK_ALLOC);
+ return ret;
+}
+
+pl_tex vk_tex_create(pl_gpu gpu, const struct pl_tex_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ enum pl_handle_type handle_type = params->export_handle |
+ params->import_handle;
+ VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type = vk_mem_handle_type(handle_type);
+
+ struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk);
+ pl_fmt fmt = params->format;
+ tex->params = *params;
+ tex->params.initial_data = NULL;
+ tex->sampler_type = PL_SAMPLER_NORMAL;
+
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+ tex_vk->img_fmt = fmtp->vk_fmt->tfmt;
+ tex_vk->num_planes = fmt->num_planes;
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+ tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT);
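+    // (e.g. a 2-plane format yields PLANE_0 | PLANE_1 here, while
+    // single-plane formats fall back to the regular COLOR aspect)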
+
+ switch (pl_tex_params_dimension(*params)) {
+ case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
+ case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
+ case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
+ }
+
+ if (fmt->emulated) {
+ tex_vk->texel_fmt = pl_find_fmt(gpu, fmt->type, 1, 0,
+ fmt->host_bits[0],
+ PL_FMT_CAP_TEXEL_UNIFORM);
+ if (!tex_vk->texel_fmt) {
+ PL_ERR(gpu, "Failed picking texel format for emulated texture!");
+ goto error;
+ }
+
+ // Our format emulation requires storage image support. In order to
+ // make a bunch of checks happy, just mark it off as storable (and also
+ // enable VK_IMAGE_USAGE_STORAGE_BIT, which we do below)
+ tex->params.storable = true;
+ }
+
+ if (fmtp->blit_emulated) {
+ // Enable what's required for sampling
+ tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE;
+ tex->params.storable = true;
+ }
+
+ // Blit emulation on planar textures requires storage
+ if ((params->blit_src || params->blit_dst) && tex_vk->num_planes)
+ tex->params.storable = true;
+
+ VkImageUsageFlags usage = 0;
+ VkImageCreateFlags flags = 0;
+ if (tex->params.sampleable)
+ usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+ if (tex->params.renderable)
+ usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+ if (tex->params.storable)
+ usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+ if (tex->params.host_readable || tex->params.blit_src)
+ usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+ if (tex->params.host_writable || tex->params.blit_dst || params->initial_data)
+ usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+
+ if (!usage) {
+        // Vulkan requires images to have at least *some* usage bit set, but
+        // our API is perfectly happy with a (useless) image. So just use
+        // VK_IMAGE_USAGE_TRANSFER_DST_BIT, since this is harmless.
+ usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+ }
+
+ if (tex_vk->num_planes) {
+ flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT |
+ VK_IMAGE_CREATE_EXTENDED_USAGE_BIT;
+ }
+
+ // FIXME: Since we can't keep track of queue family ownership properly,
+ // and we don't know in advance what types of queue families this image
+ // will belong to, we're forced to share all of our images between all
+ // command pools.
+ uint32_t qfs[3] = {0};
+ pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+ for (int i = 0; i < vk->pools.num; i++)
+ qfs[i] = vk->pools.elem[i]->qf;
+
+ VkImageDrmFormatModifierExplicitCreateInfoEXT drm_explicit = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT,
+ .drmFormatModifier = params->shared_mem.drm_format_mod,
+ .drmFormatModifierPlaneCount = 1,
+ .pPlaneLayouts = &(VkSubresourceLayout) {
+ .rowPitch = PL_DEF(params->shared_mem.stride_w, params->w),
+ .depthPitch = params->d ? PL_DEF(params->shared_mem.stride_h, params->h) : 0,
+ .offset = params->shared_mem.offset,
+ },
+ };
+
+#ifdef VK_EXT_metal_objects
+ VkImportMetalTextureInfoEXT import_metal_tex = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_TEXTURE_INFO_EXT,
+ .plane = VK_IMAGE_ASPECT_PLANE_0_BIT << params->shared_mem.plane,
+ };
+
+ VkImportMetalIOSurfaceInfoEXT import_iosurface = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_IO_SURFACE_INFO_EXT,
+ };
+#endif
+
+ VkImageDrmFormatModifierListCreateInfoEXT drm_list = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT,
+ .drmFormatModifierCount = fmt->num_modifiers,
+ .pDrmFormatModifiers = fmt->modifiers,
+ };
+
+ VkExternalMemoryImageCreateInfoKHR ext_info = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR,
+ .handleTypes = vk_handle_type,
+ };
+
+ VkImageCreateInfo iinfo = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+ .pNext = vk_handle_type ? &ext_info : NULL,
+ .imageType = tex_vk->type,
+ .format = tex_vk->img_fmt,
+ .extent = (VkExtent3D) {
+ .width = params->w,
+ .height = PL_MAX(1, params->h),
+ .depth = PL_MAX(1, params->d)
+ },
+ .mipLevels = 1,
+ .arrayLayers = 1,
+ .samples = VK_SAMPLE_COUNT_1_BIT,
+ .tiling = VK_IMAGE_TILING_OPTIMAL,
+ .usage = usage,
+ .flags = flags,
+ .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+ .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->pools.num,
+ .pQueueFamilyIndices = qfs,
+ };
+
+ struct vk_malloc_params mparams = {
+ .optimal = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .export_handle = params->export_handle,
+ .import_handle = params->import_handle,
+ .shared_mem = params->shared_mem,
+ .debug_tag = params->debug_tag,
+ };
+
+ if (params->import_handle == PL_HANDLE_DMA_BUF) {
+ vk_link_struct(&iinfo, &drm_explicit);
+ iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ mparams.shared_mem.offset = 0x0; // handled via plane offsets
+ }
+
+#ifdef VK_EXT_metal_objects
+ if (params->import_handle == PL_HANDLE_MTL_TEX) {
+ vk_link_struct(&iinfo, &import_metal_tex);
+ import_metal_tex.mtlTexture = params->shared_mem.handle.handle;
+ }
+
+ if (params->import_handle == PL_HANDLE_IOSURFACE) {
+ vk_link_struct(&iinfo, &import_iosurface);
+ import_iosurface.ioSurface = params->shared_mem.handle.handle;
+ }
+#endif
+
+ if (params->export_handle == PL_HANDLE_DMA_BUF) {
+ pl_assert(drm_list.drmFormatModifierCount > 0);
+ vk_link_struct(&iinfo, &drm_list);
+ iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ }
+
+ // Double-check physical image format limits and fail if invalid
+ VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+ .sharingMode = iinfo.sharingMode,
+ .queueFamilyIndexCount = iinfo.queueFamilyIndexCount,
+ .pQueueFamilyIndices = iinfo.pQueueFamilyIndices,
+ };
+
+ VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR,
+ .handleType = ext_info.handleTypes,
+ };
+
+ if (handle_type == PL_HANDLE_DMA_BUF) {
+ if (params->import_handle) {
+ // On import, we know exactly which format modifier to test
+ drm_pinfo.drmFormatModifier = drm_explicit.drmFormatModifier;
+ } else {
+ // On export, the choice of format modifier is ambiguous, because
+ // we offer the implementation a whole list to choose from. In
+ // principle, we must check *all* supported drm format modifiers,
+ // but in practice it should hopefully suffice to just check one
+ drm_pinfo.drmFormatModifier = drm_list.pDrmFormatModifiers[0];
+ }
+ vk_link_struct(&ext_pinfo, &drm_pinfo);
+ }
+
+ VkPhysicalDeviceImageFormatInfo2KHR pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
+ .pNext = vk_handle_type ? &ext_pinfo : NULL,
+ .format = iinfo.format,
+ .type = iinfo.imageType,
+ .tiling = iinfo.tiling,
+ .usage = iinfo.usage,
+ .flags = iinfo.flags,
+ };
+
+ VkExternalImageFormatPropertiesKHR ext_props = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
+ };
+
+ VkImageFormatProperties2KHR props = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR,
+ .pNext = vk_handle_type ? &ext_props : NULL,
+ };
+
+ VkResult res;
+ res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props);
+ if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) {
+ PL_DEBUG(gpu, "Texture creation failed: not supported");
+ goto error;
+ } else {
+ PL_VK_ASSERT(res, "Querying image format properties");
+ }
+
+ VkExtent3D max = props.imageFormatProperties.maxExtent;
+ if (params->w > max.width || params->h > max.height || params->d > max.depth)
+ {
+ PL_ERR(gpu, "Requested image size %dx%dx%d exceeds the maximum allowed "
+ "dimensions %dx%dx%d for vulkan image format %x",
+ params->w, params->h, params->d, max.width, max.height, max.depth,
+ (unsigned) iinfo.format);
+ goto error;
+ }
+
+ // Ensure the handle type is supported
+ if (vk_handle_type) {
+ bool ok = vk_external_mem_check(vk, &ext_props.externalMemoryProperties,
+ handle_type, params->import_handle);
+ if (!ok) {
+ PL_ERR(gpu, "Requested handle type is not compatible with the "
+ "specified combination of image parameters. Possibly the "
+ "handle type is unsupported altogether?");
+ goto error;
+ }
+ }
+
+ VK(vk->CreateImage(vk->dev, &iinfo, PL_VK_ALLOC, &tex_vk->img));
+ tex_vk->usage_flags = iinfo.usage;
+
+ VkMemoryDedicatedRequirements ded_reqs = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR,
+ };
+
+ VkMemoryRequirements2 reqs = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR,
+ .pNext = &ded_reqs,
+ };
+
+ VkImageMemoryRequirementsInfo2 req_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR,
+ .image = tex_vk->img,
+ };
+
+ vk->GetImageMemoryRequirements2(vk->dev, &req_info, &reqs);
+ mparams.reqs = reqs.memoryRequirements;
+ if (ded_reqs.prefersDedicatedAllocation) {
+ mparams.ded_image = tex_vk->img;
+ if (vk_mem_handle_type(params->import_handle))
+ mparams.shared_mem.size = reqs.memoryRequirements.size;
+ }
+
+ const char *debug_tag = params->debug_tag ? params->debug_tag :
+ params->import_handle ? "imported" : "created";
+
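+    // Imports with no corresponding Vulkan memory handle type (e.g. Metal
+    // textures / IOSurfaces, which were attached via the image create info
+    // above) don't go through vk_malloc; otherwise, allocate and bind memory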
+ if (!params->import_handle || vk_mem_handle_type(params->import_handle)) {
+ struct vk_memslice *mem = &tex_vk->mem;
+ if (!vk_malloc_slice(vk->ma, mem, &mparams))
+ goto error;
+
+ VK(vk->BindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset));
+ }
+
+ static const char * const plane_names[4] = {
+ "plane 0", "plane 1", "plane 2", "plane 3",
+ };
+
+ if (tex_vk->num_planes) {
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ struct pl_tex_t *plane;
+
+ pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D);
+ plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+ .image = tex_vk->img,
+ .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i,
+ .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x),
+ .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y),
+ .format = fmtp->vk_fmt->pfmt[i].fmt,
+ .usage = usage,
+ .user_data = params->user_data,
+ .debug_tag = PL_DEF(params->debug_tag, plane_names[i]),
+ ));
+ if (!plane)
+ goto error;
+ plane->parent = tex;
+ tex->planes[i] = plane;
+ tex_vk->planes[i] = PL_PRIV(plane);
+ tex_vk->planes[i]->held = false;
+ tex_vk->planes[i]->layout = tex_vk->layout;
+ }
+
+ // Explicitly mask out all usage flags from planar parent images
+ pl_assert(!fmt->caps);
+ tex->params.sampleable = false;
+ tex->params.renderable = false;
+ tex->params.storable = false;
+ tex->params.blit_src = false;
+ tex->params.blit_dst = false;
+ tex->params.host_writable = false;
+ tex->params.host_readable = false;
+ }
+
+ if (!vk_init_image(gpu, tex, debug_tag))
+ goto error;
+
+ if (params->export_handle)
+ tex->shared_mem = tex_vk->mem.shared_mem;
+
+ if (params->export_handle == PL_HANDLE_DMA_BUF) {
+ if (vk->GetImageDrmFormatModifierPropertiesEXT) {
+
+ // Query the DRM format modifier and plane layout from the driver
+ VkImageDrmFormatModifierPropertiesEXT mod_props = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT,
+ };
+
+ VK(vk->GetImageDrmFormatModifierPropertiesEXT(vk->dev, tex_vk->img, &mod_props));
+ tex->shared_mem.drm_format_mod = mod_props.drmFormatModifier;
+
+ VkSubresourceLayout layout = {0};
+ VkImageSubresource plane = {
+ .aspectMask = VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT,
+ };
+
+ vk->GetImageSubresourceLayout(vk->dev, tex_vk->img, &plane, &layout);
+ if (layout.offset != 0) {
+ PL_ERR(gpu, "Exported DRM plane 0 has nonzero offset %zu, "
+ "this should never happen! Erroring for safety...",
+ (size_t) layout.offset);
+ goto error;
+ }
+ tex->shared_mem.stride_w = layout.rowPitch;
+ tex->shared_mem.stride_h = layout.depthPitch;
+
+ } else {
+
+ // Fallback for no modifiers, just do something stupid.
+ tex->shared_mem.drm_format_mod = DRM_FORMAT_MOD_INVALID;
+ tex->shared_mem.stride_w = params->w;
+ tex->shared_mem.stride_h = params->h;
+
+ }
+ }
+
+ if (params->initial_data) {
+ struct pl_tex_transfer_params ul_params = {
+ .tex = tex,
+ .ptr = (void *) params->initial_data,
+ .rc = { 0, 0, 0, params->w, params->h, params->d },
+ };
+
+ // Since we re-use GPU helpers which require writable images, just fake it
+ bool writable = tex->params.host_writable;
+ tex->params.host_writable = true;
+ if (!pl_tex_upload(gpu, &ul_params))
+ goto error;
+ tex->params.host_writable = writable;
+ }
+
+ return tex;
+
+error:
+ vk_tex_destroy(gpu, tex);
+ return NULL;
+}
+
+void vk_tex_invalidate(pl_gpu gpu, pl_tex tex)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ tex_vk->may_invalidate = true;
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ tex_vk->planes[i]->may_invalidate = true;
+}
+
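+// Fallback for clears that can't use vkCmdClearColorImage: clear a 1x1
+// texture of the same format, then broadcast it over the target via pl_tex_blit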
+static bool tex_clear_fallback(pl_gpu gpu, pl_tex tex,
+ const union pl_clear_color color)
+{
+ pl_tex pixel = pl_tex_create(gpu, pl_tex_params(
+ .w = 1,
+ .h = 1,
+ .format = tex->params.format,
+ .storable = true,
+ .blit_src = true,
+ .blit_dst = true,
+ ));
+ if (!pixel)
+ return false;
+
+ pl_tex_clear_ex(gpu, pixel, color);
+
+ pl_assert(tex->params.storable);
+ pl_tex_blit(gpu, pl_tex_blit_params(
+ .src = pixel,
+ .dst = tex,
+ .sample_mode = PL_TEX_SAMPLE_NEAREST,
+ ));
+
+ pl_tex_destroy(gpu, &pixel);
+ return true;
+}
+
+void vk_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) {
+ if (!tex_clear_fallback(gpu, tex, color)) {
+ PL_ERR(gpu, "Failed clearing imported planar image: color aspect "
+ "clears disallowed by spec and no shader fallback "
+ "available");
+ }
+ return;
+ }
+
+ struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+ if (!cmd)
+ return;
+
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_CLEAR_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ pl_static_assert(sizeof(VkClearColorValue) == sizeof(union pl_clear_color));
+ const VkClearColorValue *clearColor = (const VkClearColorValue *) &color;
+
+ pl_assert(tex_vk->aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+ static const VkImageSubresourceRange range = {
+ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+ .levelCount = 1,
+ .layerCount = 1,
+ };
+
+ vk->CmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->layout,
+ clearColor, 1, &range);
+
+ CMD_FINISH(&cmd);
+}
+
+void vk_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *src_vk = PL_PRIV(params->src);
+ struct pl_tex_vk *dst_vk = PL_PRIV(params->dst);
+ struct pl_fmt_vk *src_fmtp = PL_PRIV(params->src->params.format);
+ struct pl_fmt_vk *dst_fmtp = PL_PRIV(params->dst->params.format);
+ bool blit_emulated = src_fmtp->blit_emulated || dst_fmtp->blit_emulated;
+ bool planar_fallback = src_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT ||
+ dst_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT;
+
+ pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc;
+ bool requires_scaling = !pl_rect3d_eq(src_rc, dst_rc);
+ if ((requires_scaling && blit_emulated) || planar_fallback) {
+ if (!pl_tex_blit_compute(gpu, params))
+ PL_ERR(gpu, "Failed emulating texture blit, incompatible textures?");
+ return;
+ }
+
+ struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+ if (!cmd)
+ return;
+
+ // When the blit operation doesn't require scaling, we can use the more
+ // efficient vkCmdCopyImage instead of vkCmdBlitImage
+ if (!requires_scaling) {
+ vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ pl_rect3d_normalize(&src_rc);
+
+ VkImageCopy region = {
+ .srcSubresource = {
+ .aspectMask = src_vk->aspect,
+ .layerCount = 1,
+ },
+ .dstSubresource = {
+ .aspectMask = dst_vk->aspect,
+ .layerCount = 1,
+ },
+ .srcOffset = {src_rc.x0, src_rc.y0, src_rc.z0},
+ .dstOffset = {src_rc.x0, src_rc.y0, src_rc.z0},
+ .extent = {
+ pl_rect_w(src_rc),
+ pl_rect_h(src_rc),
+ pl_rect_d(src_rc),
+ },
+ };
+
+ vk->CmdCopyImage(cmd->buf, src_vk->img, src_vk->layout,
+ dst_vk->img, dst_vk->layout, 1, &region);
+ } else {
+ vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_BLIT_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_BLIT_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ VkImageBlit region = {
+ .srcSubresource = {
+ .aspectMask = src_vk->aspect,
+ .layerCount = 1,
+ },
+ .dstSubresource = {
+ .aspectMask = dst_vk->aspect,
+ .layerCount = 1,
+ },
+ .srcOffsets = {{src_rc.x0, src_rc.y0, src_rc.z0},
+ {src_rc.x1, src_rc.y1, src_rc.z1}},
+ .dstOffsets = {{dst_rc.x0, dst_rc.y0, dst_rc.z0},
+ {dst_rc.x1, dst_rc.y1, dst_rc.z1}},
+ };
+
+ static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = {
+ [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
+ [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR,
+ };
+
+ vk->CmdBlitImage(cmd->buf, src_vk->img, src_vk->layout,
+ dst_vk->img, dst_vk->layout, 1, &region,
+ filters[params->sample_mode]);
+ }
+
+ CMD_FINISH(&cmd);
+}
+
+// Determine the best queue type to perform a buffer<->image copy on
+static enum queue_type vk_img_copy_queue(pl_gpu gpu, pl_tex tex,
+ const struct VkBufferImageCopy *region)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ const struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ enum queue_type queue = tex_vk->transfer_queue;
+ if (queue != TRANSFER)
+ return queue;
+
+ VkExtent3D alignment = vk->pool_transfer->props.minImageTransferGranularity;
+
+ enum queue_type fallback = GRAPHICS;
+ if (gpu->limits.compute_queues > gpu->limits.fragment_queues)
+ fallback = COMPUTE; // prefer async compute queue
+
+ int tex_w = PL_DEF(tex->params.w, 1),
+ tex_h = PL_DEF(tex->params.h, 1),
+ tex_d = PL_DEF(tex->params.d, 1);
+
+ bool full_w = region->imageOffset.x + region->imageExtent.width == tex_w,
+ full_h = region->imageOffset.y + region->imageExtent.height == tex_h,
+ full_d = region->imageOffset.z + region->imageExtent.depth == tex_d;
+
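+    // A nonzero transfer granularity means copy offsets and extents must be
+    // multiples of it, except where the extent reaches the image boundary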
+ if (alignment.width) {
+
+ bool unaligned = false;
+ unaligned |= region->imageOffset.x % alignment.width;
+ unaligned |= region->imageOffset.y % alignment.height;
+ unaligned |= region->imageOffset.z % alignment.depth;
+ unaligned |= (region->imageExtent.width % alignment.width) && !full_w;
+ unaligned |= (region->imageExtent.height % alignment.height) && !full_h;
+ unaligned |= (region->imageExtent.depth % alignment.depth) && !full_d;
+
+ return unaligned ? fallback : queue;
+
+ } else {
+
+ // an alignment of {0} means the copy must span the entire image
+ bool unaligned = false;
+ unaligned |= region->imageOffset.x || !full_w;
+ unaligned |= region->imageOffset.y || !full_h;
+ unaligned |= region->imageOffset.z || !full_d;
+
+ return unaligned ? fallback : queue;
+
+ }
+}
+
+static void tex_xfer_cb(void *ctx, void *arg)
+{
+ void (*fun)(void *priv) = ctx;
+ fun(arg);
+}
+
+bool vk_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ struct pl_tex_transfer_params *slices = NULL;
+ int num_slices = 0;
+
+ if (!params->buf)
+ return pl_tex_upload_pbo(gpu, params);
+
+ pl_buf buf = params->buf;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_rect3d rc = params->rc;
+ const size_t size = pl_tex_transfer_size(params);
+ const size_t buf_offset = buf_vk->mem.offset + params->buf_offset;
+ bool unaligned = buf_offset % fmt->texel_size;
+ if (unaligned)
+ PL_TRACE(gpu, "vk_tex_upload: unaligned transfer (slow path)");
+
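+    // Slow path: stage the data through tightly packed temporary buffers (one
+    // per slice), then upload those instead, via texel buffers for emulated
+    // formats or a recursive (now aligned) pl_tex_upload otherwise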
+ if (fmt->emulated || unaligned) {
+
+        // Create all slice buffers first, to fail early if OOM, and to avoid
+        // blocking unnecessarily while waiting for these buffers to be read from
+ num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices);
+ for (int i = 0; i < num_slices; i++) {
+ slices[i].buf = pl_buf_create(gpu, pl_buf_params(
+ .memory_type = PL_BUF_MEM_DEVICE,
+ .format = tex_vk->texel_fmt,
+ .size = pl_tex_transfer_size(&slices[i]),
+ .storable = fmt->emulated,
+ ));
+
+ if (!slices[i].buf) {
+ PL_ERR(gpu, "Failed creating buffer for tex upload fallback!");
+ num_slices = i; // only clean up buffers up to here
+ goto error;
+ }
+ }
+
+ // All temporary buffers successfully created, begin copying source data
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue,
+ params->timer);
+ if (!cmd)
+ goto error;
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size,
+ false);
+
+ for (int i = 0; i < num_slices; i++) {
+ pl_buf slice = slices[i].buf;
+ struct pl_buf_vk *slice_vk = PL_PRIV(slice);
+ vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, 0, slice->params.size,
+ false);
+
+ vk->CmdCopyBuffer(cmd->buf, buf_vk->mem.buf, slice_vk->mem.buf, 1, &(VkBufferCopy) {
+ .srcOffset = buf_vk->mem.offset + slices[i].buf_offset,
+ .dstOffset = slice_vk->mem.offset,
+ .size = slice->params.size,
+ });
+ }
+
+ if (params->callback)
+ vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+ bool ok = CMD_FINISH(&cmd);
+
+        // Finally, dispatch the (texel) upload asynchronously. We can already
+        // fire the callback at the completion of the previous command, because
+        // these temporary buffers hold persistent copies of the data
+ for (int i = 0; i < num_slices; i++) {
+ if (ok) {
+ slices[i].buf_offset = 0;
+ ok = fmt->emulated ? pl_tex_upload_texel(gpu, &slices[i])
+ : pl_tex_upload(gpu, &slices[i]);
+ }
+ pl_buf_destroy(gpu, &slices[i].buf);
+ }
+
+ pl_free(slices);
+ return ok;
+
+ } else {
+
+ pl_assert(fmt->texel_align == fmt->texel_size);
+ const VkBufferImageCopy region = {
+ .bufferOffset = buf_offset,
+ .bufferRowLength = params->row_pitch / fmt->texel_size,
+ .bufferImageHeight = params->depth_pitch / params->row_pitch,
+ .imageOffset = { rc.x0, rc.y0, rc.z0 },
+ .imageExtent = { rc.x1, rc.y1, rc.z1 },
+ .imageSubresource = {
+ .aspectMask = tex_vk->aspect,
+ .layerCount = 1,
+ },
+ };
+
+ enum queue_type queue = vk_img_copy_queue(gpu, tex, &region);
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer);
+ if (!cmd)
+ goto error;
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size,
+ false);
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+ vk->CmdCopyBufferToImage(cmd->buf, buf_vk->mem.buf, tex_vk->img,
+ tex_vk->layout, 1, &region);
+
+ if (params->callback)
+ vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+ return CMD_FINISH(&cmd);
+ }
+
+ pl_unreachable();
+
+error:
+ for (int i = 0; i < num_slices; i++)
+ pl_buf_destroy(gpu, &slices[i].buf);
+ pl_free(slices);
+ return false;
+}
+
+bool vk_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ struct pl_tex_transfer_params *slices = NULL;
+ int num_slices = 0;
+
+ if (!params->buf)
+ return pl_tex_download_pbo(gpu, params);
+
+ pl_buf buf = params->buf;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_rect3d rc = params->rc;
+ const size_t size = pl_tex_transfer_size(params);
+ const size_t buf_offset = buf_vk->mem.offset + params->buf_offset;
+ bool unaligned = buf_offset % fmt->texel_size;
+ if (unaligned)
+ PL_TRACE(gpu, "vk_tex_download: unaligned transfer (slow path)");
+
+ if (fmt->emulated || unaligned) {
+
+ num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices);
+ for (int i = 0; i < num_slices; i++) {
+ slices[i].buf = pl_buf_create(gpu, pl_buf_params(
+ .memory_type = PL_BUF_MEM_DEVICE,
+ .format = tex_vk->texel_fmt,
+ .size = pl_tex_transfer_size(&slices[i]),
+ .storable = fmt->emulated,
+ ));
+
+ if (!slices[i].buf) {
+ PL_ERR(gpu, "Failed creating buffer for tex download fallback!");
+ num_slices = i;
+ goto error;
+ }
+ }
+
+ for (int i = 0; i < num_slices; i++) {
+ // Restore buffer offset after downloading into temporary buffer,
+ // because we still need to copy the data from the temporary buffer
+ // into this offset in the original buffer
+ const size_t tmp_offset = slices[i].buf_offset;
+ slices[i].buf_offset = 0;
+ bool ok = fmt->emulated ? pl_tex_download_texel(gpu, &slices[i])
+ : pl_tex_download(gpu, &slices[i]);
+ slices[i].buf_offset = tmp_offset;
+ if (!ok)
+ goto error;
+ }
+
+ // Finally, download into the user buffer
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, params->timer);
+ if (!cmd)
+ goto error;
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size,
+ false);
+
+ for (int i = 0; i < num_slices; i++) {
+ pl_buf slice = slices[i].buf;
+ struct pl_buf_vk *slice_vk = PL_PRIV(slice);
+ vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, 0, slice->params.size,
+ false);
+
+ vk->CmdCopyBuffer(cmd->buf, slice_vk->mem.buf, buf_vk->mem.buf, 1, &(VkBufferCopy) {
+ .srcOffset = slice_vk->mem.offset,
+ .dstOffset = buf_vk->mem.offset + slices[i].buf_offset,
+ .size = slice->params.size,
+ });
+
+ pl_buf_destroy(gpu, &slices[i].buf);
+ }
+
+ vk_buf_flush(gpu, cmd, buf, params->buf_offset, size);
+
+ if (params->callback)
+ vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+ pl_free(slices);
+ return CMD_FINISH(&cmd);
+
+ } else {
+
+ pl_assert(params->row_pitch % fmt->texel_size == 0);
+ pl_assert(params->depth_pitch % params->row_pitch == 0);
+ const VkBufferImageCopy region = {
+ .bufferOffset = buf_offset,
+ .bufferRowLength = params->row_pitch / fmt->texel_size,
+ .bufferImageHeight = params->depth_pitch / params->row_pitch,
+ .imageOffset = { rc.x0, rc.y0, rc.z0 },
+ .imageExtent = { rc.x1, rc.y1, rc.z1 },
+ .imageSubresource = {
+ .aspectMask = tex_vk->aspect,
+ .layerCount = 1,
+ },
+ };
+
+ enum queue_type queue = vk_img_copy_queue(gpu, tex, &region);
+
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer);
+ if (!cmd)
+ goto error;
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size,
+ false);
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+ vk->CmdCopyImageToBuffer(cmd->buf, tex_vk->img, tex_vk->layout,
+ buf_vk->mem.buf, 1, &region);
+ vk_buf_flush(gpu, cmd, buf, params->buf_offset, size);
+
+ if (params->callback)
+ vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+ return CMD_FINISH(&cmd);
+ }
+
+ pl_unreachable();
+
+error:
+ for (int i = 0; i < num_slices; i++)
+ pl_buf_destroy(gpu, &slices[i].buf);
+ pl_free(slices);
+ return false;
+}
+
+bool vk_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ // Opportunistically check if we can re-use this texture without flush
+ vk_poll_commands(vk, 0);
+ if (pl_rc_count(&tex_vk->rc) == 1)
+ goto skip_blocking;
+
+    // Otherwise, we're forced to submit any queued commands so that the user
+    // is guaranteed to see progress eventually, even if they call this in a loop
+ CMD_SUBMIT(NULL);
+ vk_poll_commands(vk, timeout);
+ if (pl_rc_count(&tex_vk->rc) > 1)
+ return true;
+
+ // fall through
+skip_blocking:
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ if (vk_tex_poll(gpu, tex->planes[i], timeout))
+ return true;
+ }
+
+ return false;
+}
+
+bool vk_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+
+ if (tex_vk->num_planes) {
+        PL_ERR(gpu, "`pl_tex_export` cannot be called on planar textures. "
+                    "Please see `pl_vulkan_hold_ex` for a replacement.");
+ return false;
+ }
+
+ struct vk_cmd *cmd = CMD_BEGIN(ANY);
+ if (!cmd)
+ goto error;
+
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_NONE,
+ 0, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_EXTERNAL);
+
+ // Make the next barrier appear as though coming from a different queue
+ tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL;
+
+ vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, (pl_vulkan_sem){ sync_vk->wait });
+ if (!CMD_SUBMIT(&cmd))
+ goto error;
+
+ // Remember the other dependency and hold on to the sync object
+ PL_ARRAY_APPEND(tex, tex_vk->ext_deps, (pl_vulkan_sem){ sync_vk->signal });
+ pl_rc_ref(&sync_vk->rc);
+ tex_vk->ext_sync = sync;
+ tex_vk->qf = VK_QUEUE_FAMILY_EXTERNAL;
+ return true;
+
+error:
+ PL_ERR(gpu, "Failed exporting shared texture!");
+ return false;
+}
+
+pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params)
+{
+ pl_fmt fmt = NULL;
+ for (int i = 0; i < gpu->num_formats; i++) {
+ const struct vk_format **vkfmt = PL_PRIV(gpu->formats[i]);
+ if ((*vkfmt)->tfmt == params->format) {
+ fmt = gpu->formats[i];
+ break;
+ }
+ }
+
+ if (!fmt) {
+ PL_ERR(gpu, "Could not find pl_fmt suitable for wrapped image "
+ "with format %s", vk_fmt_name(params->format));
+ return NULL;
+ }
+
+ VkImageUsageFlags usage = params->usage;
+ if (fmt->num_planes)
+ usage = 0; // mask capabilities from the base texture
+
+ struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk);
+ tex->params = (struct pl_tex_params) {
+ .format = fmt,
+ .w = params->width,
+ .h = params->height,
+ .d = params->depth,
+ .sampleable = !!(usage & VK_IMAGE_USAGE_SAMPLED_BIT),
+ .renderable = !!(usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT),
+ .storable = !!(usage & VK_IMAGE_USAGE_STORAGE_BIT),
+ .blit_src = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
+ .blit_dst = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
+ .host_writable = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
+ .host_readable = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
+ .user_data = params->user_data,
+ .debug_tag = params->debug_tag,
+ };
+
+ // Mask out capabilities not permitted by the `pl_fmt`
+#define MASK(field, cap) \
+ do { \
+ if (tex->params.field && !(fmt->caps & cap)) { \
+ PL_WARN(gpu, "Masking `" #field "` from wrapped texture because " \
+ "the corresponding format '%s' does not support " #cap, \
+ fmt->name); \
+ tex->params.field = false; \
+ } \
+ } while (0)
+
+ MASK(sampleable, PL_FMT_CAP_SAMPLEABLE);
+ MASK(renderable, PL_FMT_CAP_RENDERABLE);
+ MASK(storable, PL_FMT_CAP_STORABLE);
+ MASK(blit_src, PL_FMT_CAP_BLITTABLE);
+ MASK(blit_dst, PL_FMT_CAP_BLITTABLE);
+ MASK(host_readable, PL_FMT_CAP_HOST_READABLE);
+#undef MASK
+
+ // For simplicity, explicitly mask out blit emulation for wrapped textures
+ struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+ if (fmtp->blit_emulated) {
+ tex->params.blit_src = false;
+ tex->params.blit_dst = false;
+ }
+
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ switch (pl_tex_params_dimension(tex->params)) {
+ case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
+ case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
+ case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
+ }
+ tex_vk->external_img = true;
+ tex_vk->held = !fmt->num_planes;
+ tex_vk->img = params->image;
+ tex_vk->img_fmt = params->format;
+ tex_vk->num_planes = fmt->num_planes;
+ tex_vk->usage_flags = usage;
+ tex_vk->aspect = params->aspect;
+
+ if (!tex_vk->aspect) {
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+ tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT);
+ }
+
+ // Blitting to planar images requires fallback via compute shaders
+ if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) {
+ tex->params.blit_src &= tex->params.storable;
+ tex->params.blit_dst &= tex->params.storable;
+ }
+
+ static const char * const wrapped_plane_names[4] = {
+ "wrapped plane 0", "wrapped plane 1", "wrapped plane 2", "wrapped plane 3",
+ };
+
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ struct pl_tex_t *plane;
+ VkImageAspectFlags aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+ if (!(aspect & tex_vk->aspect)) {
+ PL_INFO(gpu, "Not wrapping plane %d due to aspect bit 0x%x not "
+ "being contained in supplied params->aspect 0x%x!",
+ i, (unsigned) aspect, (unsigned) tex_vk->aspect);
+ continue;
+ }
+
+ pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D);
+ plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+ .image = tex_vk->img,
+ .aspect = aspect,
+ .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x),
+ .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y),
+ .format = fmtp->vk_fmt->pfmt[i].fmt,
+ .usage = params->usage,
+ .user_data = params->user_data,
+ .debug_tag = PL_DEF(params->debug_tag, wrapped_plane_names[i]),
+ ));
+ if (!plane)
+ goto error;
+ plane->parent = tex;
+ tex->planes[i] = plane;
+ tex_vk->planes[i] = PL_PRIV(plane);
+ }
+
+ if (!vk_init_image(gpu, tex, PL_DEF(params->debug_tag, "wrapped")))
+ goto error;
+
+ return tex;
+
+error:
+ vk_tex_destroy(gpu, tex);
+ return NULL;
+}
+
+VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, VkFormat *out_format,
+ VkImageUsageFlags *out_flags)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ if (out_format)
+ *out_format = tex_vk->img_fmt;
+ if (out_flags)
+ *out_flags = tex_vk->usage_flags;
+
+ return tex_vk->img;
+}
+
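+// Typical interop flow (illustrative sketch only; variable names such as
+// `sem` and `done` are placeholders): hold the texture for external use,
+// have the external API wait on the semaphore, then release it afterwards:
+//
+//   pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+//       .tex       = tex,
+//       .layout    = VK_IMAGE_LAYOUT_GENERAL,
+//       .semaphore = (pl_vulkan_sem) { .sem = sem },
+//       .qf        = VK_QUEUE_FAMILY_EXTERNAL,
+//   ));
+//   // ... external user waits on `sem`, uses the image, signals `done` ...
+//   pl_vulkan_release_ex(gpu, pl_vulkan_release_params(
+//       .tex       = tex,
+//       .layout    = VK_IMAGE_LAYOUT_GENERAL,
+//       .semaphore = (pl_vulkan_sem) { .sem = done },
+//       .qf        = VK_QUEUE_FAMILY_EXTERNAL,
+//   ));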
+bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(params->tex);
+ pl_assert(params->semaphore.sem);
+
+ bool held = tex_vk->held;
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ held |= tex_vk->planes[i]->held;
+
+ if (held) {
+ PL_ERR(gpu, "Attempting to hold an already held image!");
+ return false;
+ }
+
+ struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed holding external image!");
+ return false;
+ }
+
+ VkImageLayout layout = params->layout;
+ if (params->out_layout) {
+ // For planar images, arbitrarily pick the current image layout of the
+ // first plane. This should be fine in practice, since all planes will
+ // share the same usage capabilities.
+ if (tex_vk->num_planes) {
+ layout = tex_vk->planes[0]->layout;
+ } else {
+ layout = tex_vk->layout;
+ }
+ }
+
+ bool may_invalidate = true;
+ if (!tex_vk->num_planes) {
+ may_invalidate &= tex_vk->may_invalidate;
+ vk_tex_barrier(gpu, cmd, params->tex, VK_PIPELINE_STAGE_2_NONE,
+ 0, layout, params->qf);
+ }
+
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ may_invalidate &= tex_vk->planes[i]->may_invalidate;
+ vk_tex_barrier(gpu, cmd, params->tex->planes[i],
+ VK_PIPELINE_STAGE_2_NONE, 0, layout, params->qf);
+ }
+
+ vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, params->semaphore);
+ bool ok = CMD_SUBMIT(&cmd);
+
+ if (!tex_vk->num_planes) {
+ tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL;
+ tex_vk->held = ok;
+ }
+
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ struct pl_tex_vk *plane_vk = tex_vk->planes[i];
+ plane_vk->sem.write.queue = plane_vk->sem.read.queue = NULL;
+ plane_vk->held = ok;
+ }
+
+ if (ok && params->out_layout)
+ *params->out_layout = may_invalidate ? VK_IMAGE_LAYOUT_UNDEFINED : layout;
+
+ return ok;
+}
+
+void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(params->tex);
+ if (tex_vk->num_planes) {
+ struct pl_vulkan_release_params plane_pars = *params;
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ plane_pars.tex = params->tex->planes[i];
+ pl_vulkan_release_ex(gpu, &plane_pars);
+ }
+ return;
+ }
+
+ if (!tex_vk->held) {
+ PL_ERR(gpu, "Attempting to release an unheld image?");
+ return;
+ }
+
+ if (params->semaphore.sem)
+ PL_ARRAY_APPEND(params->tex, tex_vk->ext_deps, params->semaphore);
+
+ tex_vk->qf = params->qf;
+ tex_vk->layout = params->layout;
+ tex_vk->held = false;
+}
+
+bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_out)
+{
+ return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ .tex = tex,
+ .layout = layout,
+ .semaphore = sem_out,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ ));
+}
+
+bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex,
+ VkImageLayout *out_layout,
+ pl_vulkan_sem sem_out)
+{
+ return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ .tex = tex,
+ .out_layout = out_layout,
+ .semaphore = sem_out,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ ));
+}
+
+void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_in)
+{
+ pl_vulkan_release_ex(gpu, pl_vulkan_release_params(
+ .tex = tex,
+ .layout = layout,
+ .semaphore = sem_in,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ ));
+}
diff --git a/src/vulkan/malloc.c b/src/vulkan/malloc.c
new file mode 100644
index 0000000..c35183b
--- /dev/null
+++ b/src/vulkan/malloc.c
@@ -0,0 +1,1058 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "malloc.h"
+#include "command.h"
+#include "utils.h"
+#include "pl_thread.h"
+
+#ifdef PL_HAVE_UNIX
+#include <errno.h>
+#include <unistd.h>
+#endif
+
+// Controls the page size alignment, to help coalesce allocations into the same
+// slab. Pages are rounded up to multiples of this value. (Default: 4 KB)
+#define PAGE_SIZE_ALIGN (1LLU << 12)
+
+// Controls the minimum/maximum number of pages for new slabs. As slabs are
+// exhausted of memory, the number of pages per new slab grows exponentially,
+// starting with the minimum until the maximum is reached.
+//
+// Note: The maximum must never exceed the size of `vk_slab.spacemap`.
+#define MINIMUM_PAGE_COUNT 4
+#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8)
+
+// Controls the maximum page size. Any allocations above this threshold
+// (absolute size or fraction of VRAM, whichever is higher) will be served by
+// dedicated allocations. (Default: 64 MB or 1/16 of VRAM)
+#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26)
+#define MAXIMUM_PAGE_SIZE_RELATIVE 16
+
+// Controls the minimum slab size, to avoid excessive re-allocation of very
+// small slabs. (Default: 256 KB)
+#define MINIMUM_SLAB_SIZE (1LLU << 18)
+
+// How long to wait before garbage collecting empty slabs. Slabs older than
+// this many invocations of `vk_malloc_garbage_collect` will be released.
+#define MAXIMUM_SLAB_AGE 32
+
+// A single slab represents a contiguous region of allocated memory. Actual
+// allocations are served as pages of this. Slabs are organized into pools,
+// each of which contains a list of slabs of differing page sizes.
+struct vk_slab {
+ pl_mutex lock;
+ pl_debug_tag debug_tag; // debug tag of the triggering allocation
+ VkDeviceMemory mem; // underlying device allocation
+ VkDeviceSize size; // total allocated size of `mem`
+ VkMemoryType mtype; // underlying memory type
+ bool dedicated; // slab is allocated specifically for one object
+ bool imported; // slab represents an imported memory allocation
+
+ // free space accounting (only for non-dedicated slabs)
+ uint64_t spacemap; // bitset of available pages
+ size_t pagesize; // size in bytes per page
+ size_t used; // number of bytes actually in use
+ uint64_t age; // timestamp of last use
+
+ // optional, depends on the memory type:
+ VkBuffer buffer; // buffer spanning the entire slab
+ void *data; // mapped memory corresponding to `mem`
+ bool coherent; // mapped memory is coherent
+ union pl_handle handle; // handle associated with this device memory
+ enum pl_handle_type handle_type;
+};
+
+// Represents a single memory pool. We keep track of a vk_pool for each
+// combination of malloc parameters. This shouldn't actually be that many in
+// practice, because some combinations simply never occur, and others will
+// generally be the same for the same objects.
+//
+// Note: `vk_pool` addresses are not immutable, so we mustn't expose any
+// dangling references to a `vk_pool` from e.g. `vk_memslice.priv = vk_slab`.
+struct vk_pool {
+ struct vk_malloc_params params; // allocation params (with some fields nulled)
+ PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted
+ int index; // running index in `vk_malloc.pools`
+};
+
+// The overall state of the allocator, which keeps track of a vk_pool for each
+// memory type.
+struct vk_malloc {
+ struct vk_ctx *vk;
+ pl_mutex lock;
+ VkPhysicalDeviceMemoryProperties props;
+ size_t maximum_page_size;
+ PL_ARRAY(struct vk_pool) pools;
+ uint64_t age;
+};
+
+static inline float efficiency(size_t used, size_t total)
+{
+ if (!total)
+ return 100.0;
+
+ return 100.0f * used / total;
+}
+
+static const char *print_size(char buf[8], size_t size)
+{
+ const char *suffixes = "\0KMG";
+ while (suffixes[1] && size > 9999) {
+ size >>= 10;
+ suffixes++;
+ }
+
+ int ret = *suffixes ? snprintf(buf, 8, "%4zu%c", size, *suffixes)
+ : snprintf(buf, 8, "%5zu", size);
+
+ return ret >= 0 ? buf : "(error)";
+}
+
+#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x)))
+
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev)
+{
+ struct vk_ctx *vk = ma->vk;
+ size_t total_size = 0;
+ size_t total_used = 0;
+ size_t total_res = 0;
+
+ PL_MSG(vk, lev, "Memory heaps supported by device:");
+ for (int i = 0; i < ma->props.memoryHeapCount; i++) {
+ VkMemoryHeap heap = ma->props.memoryHeaps[i];
+ PL_MSG(vk, lev, " %d: flags 0x%x size %s",
+ i, (unsigned) heap.flags, PRINT_SIZE(heap.size));
+ }
+
+ PL_DEBUG(vk, "Memory types supported by device:");
+ for (int i = 0; i < ma->props.memoryTypeCount; i++) {
+ VkMemoryType type = ma->props.memoryTypes[i];
+ PL_DEBUG(vk, " %d: flags 0x%x heap %d",
+ i, (unsigned) type.propertyFlags, (int) type.heapIndex);
+ }
+
+ pl_mutex_lock(&ma->lock);
+ for (int i = 0; i < ma->pools.num; i++) {
+ struct vk_pool *pool = &ma->pools.elem[i];
+ const struct vk_malloc_params *par = &pool->params;
+
+ PL_MSG(vk, lev, "Memory pool %d:", i);
+ PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits);
+ if (par->required)
+ PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required);
+ if (par->optimal)
+ PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal);
+ if (par->buf_usage)
+ PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage);
+ if (par->export_handle)
+ PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle);
+
+ size_t pool_size = 0;
+ size_t pool_used = 0;
+ size_t pool_res = 0;
+
+ for (int j = 0; j < pool->slabs.num; j++) {
+ struct vk_slab *slab = pool->slabs.elem[j];
+ pl_mutex_lock(&slab->lock);
+
+ size_t avail = __builtin_popcountll(slab->spacemap) * slab->pagesize;
+ size_t slab_res = slab->size - avail;
+
+ PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: "
+ "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]",
+ j, slab->spacemap, PRINT_SIZE(slab->pagesize),
+ PRINT_SIZE(slab->used), PRINT_SIZE(slab_res),
+ PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex,
+ efficiency(slab->used, slab_res),
+ PL_DEF(slab->debug_tag, "unknown"));
+
+ pool_size += slab->size;
+ pool_used += slab->used;
+ pool_res += slab_res;
+ pl_mutex_unlock(&slab->lock);
+ }
+
+ PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, "
+ "efficiency %.2f%%, utilization %.2f%%",
+ PRINT_SIZE(pool_used), PRINT_SIZE(pool_res),
+ PRINT_SIZE(pool_size), efficiency(pool_used, pool_res),
+ efficiency(pool_res, pool_size));
+
+ total_size += pool_size;
+ total_used += pool_used;
+ total_res += pool_res;
+ }
+ pl_mutex_unlock(&ma->lock);
+
+ PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, "
+ "efficiency %.2f%%, utilization %.2f%%, max page: %s",
+ PRINT_SIZE(total_used), PRINT_SIZE(total_res),
+ PRINT_SIZE(total_size), efficiency(total_used, total_res),
+ efficiency(total_res, total_size),
+ PRINT_SIZE(ma->maximum_page_size));
+}
+
+static void slab_free(struct vk_ctx *vk, struct vk_slab *slab)
+{
+ if (!slab)
+ return;
+
+#ifndef NDEBUG
+ if (!slab->dedicated && slab->used > 0) {
+ PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used);
+ PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64,
+ (size_t) slab->size, (int) slab->mtype.heapIndex,
+ (uint64_t) slab->mtype.propertyFlags);
+ if (slab->debug_tag)
+ PL_WARN(vk, "last used for: %s", slab->debug_tag);
+ pl_log_stack_trace(vk->log, PL_LOG_WARN);
+ pl_debug_abort();
+ }
+#endif
+
+ if (slab->imported) {
+ switch (slab->handle_type) {
+ case PL_HANDLE_FD:
+ case PL_HANDLE_DMA_BUF:
+ PL_TRACE(vk, "Unimporting slab of size %s from fd: %d",
+ PRINT_SIZE(slab->size), slab->handle.fd);
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+#ifdef PL_HAVE_WIN32
+ PL_TRACE(vk, "Unimporting slab of size %s from handle: %p",
+ PRINT_SIZE(slab->size), (void *) slab->handle.handle);
+#endif
+ break;
+ case PL_HANDLE_HOST_PTR:
+ PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p",
+ PRINT_SIZE(slab->size), (void *) slab->handle.ptr);
+ break;
+ case PL_HANDLE_IOSURFACE:
+ case PL_HANDLE_MTL_TEX:
+ pl_unreachable();
+ }
+ } else {
+ switch (slab->handle_type) {
+ case PL_HANDLE_FD:
+ case PL_HANDLE_DMA_BUF:
+#ifdef PL_HAVE_UNIX
+ if (slab->handle.fd > -1)
+ close(slab->handle.fd);
+#endif
+ break;
+ case PL_HANDLE_WIN32:
+#ifdef PL_HAVE_WIN32
+ if (slab->handle.handle != NULL)
+ CloseHandle(slab->handle.handle);
+#endif
+ break;
+ case PL_HANDLE_WIN32_KMT:
+ // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+ break;
+ case PL_HANDLE_HOST_PTR:
+ // Implicitly unmapped
+ break;
+ case PL_HANDLE_IOSURFACE:
+ case PL_HANDLE_MTL_TEX:
+ pl_unreachable();
+ }
+
+ PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size));
+ }
+
+ vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC);
+ // also implicitly unmaps the memory if needed
+ vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC);
+
+ pl_mutex_destroy(&slab->lock);
+ pl_free(slab);
+}
+
+// type_mask: optional
+// thread-safety: safe
+static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask,
+ const struct vk_malloc_params *params,
+ uint32_t *out_index)
+{
+ struct vk_ctx *vk = ma->vk;
+ int best = -1;
+
+ // The vulkan spec requires memory types to be sorted in the "optimal"
+ // order, so the first matching type we find will be the best/fastest one.
+ // That being said, we still want to prioritize memory types that have
+ // better optional flags.
+
+ type_mask &= params->reqs.memoryTypeBits;
+ for (int i = 0; i < ma->props.memoryTypeCount; i++) {
+ const VkMemoryType *mtype = &ma->props.memoryTypes[i];
+
+ // The memory type flags must include our properties
+ if ((mtype->propertyFlags & params->required) != params->required)
+ continue;
+
+ // The memory heap must be large enough for the allocation
+ VkDeviceSize heapSize = ma->props.memoryHeaps[mtype->heapIndex].size;
+ if (params->reqs.size > heapSize)
+ continue;
+
+ // The memory type must be supported by the type mask (bitfield)
+ if (!(type_mask & (1LU << i)))
+ continue;
+
+ // Calculate the score as the number of optimal property flags matched
+ int score = __builtin_popcountl(mtype->propertyFlags & params->optimal);
+ if (score > best) {
+ *out_index = i;
+ best = score;
+ }
+ }
+
+ if (best < 0) {
+ PL_ERR(vk, "Found no memory type matching property flags 0x%x and type "
+ "bits 0x%x!",
+ (unsigned) params->required, (unsigned) type_mask);
+ return false;
+ }
+
+ return true;
+}
+
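+// Check whether buffers with the given usage can be exported to (or imported
+// from) the given handle type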
+static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage,
+ enum pl_handle_type handle_type, bool import)
+{
+ if (!handle_type)
+ return true;
+
+ VkPhysicalDeviceExternalBufferInfo info = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR,
+ .usage = usage,
+ .handleType = vk_mem_handle_type(handle_type),
+ };
+
+ VkExternalBufferProperties props = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR,
+ };
+
+ if (!info.handleType)
+ return false;
+
+ vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &info, &props);
+ return vk_external_mem_check(vk, &props.externalMemoryProperties,
+ handle_type, import);
+}
+
+// thread-safety: safe
+static struct vk_slab *slab_alloc(struct vk_malloc *ma,
+ const struct vk_malloc_params *params)
+{
+ struct vk_ctx *vk = ma->vk;
+ struct vk_slab *slab = pl_alloc_ptr(NULL, slab);
+ *slab = (struct vk_slab) {
+ .age = ma->age,
+ .size = params->reqs.size,
+ .handle_type = params->export_handle,
+ .debug_tag = params->debug_tag,
+ };
+ pl_mutex_init(&slab->lock);
+
+ switch (slab->handle_type) {
+ case PL_HANDLE_FD:
+ case PL_HANDLE_DMA_BUF:
+ slab->handle.fd = -1;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ slab->handle.handle = NULL;
+ break;
+ case PL_HANDLE_HOST_PTR:
+ slab->handle.ptr = NULL;
+ break;
+ }
+
+ VkExportMemoryAllocateInfoKHR ext_info = {
+ .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR,
+ .handleTypes = vk_mem_handle_type(slab->handle_type),
+ };
+
+ uint32_t type_mask = UINT32_MAX;
+ if (params->buf_usage) {
+ // Queue family sharing modes don't matter for buffers, so we just
+ // set them as concurrent and stop worrying about it.
+ uint32_t qfs[3] = {0};
+ pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+ for (int i = 0; i < vk->pools.num; i++)
+ qfs[i] = vk->pools.elem[i]->qf;
+
+ VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
+ .handleTypes = ext_info.handleTypes,
+ };
+
+ VkBufferCreateInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .pNext = slab->handle_type ? &ext_buf_info : NULL,
+ .size = slab->size,
+ .usage = params->buf_usage,
+ .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->pools.num,
+ .pQueueFamilyIndices = qfs,
+ };
+
+ if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) {
+ PL_ERR(vk, "Failed allocating shared memory buffer: possibly "
+ "the handle type is unsupported?");
+ goto error;
+ }
+
+ VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer));
+ PL_VK_NAME(BUFFER, slab->buffer, "slab");
+
+ VkMemoryRequirements reqs = {0};
+ vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs);
+ slab->size = reqs.size; // this can be larger than `slab->size`
+ type_mask = reqs.memoryTypeBits;
+
+        // Note: we can ignore `reqs.alignment` because we always bind the
+        // buffer memory to offset 0
+ }
+
+ VkMemoryAllocateInfo minfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .allocationSize = slab->size,
+ };
+
+ if (params->export_handle)
+ vk_link_struct(&minfo, &ext_info);
+
+ VkMemoryDedicatedAllocateInfoKHR dinfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
+ .image = params->ded_image,
+ };
+
+ if (params->ded_image)
+ vk_link_struct(&minfo, &dinfo);
+
+ if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex))
+ goto error;
+
+ const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex];
+ PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s",
+ (size_t) slab->size, (unsigned) mtype->propertyFlags,
+ (int) minfo.memoryTypeIndex, (int) mtype->heapIndex,
+ PL_DEF(params->debug_tag, "unknown"));
+
+ pl_clock_t start = pl_clock_now();
+
+ VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem);
+ switch (res) {
+ case VK_ERROR_OUT_OF_DEVICE_MEMORY:
+ case VK_ERROR_OUT_OF_HOST_MEMORY:
+ PL_ERR(vk, "Allocation of size %s failed: %s!",
+ PRINT_SIZE(slab->size), vk_res_str(res));
+ vk_malloc_print_stats(ma, PL_LOG_ERR);
+ pl_log_stack_trace(vk->log, PL_LOG_ERR);
+ pl_debug_abort();
+ goto error;
+
+ default:
+ PL_VK_ASSERT(res, "vkAllocateMemory");
+ }
+
+ slab->mtype = *mtype;
+ if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+ VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
+ slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+ }
+
+ if (slab->buffer)
+ VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0));
+
+#ifdef PL_HAVE_UNIX
+ if (slab->handle_type == PL_HANDLE_FD ||
+ slab->handle_type == PL_HANDLE_DMA_BUF)
+ {
+ VkMemoryGetFdInfoKHR fd_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
+ .memory = slab->mem,
+ .handleType = ext_info.handleTypes,
+ };
+
+ VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd));
+ }
+#endif
+
+#ifdef PL_HAVE_WIN32
+ if (slab->handle_type == PL_HANDLE_WIN32 ||
+ slab->handle_type == PL_HANDLE_WIN32_KMT)
+ {
+ VkMemoryGetWin32HandleInfoKHR handle_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
+ .memory = slab->mem,
+ .handleType = ext_info.handleTypes,
+ };
+
+ VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info,
+ &slab->handle.handle));
+ }
+#endif
+
+ pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab");
+
+ // free space accounting is done by the caller
+ return slab;
+
+error:
+ if (params->debug_tag)
+ PL_ERR(vk, " for malloc: %s", params->debug_tag);
+ slab_free(vk, slab);
+ return NULL;
+}
+
+static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool)
+{
+ for (int i = 0; i < pool->slabs.num; i++)
+ slab_free(vk, pool->slabs.elem[i]);
+
+ pl_free(pool->slabs.elem);
+ *pool = (struct vk_pool) {0};
+}
+
+struct vk_malloc *vk_malloc_create(struct vk_ctx *vk)
+{
+ struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma);
+ pl_mutex_init(&ma->lock);
+ vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props);
+ ma->vk = vk;
+
+ // Determine maximum page size
+ ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE;
+ for (int i = 0; i < ma->props.memoryHeapCount; i++) {
+ VkMemoryHeap heap = ma->props.memoryHeaps[i];
+ if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
+ size_t size_max = heap.size / MAXIMUM_PAGE_SIZE_RELATIVE;
+ ma->maximum_page_size = PL_MAX(ma->maximum_page_size, size_max);
+ }
+ }
+
+ vk_malloc_print_stats(ma, PL_LOG_INFO);
+ return ma;
+}
+
+void vk_malloc_destroy(struct vk_malloc **ma_ptr)
+{
+ struct vk_malloc *ma = *ma_ptr;
+ if (!ma)
+ return;
+
+ vk_malloc_print_stats(ma, PL_LOG_DEBUG);
+ for (int i = 0; i < ma->pools.num; i++)
+ pool_uninit(ma->vk, &ma->pools.elem[i]);
+
+ pl_mutex_destroy(&ma->lock);
+ pl_free_ptr(ma_ptr);
+}
+
+void vk_malloc_garbage_collect(struct vk_malloc *ma)
+{
+ struct vk_ctx *vk = ma->vk;
+
+ pl_mutex_lock(&ma->lock);
+ ma->age++;
+
+ for (int i = 0; i < ma->pools.num; i++) {
+ struct vk_pool *pool = &ma->pools.elem[i];
+ for (int n = 0; n < pool->slabs.num; n++) {
+ struct vk_slab *slab = pool->slabs.elem[n];
+ pl_mutex_lock(&slab->lock);
+ if (slab->used || (ma->age - slab->age) <= MAXIMUM_SLAB_AGE) {
+ pl_mutex_unlock(&slab->lock);
+ continue;
+ }
+
+ PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d",
+ PRINT_SIZE(slab->size), pool->index);
+
+ pl_mutex_unlock(&slab->lock);
+ slab_free(ma->vk, slab);
+ PL_ARRAY_REMOVE_AT(pool->slabs, n--);
+ }
+ }
+
+ pl_mutex_unlock(&ma->lock);
+}
+
+pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import)
+{
+ struct vk_ctx *vk = ma->vk;
+ pl_handle_caps caps = 0;
+
+ for (int i = 0; vk_mem_handle_list[i]; i++) {
+ // Try seeing if we could allocate a "basic" buffer using these
+ // capabilities, with no fancy buffer usage. More specific checks will
+ // happen down the line at VkBuffer creation time, but this should give
+ // us a rough idea of what the driver supports.
+ enum pl_handle_type type = vk_mem_handle_list[i];
+ if (buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT, type, import))
+ caps |= type;
+ }
+
+ return caps;
+}
+
+void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice)
+{
+ struct vk_ctx *vk = ma->vk;
+ struct vk_slab *slab = slice->priv;
+ if (!slab || slab->dedicated) {
+ slab_free(vk, slab);
+ goto done;
+ }
+
+ pl_mutex_lock(&slab->lock);
+
+ int page_idx = slice->offset / slab->pagesize;
+ slab->spacemap |= 0x1LLU << page_idx;
+ slab->used -= slice->size;
+ slab->age = ma->age;
+ pl_assert(slab->used >= 0);
+
+ pl_mutex_unlock(&slab->lock);
+
+done:
+ *slice = (struct vk_memslice) {0};
+}
+
+static inline bool pool_params_eq(const struct vk_malloc_params *a,
+ const struct vk_malloc_params *b)
+{
+ return a->reqs.size == b->reqs.size &&
+ a->reqs.alignment == b->reqs.alignment &&
+ a->reqs.memoryTypeBits == b->reqs.memoryTypeBits &&
+ a->required == b->required &&
+ a->optimal == b->optimal &&
+ a->buf_usage == b->buf_usage &&
+ a->export_handle == b->export_handle;
+}
+
+static struct vk_pool *find_pool(struct vk_malloc *ma,
+ const struct vk_malloc_params *params)
+{
+ pl_assert(!params->import_handle);
+ pl_assert(!params->ded_image);
+
+ struct vk_malloc_params fixed = *params;
+ fixed.reqs.alignment = 0;
+ fixed.reqs.size = 0;
+ fixed.shared_mem = (struct pl_shared_mem) {0};
+
+ for (int i = 0; i < ma->pools.num; i++) {
+ if (pool_params_eq(&ma->pools.elem[i].params, &fixed))
+ return &ma->pools.elem[i];
+ }
+
+ // Not found => add it
+ PL_ARRAY_GROW(ma, ma->pools);
+ size_t idx = ma->pools.num++;
+ ma->pools.elem[idx] = (struct vk_pool) {
+ .params = fixed,
+ .index = idx,
+ };
+ return &ma->pools.elem[idx];
+}
+
+// Returns a suitable memory page from the pool. A new slab will be allocated
+// under the hood, if necessary.
+//
+// Note: This locks the slab it returns
+static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool,
+ size_t size, size_t align,
+ VkDeviceSize *offset)
+{
+ struct vk_slab *slab = NULL;
+ int slab_pages = MINIMUM_PAGE_COUNT;
+ size = PL_ALIGN2(size, PAGE_SIZE_ALIGN);
+ const size_t pagesize = PL_ALIGN(size, align);
+
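+    // First, try to reuse a free page from an existing slab whose page size
+    // is compatible: large enough, not wastefully larger, and suitably aligned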
+ for (int i = 0; i < pool->slabs.num; i++) {
+ slab = pool->slabs.elem[i];
+ if (slab->pagesize < size)
+ continue;
+ if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic
+ continue;
+ if (slab->pagesize % align)
+ continue;
+
+ pl_mutex_lock(&slab->lock);
+ int page_idx = __builtin_ffsll(slab->spacemap);
+ if (!page_idx--) {
+ pl_mutex_unlock(&slab->lock);
+            // Increase the number of pages to allocate for new slabs, the
+            // more existing full slabs there are in this size range
+ slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT);
+ continue;
+ }
+
+ slab->spacemap ^= 0x1LLU << page_idx;
+ *offset = page_idx * slab->pagesize;
+ return slab;
+ }
+
+ // Otherwise, allocate a new vk_slab and append it to the list.
+ VkDeviceSize slab_size = slab_pages * pagesize;
+ pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT);
+ const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT;
+ pl_assert(pagesize <= ma->maximum_page_size);
+ slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size);
+ slab_pages = slab_size / pagesize;
+ slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess
+
+ struct vk_malloc_params params = pool->params;
+ params.reqs.size = slab_size;
+
+ // Don't hold the lock while allocating the slab, because it can be a
+ // potentially very costly operation.
+ pl_mutex_unlock(&ma->lock);
+ slab = slab_alloc(ma, &params);
+ pl_mutex_lock(&ma->lock);
+ if (!slab)
+ return NULL;
+ pl_mutex_lock(&slab->lock);
+
+ slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages);
+ slab->pagesize = pagesize;
+ PL_ARRAY_APPEND(NULL, pool->slabs, slab);
+
+ // Return the first page in this newly allocated slab
+ slab->spacemap ^= 0x1;
+ *offset = 0;
+ return slab;
+}
+
+static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out,
+ const struct vk_malloc_params *params)
+{
+ struct vk_ctx *vk = ma->vk;
+ VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type;
+ vk_handle_type = vk_mem_handle_type(params->import_handle);
+
+ struct vk_slab *slab = NULL;
+ const struct pl_shared_mem *shmem = &params->shared_mem;
+
+ VkMemoryDedicatedAllocateInfoKHR dinfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
+ .image = params->ded_image,
+ };
+
+ VkImportMemoryFdInfoKHR fdinfo = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
+ .handleType = vk_handle_type,
+ .fd = -1,
+ };
+
+ VkImportMemoryHostPointerInfoEXT ptrinfo = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
+ .handleType = vk_handle_type,
+ };
+
+ VkMemoryAllocateInfo ainfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .allocationSize = shmem->size,
+ };
+
+ if (params->ded_image)
+ vk_link_struct(&ainfo, &dinfo);
+
+ VkBuffer buffer = VK_NULL_HANDLE;
+ VkMemoryRequirements reqs = params->reqs;
+
+ if (params->buf_usage) {
+ uint32_t qfs[3] = {0};
+ pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+ for (int i = 0; i < vk->pools.num; i++)
+ qfs[i] = vk->pools.elem[i]->qf;
+
+ VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
+ .handleTypes = vk_handle_type,
+ };
+
+ VkBufferCreateInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .pNext = &ext_buf_info,
+ .size = shmem->size,
+ .usage = params->buf_usage,
+ .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->pools.num,
+ .pQueueFamilyIndices = qfs,
+ };
+
+ VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer));
+ PL_VK_NAME(BUFFER, buffer, "imported");
+
+ vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs);
+ }
+
+ if (reqs.size > shmem->size) {
+ PL_ERR(vk, "Imported object requires %zu bytes, larger than the "
+ "provided size %zu!",
+ (size_t) reqs.size, shmem->size);
+ goto error;
+ }
+
+ if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) {
+ PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!",
+ shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment));
+ goto error;
+ }
+
+ switch (params->import_handle) {
+#ifdef PL_HAVE_UNIX
+ case PL_HANDLE_DMA_BUF: {
+ if (!vk->GetMemoryFdPropertiesKHR) {
+ PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.",
+ VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME);
+ goto error;
+ }
+
+ VkMemoryFdPropertiesKHR fdprops = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
+ };
+
+ VK(vk->GetMemoryFdPropertiesKHR(vk->dev,
+ vk_handle_type,
+ shmem->handle.fd,
+ &fdprops));
+
+ // We dup() the fd to make it safe to import the same original fd
+ // multiple times.
+ fdinfo.fd = dup(shmem->handle.fd);
+ if (fdinfo.fd == -1) {
+ PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s",
+ fdinfo.fd, strerror(errno));
+ goto error;
+ }
+
+ reqs.memoryTypeBits &= fdprops.memoryTypeBits;
+ vk_link_struct(&ainfo, &fdinfo);
+ break;
+ }
+#else // !PL_HAVE_UNIX
+ case PL_HANDLE_DMA_BUF:
+ PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!");
+ goto error;
+#endif
+
+ case PL_HANDLE_HOST_PTR: {
+ VkMemoryHostPointerPropertiesEXT ptrprops = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT,
+ };
+
+ VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type,
+ shmem->handle.ptr,
+ &ptrprops));
+
+ ptrinfo.pHostPointer = (void *) shmem->handle.ptr;
+ reqs.memoryTypeBits &= ptrprops.memoryTypeBits;
+ vk_link_struct(&ainfo, &ptrinfo);
+ break;
+ }
+
+ case PL_HANDLE_FD:
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_IOSURFACE:
+ case PL_HANDLE_MTL_TEX:
+ PL_ERR(vk, "vk_malloc_import: unsupported handle type %d",
+ params->import_handle);
+ goto error;
+ }
+
+ if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) {
+ PL_ERR(vk, "No compatible memory types offered for imported memory!");
+ goto error;
+ }
+
+ VkDeviceMemory vkmem = VK_NULL_HANDLE;
+ VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem));
+
+ slab = pl_alloc_ptr(NULL, slab);
+ *slab = (struct vk_slab) {
+ .mem = vkmem,
+ .dedicated = true,
+ .imported = true,
+ .buffer = buffer,
+ .size = shmem->size,
+ .handle_type = params->import_handle,
+ };
+ pl_mutex_init(&slab->lock);
+
+ *out = (struct vk_memslice) {
+ .vkmem = vkmem,
+ .buf = buffer,
+ .size = shmem->size - shmem->offset,
+ .offset = shmem->offset,
+ .shared_mem = *shmem,
+ .priv = slab,
+ };
+
+ switch (params->import_handle) {
+ case PL_HANDLE_DMA_BUF:
+ case PL_HANDLE_FD:
+ PL_TRACE(vk, "Imported %s bytes from fd: %d%s",
+ PRINT_SIZE(slab->size), shmem->handle.fd,
+ params->ded_image ? " (dedicated)" : "");
+ // fd ownership is transferred at this point.
+ slab->handle.fd = fdinfo.fd;
+ fdinfo.fd = -1;
+ break;
+ case PL_HANDLE_HOST_PTR:
+ PL_TRACE(vk, "Imported %s bytes from ptr: %p%s",
+ PRINT_SIZE(slab->size), shmem->handle.ptr,
+                 params->ded_image ? " (dedicated)" : "");
+ slab->handle.ptr = ptrinfo.pHostPointer;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_IOSURFACE:
+ case PL_HANDLE_MTL_TEX:
+ break;
+ }
+
+ VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags;
+ if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+ VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
+ slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+ out->data = (uint8_t *) slab->data + out->offset;
+ out->coherent = slab->coherent;
+ if (!slab->coherent) {
+ // Use entire buffer range, since this is a dedicated memory
+ // allocation. This avoids issues with noncoherent atomicity
+ out->map_offset = 0;
+ out->map_size = VK_WHOLE_SIZE;
+
+ // Mapping does not implicitly invalidate mapped memory
+ VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = slab->mem,
+ .offset = out->map_offset,
+ .size = out->map_size,
+ }));
+ }
+ }
+
+ if (buffer)
+ VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0));
+
+ return true;
+
+error:
+ if (params->debug_tag)
+ PL_ERR(vk, " for malloc: %s", params->debug_tag);
+ vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC);
+#ifdef PL_HAVE_UNIX
+ if (fdinfo.fd > -1)
+ close(fdinfo.fd);
+#endif
+ pl_free(slab);
+ *out = (struct vk_memslice) {0};
+ return false;
+}
+
+size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags)
+{
+ size_t avail = 0;
+ for (int i = 0; i < ma->props.memoryTypeCount; i++) {
+ const VkMemoryType *mtype = &ma->props.memoryTypes[i];
+ if ((mtype->propertyFlags & flags) != flags)
+ continue;
+ avail = PL_MAX(avail, ma->props.memoryHeaps[mtype->heapIndex].size);
+ }
+
+ return avail;
+}
+
+bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
+ const struct vk_malloc_params *params)
+{
+ struct vk_ctx *vk = ma->vk;
+ pl_assert(!params->import_handle || !params->export_handle);
+ if (params->import_handle)
+ return vk_malloc_import(ma, out, params);
+
+ pl_assert(params->reqs.size);
+ size_t size = params->reqs.size;
+ size_t align = params->reqs.alignment;
+ align = pl_lcm(align, vk->props.limits.bufferImageGranularity);
+ align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize);
+
+ struct vk_slab *slab;
+ VkDeviceSize offset;
+
+ if (params->ded_image || size > ma->maximum_page_size) {
+ slab = slab_alloc(ma, params);
+ if (!slab)
+ return false;
+ slab->dedicated = true;
+ offset = 0;
+ } else {
+ pl_mutex_lock(&ma->lock);
+ struct vk_pool *pool = find_pool(ma, params);
+ slab = pool_get_page(ma, pool, size, align, &offset);
+ pl_mutex_unlock(&ma->lock);
+ if (!slab) {
+ PL_ERR(ma->vk, "No slab to serve request for %s bytes (with "
+ "alignment 0x%zx) in pool %d!",
+ PRINT_SIZE(size), align, pool->index);
+ return false;
+ }
+
+ // For accounting, just treat the alignment as part of the used size.
+ // Doing it this way makes sure that the sizes reported to vk_memslice
+ // consumers are always aligned properly.
+ size = PL_ALIGN(size, align);
+ slab->used += size;
+ slab->age = ma->age;
+ if (params->debug_tag)
+ slab->debug_tag = params->debug_tag;
+ pl_mutex_unlock(&slab->lock);
+ }
+
+ pl_assert(offset % align == 0);
+ *out = (struct vk_memslice) {
+ .vkmem = slab->mem,
+ .offset = offset,
+ .size = size,
+ .buf = slab->buffer,
+ .data = slab->data ? (uint8_t *) slab->data + offset : 0x0,
+ .coherent = slab->coherent,
+ .map_offset = slab->data ? offset : 0,
+ .map_size = slab->data ? size : 0,
+ .priv = slab,
+ .shared_mem = {
+ .handle = slab->handle,
+ .offset = offset,
+ .size = slab->size,
+ },
+ };
+ return true;
+}
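As a sanity check on the bookkeeping above: the allocator tracks page occupancy with a 64-bit spacemap, where `~(~0LLU << slab_pages)` marks the first `slab_pages` pages free, `__builtin_ffsll` locates the lowest free page, and freeing a slice ORs the bit back in. The following standalone sketch (hypothetical, not part of the patch) exercises just that bit arithmetic:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int slab_pages = 16;
        // One bit per page; the low `slab_pages` bits start out free
        uint64_t spacemap = (slab_pages == 64) ? ~0LLU : ~(~0LLU << slab_pages);

        // "Allocate": lowest set bit is the first free page (ffsll is 1-based)
        int page_idx = __builtin_ffsll(spacemap);
        assert(page_idx != 0);          // 0 would mean "slab is full"
        page_idx--;
        spacemap ^= 0x1LLU << page_idx; // clear the bit -> page in use
        printf("got page %d\n", page_idx);

        // "Free": set the bit again, as vk_malloc_free does
        spacemap |= 0x1LLU << page_idx;
        assert(spacemap == ~(~0LLU << slab_pages));
        return 0;
    }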
diff --git a/src/vulkan/malloc.h b/src/vulkan/malloc.h
new file mode 100644
index 0000000..115352e
--- /dev/null
+++ b/src/vulkan/malloc.h
@@ -0,0 +1,72 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// All memory allocated from a vk_malloc MUST be explicitly released by
+// the caller before vk_malloc_destroy is called.
+struct vk_malloc *vk_malloc_create(struct vk_ctx *vk);
+void vk_malloc_destroy(struct vk_malloc **ma);
+
+// Get the supported handle types for this malloc instance
+pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import);
+
+// Represents a single "slice" of generic (non-buffer) memory, plus some
+// metadata for accounting. This struct is essentially read-only.
+struct vk_memslice {
+ VkDeviceMemory vkmem;
+ VkDeviceSize offset;
+ VkDeviceSize size;
+ void *priv;
+ // depending on the type/flags:
+ struct pl_shared_mem shared_mem;
+ VkBuffer buf; // associated buffer (when `buf_usage` is nonzero)
+ void *data; // pointer to slice (for persistently mapped slices)
+ bool coherent; // whether `data` is coherent
+ VkDeviceSize map_offset; // can be larger than offset/size
+ VkDeviceSize map_size;
+};
+
+struct vk_malloc_params {
+ VkMemoryRequirements reqs;
+ VkMemoryPropertyFlags required;
+ VkMemoryPropertyFlags optimal;
+ VkBufferUsageFlags buf_usage;
+ VkImage ded_image; // for dedicated image allocations
+ enum pl_handle_type export_handle;
+ enum pl_handle_type import_handle;
+ struct pl_shared_mem shared_mem; // for `import_handle`
+ pl_debug_tag debug_tag;
+};
+
+// Returns the amount of available memory matching a given set of property
+// flags. Always returns the highest single allocation, not the combined total.
+size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags);
+
+bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
+ const struct vk_malloc_params *params);
+
+void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice);
+
+// Clean up unused slabs. Call this roughly once per frame to reduce
+// memory pressure / memory leaks.
+void vk_malloc_garbage_collect(struct vk_malloc *ma);
+
+// For debugging purposes. Doesn't include dedicated slab allocations!
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level);
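Put together, the intended lifecycle of this interface is: create the allocator, request slices, release every slice, garbage-collect periodically, then destroy. A rough usage sketch follows; the function name `alloc_scratch`, the 64 KiB size and the usage/property flags are illustrative placeholders, not part of the API.

    #include "malloc.h"

    // Hypothetical caller; assumes a valid `struct vk_ctx *vk` from vulkan/context.c
    static bool alloc_scratch(struct vk_ctx *vk)
    {
        struct vk_malloc *ma = vk_malloc_create(vk);
        if (!ma)
            return false;

        struct vk_memslice slice;
        struct vk_malloc_params params = {
            .reqs = {
                .size           = 64 * 1024,
                .alignment      = 256,
                .memoryTypeBits = UINT32_MAX, // no restriction
            },
            .required  = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
            .optimal   = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
            .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        };

        bool ok = vk_malloc_slice(ma, &slice, &params);
        if (ok) {
            // ... write through slice.data, copy from slice.buf at slice.offset ...
            vk_malloc_free(ma, &slice);   // must precede vk_malloc_destroy
        }

        vk_malloc_garbage_collect(ma);    // e.g. once per frame
        vk_malloc_destroy(&ma);
        return ok;
    }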
diff --git a/src/vulkan/meson.build b/src/vulkan/meson.build
new file mode 100644
index 0000000..64c5572
--- /dev/null
+++ b/src/vulkan/meson.build
@@ -0,0 +1,59 @@
+vulkan_build = get_option('vulkan')
+vulkan_link = get_option('vk-proc-addr')
+vulkan_loader = dependency('vulkan', required: false)
+vulkan_headers = vulkan_loader.partial_dependency(includes: true, compile_args: true)
+registry_xml = get_option('vulkan-registry')
+
+# Prefer our Vulkan headers for portability
+vulkan_headers_dir = thirdparty/'Vulkan-Headers'
+vulkan_headers_inc = include_directories()
+if fs.is_dir(vulkan_headers_dir/'include')
+ vulkan_headers = declare_dependency()
+ vulkan_headers_inc = include_directories('../../3rdparty/Vulkan-Headers/include')
+ # Force the use of this vk.xml because it has to be in sync with the headers
+ registry_xml = vulkan_headers_dir/'registry/vk.xml'
+endif
+
+vulkan_build = vulkan_build.require(
+ cc.has_header_symbol('vulkan/vulkan_core.h', 'VK_VERSION_1_3',
+ include_directories: vulkan_headers_inc,
+ dependencies: vulkan_headers),
+ error_message: 'vulkan.h was not found on the system, nor inside ' +
+ '`3rdparty/Vulkan-Headers`. Please run `git submodule update --init` ' +
+ 'followed by `meson --wipe`.')
+components.set('vulkan', vulkan_build.allowed())
+
+vulkan_link = vulkan_link.require(vulkan_loader.found() and vulkan_build.allowed())
+components.set('vk-proc-addr', vulkan_link.allowed())
+
+build_deps += vulkan_headers
+
+if vulkan_build.allowed()
+ sources += [
+ 'vulkan/command.c',
+ 'vulkan/context.c',
+ 'vulkan/formats.c',
+ 'vulkan/gpu.c',
+ 'vulkan/gpu_buf.c',
+ 'vulkan/gpu_tex.c',
+ 'vulkan/gpu_pass.c',
+ 'vulkan/malloc.c',
+ 'vulkan/swapchain.c',
+ 'vulkan/utils.c',
+ ]
+
+ datadir = get_option('prefix') / get_option('datadir')
+ sources += custom_target('utils_gen.c',
+ input: 'utils_gen.py',
+ output: 'utils_gen.c',
+ command: [python, '@INPUT@', datadir, registry_xml, '@OUTPUT@'],
+ env: python_env,
+ )
+
+ if vulkan_link.allowed()
+ build_deps += vulkan_loader
+ tests += 'vulkan.c'
+ endif
+else
+ sources += 'vulkan/stubs.c'
+endif
diff --git a/src/vulkan/stubs.c b/src/vulkan/stubs.c
new file mode 100644
index 0000000..0c0738e
--- /dev/null
+++ b/src/vulkan/stubs.c
@@ -0,0 +1,108 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../common.h"
+#include "log.h"
+
+#include <libplacebo/vulkan.h>
+
+const struct pl_vk_inst_params pl_vk_inst_default_params = {0};
+const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS };
+
+pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params)
+{
+ pl_fatal(log, "libplacebo compiled without Vulkan support!");
+ return NULL;
+}
+
+void pl_vk_inst_destroy(pl_vk_inst *pinst)
+{
+ pl_vk_inst inst = *pinst;
+ pl_assert(!inst);
+}
+
+pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params)
+{
+ pl_fatal(log, "libplacebo compiled without Vulkan support!");
+ return NULL;
+}
+
+void pl_vulkan_destroy(pl_vulkan *pvk)
+{
+ pl_vulkan vk = *pvk;
+ pl_assert(!vk);
+}
+
+pl_vulkan pl_vulkan_get(pl_gpu gpu)
+{
+ return NULL;
+}
+
+VkPhysicalDevice pl_vulkan_choose_device(pl_log log,
+ const struct pl_vulkan_device_params *params)
+{
+ pl_err(log, "libplacebo compiled without Vulkan support!");
+ return NULL;
+}
+
+pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk,
+ const struct pl_vulkan_swapchain_params *params)
+{
+ pl_unreachable();
+}
+
+bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw)
+{
+ pl_unreachable();
+}
+
+pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params)
+{
+ pl_fatal(log, "libplacebo compiled without Vulkan support!");
+ return NULL;
+}
+
+pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params)
+{
+ pl_unreachable();
+}
+
+VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex,
+ VkFormat *out_format, VkImageUsageFlags *out_flags)
+{
+ pl_unreachable();
+}
+
+bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params)
+{
+ pl_unreachable();
+}
+
+void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params)
+{
+ pl_unreachable();
+}
+
+VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params)
+{
+ pl_unreachable();
+}
+
+void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore)
+{
+ pl_unreachable();
+}
diff --git a/src/vulkan/swapchain.c b/src/vulkan/swapchain.c
new file mode 100644
index 0000000..0741fbf
--- /dev/null
+++ b/src/vulkan/swapchain.c
@@ -0,0 +1,911 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "command.h"
+#include "formats.h"
+#include "utils.h"
+#include "gpu.h"
+#include "swapchain.h"
+#include "pl_thread.h"
+
+struct sem_pair {
+ VkSemaphore in;
+ VkSemaphore out;
+};
+
+struct priv {
+ struct pl_sw_fns impl;
+
+ pl_mutex lock;
+ struct vk_ctx *vk;
+ VkSurfaceKHR surf;
+ PL_ARRAY(VkSurfaceFormatKHR) formats;
+
+ // current swapchain and metadata:
+ struct pl_vulkan_swapchain_params params;
+ VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype
+ VkSwapchainKHR swapchain;
+ int cur_width, cur_height;
+ int swapchain_depth;
+ pl_rc_t frames_in_flight; // number of frames currently queued
+ bool suboptimal; // true once VK_SUBOPTIMAL_KHR is returned
+ bool needs_recreate; // swapchain needs to be recreated
+ struct pl_color_repr color_repr;
+ struct pl_color_space color_space;
+ struct pl_hdr_metadata hdr_metadata;
+
+ // state of the images:
+ PL_ARRAY(pl_tex) images; // pl_tex wrappers for the VkImages
+ PL_ARRAY(struct sem_pair) sems; // pool of semaphores used to synchronize images
+ int idx_sems; // index of next free semaphore pair
+ int last_imgidx; // the image index last acquired (for submit)
+};
+
+static const struct pl_sw_fns vulkan_swapchain;
+
+static bool map_color_space(VkColorSpaceKHR space, struct pl_color_space *out)
+{
+ switch (space) {
+ // Note: This is technically against the spec, but more often than not
+ // it's the correct result since `SRGB_NONLINEAR` is just a catch-all
+ // for any sort of typical SDR curve, which is better approximated by
+ // `pl_color_space_monitor`.
+ case VK_COLOR_SPACE_SRGB_NONLINEAR_KHR:
+ *out = pl_color_space_monitor;
+ return true;
+
+ case VK_COLOR_SPACE_BT709_NONLINEAR_EXT:
+ *out = pl_color_space_monitor;
+ return true;
+ case VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_DISPLAY_P3,
+ .transfer = PL_COLOR_TRC_BT_1886,
+ };
+ return true;
+ case VK_COLOR_SPACE_DCI_P3_LINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_DCI_P3,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ };
+ return true;
+ case VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_DCI_P3,
+ .transfer = PL_COLOR_TRC_BT_1886,
+ };
+ return true;
+ case VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT:
+ case VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT:
+ // TODO
+ return false;
+ case VK_COLOR_SPACE_BT709_LINEAR_EXT:
+ *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_BT_709,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ };
+ return true;
+ case VK_COLOR_SPACE_BT2020_LINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ };
+ return true;
+ case VK_COLOR_SPACE_HDR10_ST2084_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_PQ,
+ };
+ return true;
+ case VK_COLOR_SPACE_DOLBYVISION_EXT:
+ // Unlikely to ever be implemented
+ return false;
+ case VK_COLOR_SPACE_HDR10_HLG_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_HLG,
+ };
+ return true;
+ case VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_ADOBE,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ };
+ return true;
+ case VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_ADOBE,
+ .transfer = PL_COLOR_TRC_GAMMA22,
+ };
+ return true;
+ case VK_COLOR_SPACE_PASS_THROUGH_EXT:
+ *out = pl_color_space_unknown;
+ return true;
+
+#ifdef VK_AMD_display_native_hdr
+ case VK_COLOR_SPACE_DISPLAY_NATIVE_AMD:
+ // TODO
+ return false;
+#endif
+
+ default: return false;
+ }
+}
+
+static bool pick_surf_format(pl_swapchain sw, const struct pl_color_space *hint)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+ pl_gpu gpu = sw->gpu;
+
+    int best_score = 0, best_id = 0;
+ bool wide_gamut = pl_color_primaries_is_wide_gamut(hint->primaries);
+ bool prefer_hdr = pl_color_transfer_is_hdr(hint->transfer);
+
+ for (int i = 0; i < p->formats.num; i++) {
+ // Color space / format whitelist
+ struct pl_color_space space;
+ if (!map_color_space(p->formats.elem[i].colorSpace, &space))
+ continue;
+
+ bool disable10 = !pl_color_transfer_is_hdr(space.transfer) &&
+ p->params.disable_10bit_sdr;
+
+ switch (p->formats.elem[i].format) {
+ // Only accept floating point formats for linear curves
+ case VK_FORMAT_R16G16B16_SFLOAT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
+ case VK_FORMAT_R32G32B32_SFLOAT:
+ case VK_FORMAT_R32G32B32A32_SFLOAT:
+ case VK_FORMAT_R64G64B64_SFLOAT:
+ case VK_FORMAT_R64G64B64A64_SFLOAT:
+ if (space.transfer == PL_COLOR_TRC_LINEAR)
+ break; // accept
+ continue;
+
+ // Only accept 8 bit for non-HDR curves
+ case VK_FORMAT_R8G8B8_UNORM:
+ case VK_FORMAT_B8G8R8_UNORM:
+ case VK_FORMAT_R8G8B8A8_UNORM:
+ case VK_FORMAT_B8G8R8A8_UNORM:
+ case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+ if (!pl_color_transfer_is_hdr(space.transfer))
+ break; // accept
+ continue;
+
+ // Only accept 10 bit formats for non-linear curves
+ case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+ case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+ if (space.transfer != PL_COLOR_TRC_LINEAR && !disable10)
+ break; // accept
+ continue;
+
+ // Accept 16-bit formats for everything
+ case VK_FORMAT_R16G16B16_UNORM:
+ case VK_FORMAT_R16G16B16A16_UNORM:
+ if (!disable10)
+ break; // accept
+ continue;
+
+ default: continue;
+ }
+
+ // Make sure we can wrap this format to a meaningful, valid pl_fmt
+ for (int n = 0; n < gpu->num_formats; n++) {
+ pl_fmt plfmt = gpu->formats[n];
+ const struct vk_format **pvkfmt = PL_PRIV(plfmt);
+ if ((*pvkfmt)->tfmt != p->formats.elem[i].format)
+ continue;
+
+ enum pl_fmt_caps render_caps = 0;
+ render_caps |= PL_FMT_CAP_RENDERABLE;
+ render_caps |= PL_FMT_CAP_BLITTABLE;
+ if ((plfmt->caps & render_caps) != render_caps)
+ continue;
+
+ // format valid, use it if it has a higher score
+ int score = 0;
+ for (int c = 0; c < 3; c++)
+ score += plfmt->component_depth[c];
+ if (pl_color_primaries_is_wide_gamut(space.primaries) == wide_gamut)
+ score += 1000;
+ if (space.primaries == hint->primaries)
+ score += 2000;
+ if (pl_color_transfer_is_hdr(space.transfer) == prefer_hdr)
+ score += 10000;
+ if (space.transfer == hint->transfer)
+ score += 20000;
+
+ switch (plfmt->type) {
+ case PL_FMT_UNKNOWN: break;
+ case PL_FMT_UINT: break;
+ case PL_FMT_SINT: break;
+ case PL_FMT_UNORM: score += 500; break;
+ case PL_FMT_SNORM: score += 400; break;
+ case PL_FMT_FLOAT: score += 300; break;
+ case PL_FMT_TYPE_COUNT: pl_unreachable();
+ };
+
+ if (score > best_score) {
+ best_score = score;
+ best_id = i;
+ break;
+ }
+ }
+ }
+
+ if (!best_score) {
+ PL_ERR(vk, "Failed picking any valid, renderable surface format!");
+ return false;
+ }
+
+ VkSurfaceFormatKHR new_sfmt = p->formats.elem[best_id];
+ if (p->protoInfo.imageFormat != new_sfmt.format ||
+ p->protoInfo.imageColorSpace != new_sfmt.colorSpace)
+ {
+ PL_INFO(vk, "Picked surface configuration %d: %s + %s", best_id,
+ vk_fmt_name(new_sfmt.format),
+ vk_csp_name(new_sfmt.colorSpace));
+
+ p->protoInfo.imageFormat = new_sfmt.format;
+ p->protoInfo.imageColorSpace = new_sfmt.colorSpace;
+ p->needs_recreate = true;
+ }
+
+ return true;
+}
+
+static void set_hdr_metadata(struct priv *p, const struct pl_hdr_metadata *metadata)
+{
+ struct vk_ctx *vk = p->vk;
+ if (!vk->SetHdrMetadataEXT)
+ return;
+
+ // Whitelist only values that we support signalling metadata for
+ struct pl_hdr_metadata fix = {
+ .prim = metadata->prim,
+ .min_luma = metadata->min_luma,
+ .max_luma = metadata->max_luma,
+ .max_cll = metadata->max_cll,
+ .max_fall = metadata->max_fall,
+ };
+
+ // Ignore no-op changes
+ if (pl_hdr_metadata_equal(&fix, &p->hdr_metadata))
+ return;
+
+ // Remember the metadata so we can re-apply it after swapchain recreation
+ p->hdr_metadata = fix;
+
+ // Ignore HDR metadata requests for SDR swapchains
+ if (!pl_color_transfer_is_hdr(p->color_space.transfer))
+ return;
+
+ if (!p->swapchain)
+ return;
+
+ vk->SetHdrMetadataEXT(vk->dev, 1, &p->swapchain, &(VkHdrMetadataEXT) {
+ .sType = VK_STRUCTURE_TYPE_HDR_METADATA_EXT,
+ .displayPrimaryRed = { fix.prim.red.x, fix.prim.red.y },
+ .displayPrimaryGreen = { fix.prim.green.x, fix.prim.green.y },
+ .displayPrimaryBlue = { fix.prim.blue.x, fix.prim.blue.y },
+ .whitePoint = { fix.prim.white.x, fix.prim.white.y },
+ .maxLuminance = fix.max_luma,
+ .minLuminance = fix.min_luma,
+ .maxContentLightLevel = fix.max_cll,
+ .maxFrameAverageLightLevel = fix.max_fall,
+ });
+
+ // Keep track of applied HDR colorimetry metadata
+ p->color_space.hdr = p->hdr_metadata;
+}
+
+pl_swapchain pl_vulkan_create_swapchain(pl_vulkan plvk,
+ const struct pl_vulkan_swapchain_params *params)
+{
+ struct vk_ctx *vk = PL_PRIV(plvk);
+ pl_gpu gpu = plvk->gpu;
+
+ if (!vk->CreateSwapchainKHR) {
+ PL_ERR(gpu, VK_KHR_SWAPCHAIN_EXTENSION_NAME " not enabled!");
+ return NULL;
+ }
+
+ struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv);
+ sw->log = vk->log;
+ sw->gpu = gpu;
+
+ struct priv *p = PL_PRIV(sw);
+ pl_mutex_init(&p->lock);
+ p->impl = vulkan_swapchain;
+ p->params = *params;
+ p->vk = vk;
+ p->surf = params->surface;
+ p->swapchain_depth = PL_DEF(params->swapchain_depth, 3);
+ pl_assert(p->swapchain_depth > 0);
+ atomic_init(&p->frames_in_flight, 0);
+ p->last_imgidx = -1;
+ p->protoInfo = (VkSwapchainCreateInfoKHR) {
+ .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR,
+ .surface = p->surf,
+ .imageArrayLayers = 1, // non-stereoscopic
+ .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ .minImageCount = p->swapchain_depth + 1, // +1 for the FB
+ .presentMode = params->present_mode,
+ .clipped = true,
+ };
+
+ // These fields will be updated by `vk_sw_recreate`
+ p->color_space = pl_color_space_unknown;
+ p->color_repr = (struct pl_color_repr) {
+ .sys = PL_COLOR_SYSTEM_RGB,
+ .levels = PL_COLOR_LEVELS_FULL,
+ .alpha = PL_ALPHA_UNKNOWN,
+ };
+
+ // Make sure the swapchain present mode is supported
+ VkPresentModeKHR *modes = NULL;
+ uint32_t num_modes = 0;
+ VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, NULL));
+ modes = pl_calloc_ptr(NULL, num_modes, modes);
+ VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, modes));
+
+ bool supported = false;
+ for (int i = 0; i < num_modes; i++)
+ supported |= (modes[i] == p->protoInfo.presentMode);
+ pl_free_ptr(&modes);
+
+ if (!supported) {
+ PL_WARN(vk, "Requested swap mode unsupported by this device, falling "
+ "back to VK_PRESENT_MODE_FIFO_KHR");
+ p->protoInfo.presentMode = VK_PRESENT_MODE_FIFO_KHR;
+ }
+
+ // Enumerate the supported surface color spaces
+ uint32_t num_formats = 0;
+ VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, NULL));
+ PL_ARRAY_RESIZE(sw, p->formats, num_formats);
+ VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, p->formats.elem));
+ p->formats.num = num_formats;
+
+ PL_INFO(gpu, "Available surface configurations:");
+ for (int i = 0; i < p->formats.num; i++) {
+ PL_INFO(gpu, " %d: %-40s %s", i,
+ vk_fmt_name(p->formats.elem[i].format),
+ vk_csp_name(p->formats.elem[i].colorSpace));
+ }
+
+ // Ensure there exists at least some valid renderable surface format
+ struct pl_color_space hint = {0};
+ if (!pick_surf_format(sw, &hint))
+ goto error;
+
+ return sw;
+
+error:
+ pl_free(modes);
+ pl_free(sw);
+ return NULL;
+}
+
+static void vk_sw_destroy(pl_swapchain sw)
+{
+ pl_gpu gpu = sw->gpu;
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+
+ pl_gpu_flush(gpu);
+ vk_wait_idle(vk);
+
+ // Vulkan offers no way to know when a queue presentation command is done,
+ // leading to spec-mandated undefined behavior when destroying resources
+ // tied to the swapchain. Use an extra `vkQueueWaitIdle` on all of the
+    // queues we may have outstanding presentation calls on, to hopefully inform
+ // the driver that we want to wait until the device is truly idle.
+ for (int i = 0; i < vk->pool_graphics->num_queues; i++)
+ vk->QueueWaitIdle(vk->pool_graphics->queues[i]);
+
+ for (int i = 0; i < p->images.num; i++)
+ pl_tex_destroy(gpu, &p->images.elem[i]);
+ for (int i = 0; i < p->sems.num; i++) {
+ vk->DestroySemaphore(vk->dev, p->sems.elem[i].in, PL_VK_ALLOC);
+ vk->DestroySemaphore(vk->dev, p->sems.elem[i].out, PL_VK_ALLOC);
+ }
+
+ vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC);
+ pl_mutex_destroy(&p->lock);
+ pl_free((void *) sw);
+}
+
+static int vk_sw_latency(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ return p->swapchain_depth;
+}
+
+static bool update_swapchain_info(struct priv *p, VkSwapchainCreateInfoKHR *info,
+ int w, int h)
+{
+ struct vk_ctx *vk = p->vk;
+
+ // Query the supported capabilities and update this struct as needed
+ VkSurfaceCapabilitiesKHR caps = {0};
+ VK(vk->GetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, p->surf, &caps));
+
+ // Check for hidden/invisible window
+ if (!caps.currentExtent.width || !caps.currentExtent.height) {
+        PL_DEBUG(vk, "currentExtent reported as 0x0, hidden window? skipping");
+ return false;
+ }
+
+ // Sorted by preference
+ static const struct { VkCompositeAlphaFlagsKHR vk_mode;
+ enum pl_alpha_mode pl_mode;
+ } alphaModes[] = {
+ {VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, PL_ALPHA_INDEPENDENT},
+ {VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR, PL_ALPHA_PREMULTIPLIED},
+ {VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, PL_ALPHA_UNKNOWN},
+ {VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR, PL_ALPHA_UNKNOWN},
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(alphaModes); i++) {
+ if (caps.supportedCompositeAlpha & alphaModes[i].vk_mode) {
+ info->compositeAlpha = alphaModes[i].vk_mode;
+ p->color_repr.alpha = alphaModes[i].pl_mode;
+ PL_DEBUG(vk, "Requested alpha compositing mode: %s",
+ vk_alpha_mode(info->compositeAlpha));
+ break;
+ }
+ }
+
+ if (!info->compositeAlpha) {
+ PL_ERR(vk, "Failed picking alpha compositing mode (caps: 0x%x)",
+ caps.supportedCompositeAlpha);
+ goto error;
+ }
+
+ // Note: We could probably also allow picking a surface transform that
+ // flips the framebuffer and set `pl_swapchain_frame.flipped`, but this
+ // doesn't appear to be necessary for any vulkan implementations.
+ static const VkSurfaceTransformFlagsKHR rotModes[] = {
+ VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR,
+ VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR,
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(rotModes); i++) {
+ if (caps.supportedTransforms & rotModes[i]) {
+ info->preTransform = rotModes[i];
+ PL_DEBUG(vk, "Requested surface transform: %s",
+ vk_surface_transform(info->preTransform));
+ break;
+ }
+ }
+
+ if (!info->preTransform) {
+ PL_ERR(vk, "Failed picking surface transform mode (caps: 0x%x)",
+ caps.supportedTransforms);
+ goto error;
+ }
+
+ // Image count as required
+ PL_DEBUG(vk, "Requested image count: %d (min %d max %d)",
+ (int) info->minImageCount, (int) caps.minImageCount,
+ (int) caps.maxImageCount);
+
+ info->minImageCount = PL_MAX(info->minImageCount, caps.minImageCount);
+ if (caps.maxImageCount)
+ info->minImageCount = PL_MIN(info->minImageCount, caps.maxImageCount);
+
+ PL_DEBUG(vk, "Requested image size: %dx%d (min %dx%d < cur %dx%d < max %dx%d)",
+ w, h, caps.minImageExtent.width, caps.minImageExtent.height,
+ caps.currentExtent.width, caps.currentExtent.height,
+ caps.maxImageExtent.width, caps.maxImageExtent.height);
+
+ // Default the requested size based on the reported extent
+ if (caps.currentExtent.width != 0xFFFFFFFF)
+ w = PL_DEF(w, caps.currentExtent.width);
+ if (caps.currentExtent.height != 0xFFFFFFFF)
+ h = PL_DEF(h, caps.currentExtent.height);
+
+ // Otherwise, re-use the existing size if available
+ w = PL_DEF(w, info->imageExtent.width);
+ h = PL_DEF(h, info->imageExtent.height);
+
+ if (!w || !h) {
+ PL_ERR(vk, "Failed resizing swapchain: unknown size?");
+ goto error;
+ }
+
+ // Clamp the extent based on the supported limits
+ w = PL_CLAMP(w, caps.minImageExtent.width, caps.maxImageExtent.width);
+ h = PL_CLAMP(h, caps.minImageExtent.height, caps.maxImageExtent.height);
+ info->imageExtent = (VkExtent2D) { w, h };
+
+ // We just request whatever makes sense, and let the pl_vk decide what
+ // pl_tex_params that translates to. That said, we still need to intersect
+ // the swapchain usage flags with the format usage flags
+ VkImageUsageFlags req_flags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+ VkImageUsageFlags opt_flags = VK_IMAGE_USAGE_STORAGE_BIT;
+
+ info->imageUsage = caps.supportedUsageFlags & (req_flags | opt_flags);
+ VkFormatProperties fmtprop = {0};
+ vk->GetPhysicalDeviceFormatProperties(vk->physd, info->imageFormat, &fmtprop);
+
+#define CHECK(usage, feature) \
+ if (!((fmtprop.optimalTilingFeatures & VK_FORMAT_FEATURE_##feature##_BIT))) \
+ info->imageUsage &= ~VK_IMAGE_USAGE_##usage##_BIT
+
+ CHECK(COLOR_ATTACHMENT, COLOR_ATTACHMENT);
+ CHECK(TRANSFER_DST, TRANSFER_DST);
+ CHECK(STORAGE, STORAGE_IMAGE);
+
+ if ((info->imageUsage & req_flags) != req_flags) {
+ PL_ERR(vk, "The swapchain doesn't support rendering and blitting!");
+ goto error;
+ }
+
+ return true;
+
+error:
+ return false;
+}
+
+static void destroy_swapchain(struct vk_ctx *vk, void *swapchain)
+{
+ vk->DestroySwapchainKHR(vk->dev, vk_unwrap_handle(swapchain), PL_VK_ALLOC);
+}
+
+static bool vk_sw_recreate(pl_swapchain sw, int w, int h)
+{
+ pl_gpu gpu = sw->gpu;
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+
+ VkImage *vkimages = NULL;
+ uint32_t num_images = 0;
+
+ if (!update_swapchain_info(p, &p->protoInfo, w, h))
+ return false;
+
+ VkSwapchainCreateInfoKHR sinfo = p->protoInfo;
+#ifdef VK_EXT_full_screen_exclusive
+ // Explicitly disallow full screen exclusive mode if possible
+ static const VkSurfaceFullScreenExclusiveInfoEXT fsinfo = {
+ .sType = VK_STRUCTURE_TYPE_SURFACE_FULL_SCREEN_EXCLUSIVE_INFO_EXT,
+ .fullScreenExclusive = VK_FULL_SCREEN_EXCLUSIVE_DISALLOWED_EXT,
+ };
+ if (vk->AcquireFullScreenExclusiveModeEXT)
+ vk_link_struct(&sinfo, &fsinfo);
+#endif
+
+ p->suboptimal = false;
+ p->needs_recreate = false;
+ p->cur_width = sinfo.imageExtent.width;
+ p->cur_height = sinfo.imageExtent.height;
+
+ PL_DEBUG(sw, "(Re)creating swapchain of size %dx%d",
+ sinfo.imageExtent.width,
+ sinfo.imageExtent.height);
+
+#ifdef PL_HAVE_UNIX
+ if (vk->props.vendorID == VK_VENDOR_ID_NVIDIA) {
+ vk->DeviceWaitIdle(vk->dev);
+ vk_wait_idle(vk);
+ }
+#endif
+
+ // Calling `vkCreateSwapchainKHR` puts sinfo.oldSwapchain into a retired
+ // state whether the call succeeds or not, so we always need to garbage
+ // collect it afterwards - asynchronously as it may still be in use
+ sinfo.oldSwapchain = p->swapchain;
+ p->swapchain = VK_NULL_HANDLE;
+ VkResult res = vk->CreateSwapchainKHR(vk->dev, &sinfo, PL_VK_ALLOC, &p->swapchain);
+ vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, vk_wrap_handle(sinfo.oldSwapchain));
+ PL_VK_ASSERT(res, "vk->CreateSwapchainKHR(...)");
+
+ // Get the new swapchain images
+ VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, NULL));
+ vkimages = pl_calloc_ptr(NULL, num_images, vkimages);
+ VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, vkimages));
+
+ for (int i = 0; i < num_images; i++)
+ PL_VK_NAME(IMAGE, vkimages[i], "swapchain");
+
+ // If needed, allocate some more semaphores
+ while (num_images > p->sems.num) {
+ VkSemaphore sem_in = VK_NULL_HANDLE, sem_out = VK_NULL_HANDLE;
+ static const VkSemaphoreCreateInfo seminfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ };
+ VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_in));
+ VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_out));
+ PL_VK_NAME(SEMAPHORE, sem_in, "swapchain in");
+ PL_VK_NAME(SEMAPHORE, sem_out, "swapchain out");
+
+ PL_ARRAY_APPEND(sw, p->sems, (struct sem_pair) {
+ .in = sem_in,
+ .out = sem_out,
+ });
+ }
+
+ // Recreate the pl_tex wrappers
+ for (int i = 0; i < p->images.num; i++)
+ pl_tex_destroy(gpu, &p->images.elem[i]);
+ p->images.num = 0;
+
+ for (int i = 0; i < num_images; i++) {
+ const VkExtent2D *ext = &sinfo.imageExtent;
+ pl_tex tex = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+ .image = vkimages[i],
+ .width = ext->width,
+ .height = ext->height,
+ .format = sinfo.imageFormat,
+ .usage = sinfo.imageUsage,
+ ));
+ if (!tex)
+ goto error;
+ PL_ARRAY_APPEND(sw, p->images, tex);
+ }
+
+ pl_assert(num_images > 0);
+ int bits = 0;
+
+ // The channel with the most bits is probably the most authoritative about
+ // the actual color information (consider e.g. a2bgr10). Slight downside
+ // in that it results in rounding r/b for e.g. rgb565, but we don't pick
+ // surfaces with fewer than 8 bits anyway, so let's not care for now.
+ pl_fmt fmt = p->images.elem[0]->params.format;
+ for (int i = 0; i < fmt->num_components; i++)
+ bits = PL_MAX(bits, fmt->component_depth[i]);
+
+ p->color_repr.bits.sample_depth = bits;
+ p->color_repr.bits.color_depth = bits;
+
+ // Note: `p->color_space.hdr` is (re-)applied by `set_hdr_metadata`
+ map_color_space(sinfo.imageColorSpace, &p->color_space);
+
+ // Forcibly re-apply HDR metadata, bypassing the no-op check
+ struct pl_hdr_metadata metadata = p->hdr_metadata;
+ p->hdr_metadata = pl_hdr_metadata_empty;
+ set_hdr_metadata(p, &metadata);
+
+ pl_free(vkimages);
+ return true;
+
+error:
+ PL_ERR(vk, "Failed (re)creating swapchain!");
+ pl_free(vkimages);
+ vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC);
+ p->swapchain = VK_NULL_HANDLE;
+ p->cur_width = p->cur_height = 0;
+ return false;
+}
+
+static bool vk_sw_start_frame(pl_swapchain sw,
+ struct pl_swapchain_frame *out_frame)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+ pl_mutex_lock(&p->lock);
+
+ bool recreate = !p->swapchain || p->needs_recreate;
+ if (p->suboptimal && !p->params.allow_suboptimal)
+ recreate = true;
+
+ if (recreate && !vk_sw_recreate(sw, 0, 0)) {
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ VkSemaphore sem_in = p->sems.elem[p->idx_sems].in;
+ PL_TRACE(vk, "vkAcquireNextImageKHR signals 0x%"PRIx64, (uint64_t) sem_in);
+
+ for (int attempts = 0; attempts < 2; attempts++) {
+ uint32_t imgidx = 0;
+ VkResult res = vk->AcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX,
+ sem_in, VK_NULL_HANDLE, &imgidx);
+
+ switch (res) {
+ case VK_SUBOPTIMAL_KHR:
+ p->suboptimal = true;
+ // fall through
+ case VK_SUCCESS:
+ p->last_imgidx = imgidx;
+ pl_vulkan_release_ex(sw->gpu, pl_vulkan_release_params(
+ .tex = p->images.elem[imgidx],
+ .layout = VK_IMAGE_LAYOUT_UNDEFINED,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ .semaphore = { sem_in },
+ ));
+ *out_frame = (struct pl_swapchain_frame) {
+ .fbo = p->images.elem[imgidx],
+ .flipped = false,
+ .color_repr = p->color_repr,
+ .color_space = p->color_space,
+ };
+ // keep lock held
+ return true;
+
+ case VK_ERROR_OUT_OF_DATE_KHR: {
+ // In these cases try recreating the swapchain
+ if (!vk_sw_recreate(sw, 0, 0)) {
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+ continue;
+ }
+
+ default:
+ PL_ERR(vk, "Failed acquiring swapchain image: %s", vk_res_str(res));
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+ }
+
+ // If we've exhausted the number of attempts to recreate the swapchain,
+ // just give up silently and let the user retry some time later.
+ pl_mutex_unlock(&p->lock);
+ return false;
+}
+
+static void present_cb(struct priv *p, void *arg)
+{
+ (void) pl_rc_deref(&p->frames_in_flight);
+}
+
+static bool vk_sw_submit_frame(pl_swapchain sw)
+{
+ pl_gpu gpu = sw->gpu;
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+ pl_assert(p->last_imgidx >= 0);
+ pl_assert(p->swapchain);
+ uint32_t idx = p->last_imgidx;
+ VkSemaphore sem_out = p->sems.elem[p->idx_sems++].out;
+ p->idx_sems %= p->sems.num;
+ p->last_imgidx = -1;
+
+ bool held = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ .tex = p->images.elem[idx],
+ .layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ .semaphore = { sem_out },
+ ));
+
+ if (!held) {
+ PL_ERR(gpu, "Failed holding swapchain image for presentation");
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ struct vk_cmd *cmd = pl_vk_steal_cmd(gpu);
+ if (!cmd) {
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ pl_rc_ref(&p->frames_in_flight);
+ vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL);
+ if (!vk_cmd_submit(&cmd)) {
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ struct vk_cmdpool *pool = vk->pool_graphics;
+ int qidx = pool->idx_queues;
+ VkQueue queue = pool->queues[qidx];
+
+ vk_rotate_queues(p->vk);
+ vk_malloc_garbage_collect(vk->ma);
+
+ VkPresentInfoKHR pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
+ .waitSemaphoreCount = 1,
+ .pWaitSemaphores = &sem_out,
+ .swapchainCount = 1,
+ .pSwapchains = &p->swapchain,
+ .pImageIndices = &idx,
+ };
+
+ PL_TRACE(vk, "vkQueuePresentKHR waits on 0x%"PRIx64, (uint64_t) sem_out);
+ vk->lock_queue(vk->queue_ctx, pool->qf, qidx);
+ VkResult res = vk->QueuePresentKHR(queue, &pinfo);
+ vk->unlock_queue(vk->queue_ctx, pool->qf, qidx);
+ pl_mutex_unlock(&p->lock);
+
+ switch (res) {
+ case VK_SUBOPTIMAL_KHR:
+ p->suboptimal = true;
+ // fall through
+ case VK_SUCCESS:
+ return true;
+
+ case VK_ERROR_OUT_OF_DATE_KHR:
+ // We can silently ignore this error, since the next start_frame will
+ // recreate the swapchain automatically.
+ return true;
+
+ default:
+ PL_ERR(vk, "Failed presenting to queue %p: %s", (void *) queue,
+ vk_res_str(res));
+ return false;
+ }
+}
+
+static void vk_sw_swap_buffers(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+
+ pl_mutex_lock(&p->lock);
+ while (pl_rc_count(&p->frames_in_flight) >= p->swapchain_depth) {
+ pl_mutex_unlock(&p->lock); // don't hold mutex while blocking
+ vk_poll_commands(p->vk, UINT64_MAX);
+ pl_mutex_lock(&p->lock);
+ }
+ pl_mutex_unlock(&p->lock);
+}
+
+static bool vk_sw_resize(pl_swapchain sw, int *width, int *height)
+{
+ struct priv *p = PL_PRIV(sw);
+ bool ok = true;
+
+ pl_mutex_lock(&p->lock);
+
+ bool width_changed = *width && *width != p->cur_width,
+ height_changed = *height && *height != p->cur_height;
+
+ if (p->suboptimal || p->needs_recreate || width_changed || height_changed)
+ ok = vk_sw_recreate(sw, *width, *height);
+
+ *width = p->cur_width;
+ *height = p->cur_height;
+
+ pl_mutex_unlock(&p->lock);
+ return ok;
+}
+
+static void vk_sw_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp)
+{
+ struct priv *p = PL_PRIV(sw);
+ pl_mutex_lock(&p->lock);
+
+ // This should never fail if the swapchain already exists
+ bool ok = pick_surf_format(sw, csp);
+ set_hdr_metadata(p, &csp->hdr);
+ pl_assert(ok);
+
+ pl_mutex_unlock(&p->lock);
+}
+
+bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ return p->suboptimal;
+}
+
+static const struct pl_sw_fns vulkan_swapchain = {
+ .destroy = vk_sw_destroy,
+ .latency = vk_sw_latency,
+ .resize = vk_sw_resize,
+ .colorspace_hint = vk_sw_colorspace_hint,
+ .start_frame = vk_sw_start_frame,
+ .submit_frame = vk_sw_submit_frame,
+ .swap_buffers = vk_sw_swap_buffers,
+};
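From the caller's perspective, the pl_sw_fns table above is reached through the generic pl_swapchain_* wrappers declared in libplacebo/swapchain.h. A hedged sketch of the per-frame flow these callbacks end up servicing might look like this (`draw_one_frame` is just an illustrative name, and the actual rendering step is elided):

    #include <libplacebo/swapchain.h>

    // Hypothetical frame loop body; real rendering happens where the comment is.
    static bool draw_one_frame(pl_swapchain sw, int width, int height)
    {
        // Propagate window size changes (dispatches to vk_sw_resize above)
        if (!pl_swapchain_resize(sw, &width, &height))
            return false;

        struct pl_swapchain_frame frame;
        if (!pl_swapchain_start_frame(sw, &frame))
            return false; // e.g. hidden/minimized window; try again later

        // ... render into frame.fbo, honoring frame.color_space / color_repr ...

        if (!pl_swapchain_submit_frame(sw))
            return false;

        // Blocks until fewer than `swapchain_depth` frames remain in flight
        pl_swapchain_swap_buffers(sw);
        return true;
    }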
diff --git a/src/vulkan/utils.c b/src/vulkan/utils.c
new file mode 100644
index 0000000..914f9e4
--- /dev/null
+++ b/src/vulkan/utils.c
@@ -0,0 +1,181 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "utils.h"
+
+VkExternalMemoryHandleTypeFlagBitsKHR
+vk_mem_handle_type(enum pl_handle_type handle_type)
+{
+ if (!handle_type)
+ return 0;
+
+ switch (handle_type) {
+ case PL_HANDLE_FD:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+ case PL_HANDLE_WIN32:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR;
+ case PL_HANDLE_WIN32_KMT:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+ case PL_HANDLE_DMA_BUF:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
+ case PL_HANDLE_HOST_PTR:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ return 0;
+ }
+
+ pl_unreachable();
+}
+
+VkExternalSemaphoreHandleTypeFlagBitsKHR
+vk_sync_handle_type(enum pl_handle_type handle_type)
+{
+ if (!handle_type)
+ return 0;
+
+ switch (handle_type) {
+ case PL_HANDLE_FD:
+ return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+ case PL_HANDLE_WIN32:
+ return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR;
+ case PL_HANDLE_WIN32_KMT:
+ return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+ case PL_HANDLE_DMA_BUF:
+ case PL_HANDLE_HOST_PTR:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ return 0;
+ }
+
+ pl_unreachable();
+}
+
+bool vk_external_mem_check(struct vk_ctx *vk,
+ const VkExternalMemoryPropertiesKHR *props,
+ enum pl_handle_type handle_type,
+ bool import)
+{
+ VkExternalMemoryFeatureFlagsKHR flags = props->externalMemoryFeatures;
+ VkExternalMemoryHandleTypeFlagBitsKHR vk_handle = vk_mem_handle_type(handle_type);
+
+ if (import) {
+ if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR)) {
+ PL_DEBUG(vk, "Handle type %s (0x%x) is not importable",
+ vk_handle_name(vk_handle), (unsigned int) handle_type);
+ return false;
+ }
+ } else {
+ if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR)) {
+ PL_DEBUG(vk, "Handle type %s (0x%x) is not exportable",
+ vk_handle_name(vk_handle), (unsigned int) handle_type);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+const enum pl_handle_type vk_mem_handle_list[] = {
+ PL_HANDLE_HOST_PTR,
+#ifdef PL_HAVE_UNIX
+ PL_HANDLE_FD,
+ PL_HANDLE_DMA_BUF,
+#endif
+#ifdef PL_HAVE_WIN32
+ PL_HANDLE_WIN32,
+ PL_HANDLE_WIN32_KMT,
+#endif
+ 0
+};
+
+const enum pl_handle_type vk_sync_handle_list[] = {
+#ifdef PL_HAVE_UNIX
+ PL_HANDLE_FD,
+#endif
+#ifdef PL_HAVE_WIN32
+ PL_HANDLE_WIN32,
+ PL_HANDLE_WIN32_KMT,
+#endif
+ 0
+};
+
+const void *vk_find_struct(const void *chain, VkStructureType stype)
+{
+ const VkBaseInStructure *in = chain;
+ while (in) {
+ if (in->sType == stype)
+ return in;
+
+ in = in->pNext;
+ }
+
+ return NULL;
+}
+
+void vk_link_struct(void *chain, const void *in)
+{
+ if (!in)
+ return;
+
+ VkBaseOutStructure *out = chain;
+ while (out->pNext)
+ out = out->pNext;
+
+ out->pNext = (void *) in;
+}
+
+void *vk_struct_memdup(void *alloc, const void *pin)
+{
+ if (!pin)
+ return NULL;
+
+ const VkBaseInStructure *in = pin;
+ size_t size = vk_struct_size(in->sType);
+ pl_assert(size);
+
+ VkBaseOutStructure *out = pl_memdup(alloc, in, size);
+ out->pNext = NULL;
+ return out;
+}
+
+void *vk_chain_memdup(void *alloc, const void *pin)
+{
+ if (!pin)
+ return NULL;
+
+ const VkBaseInStructure *in = pin;
+ VkBaseOutStructure *out = vk_struct_memdup(alloc, in);
+ pl_assert(out);
+
+ out->pNext = vk_chain_memdup(alloc, in->pNext);
+ return out;
+}
+
+void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype)
+{
+ for (VkBaseOutStructure *out = chain;; out = out->pNext) {
+ if (out->sType == stype)
+ return out;
+ if (!out->pNext) {
+ VkBaseOutStructure *s = pl_zalloc(alloc, vk_struct_size(stype));
+ s->sType = stype;
+ out->pNext = s;
+ return s;
+ }
+ }
+}
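To make the chain helpers above concrete, here is a small hypothetical snippet showing vk_link_struct, vk_find_struct and vk_chain_alloc operating on an ordinary pNext chain; the buffer/external-memory structs are arbitrary examples, not taken from this patch:

    #include "utils.h"

    static void demo_pnext_chain(void *alloc)
    {
        VkBufferCreateInfo binfo = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        };

        VkExternalMemoryBufferCreateInfo ext = {
            .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO,
            .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
        };

        // Append `ext` to the end of binfo's pNext chain
        vk_link_struct(&binfo, &ext);

        // Look it up again; returns NULL if the sType is absent
        const VkExternalMemoryBufferCreateInfo *found =
            vk_find_struct(&binfo, VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO);
        pl_assert(found == &ext);

        // vk_chain_alloc would only allocate (from `alloc`) if the struct were
        // missing; here it simply returns the one that is already linked
        VkExternalMemoryBufferCreateInfo *reused =
            vk_chain_alloc(alloc, &binfo, VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO);
        pl_assert(reused == &ext);
    }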
diff --git a/src/vulkan/utils.h b/src/vulkan/utils.h
new file mode 100644
index 0000000..cb1c5f5
--- /dev/null
+++ b/src/vulkan/utils.h
@@ -0,0 +1,136 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Return a human-readable name for various vulkan enums
+const char *vk_res_str(VkResult res);
+const char *vk_fmt_name(VkFormat fmt);
+const char *vk_csp_name(VkColorSpaceKHR csp);
+const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle);
+const char *vk_obj_type(VkObjectType obj);
+const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha);
+const char *vk_surface_transform(VkSurfaceTransformFlagsKHR transform);
+
+// Return the size of an arbitrary vulkan struct. Returns 0 for unknown structs
+size_t vk_struct_size(VkStructureType stype);
+
+// Returns the vulkan API version which a given extension was promoted to, or 0
+// if the extension is not promoted.
+uint32_t vk_ext_promoted_ver(const char *extension);
+
+// Enum translation boilerplate
+VkExternalMemoryHandleTypeFlagBitsKHR vk_mem_handle_type(enum pl_handle_type);
+VkExternalSemaphoreHandleTypeFlagBitsKHR vk_sync_handle_type(enum pl_handle_type);
+
+// Bitmask of all access flags that imply a read/write operation, respectively
+extern const VkAccessFlags2 vk_access_read;
+extern const VkAccessFlags2 vk_access_write;
+
+// Check for compatibility of a VkExternalMemoryProperties
+bool vk_external_mem_check(struct vk_ctx *vk,
+ const VkExternalMemoryPropertiesKHR *props,
+ enum pl_handle_type handle_type,
+ bool check_import);
+
+// Static lists of external handle types we should try probing for
+extern const enum pl_handle_type vk_mem_handle_list[];
+extern const enum pl_handle_type vk_sync_handle_list[];
+
+// Find a structure in a pNext chain, or NULL
+const void *vk_find_struct(const void *chain, VkStructureType stype);
+
+// Link a structure into a pNext chain
+void vk_link_struct(void *chain, const void *in);
+
+// Make a copy of a structure, not including the pNext chain
+void *vk_struct_memdup(void *alloc, const void *in);
+
+// Make a deep copy of an entire pNext chain
+void *vk_chain_memdup(void *alloc, const void *in);
+
+// Find a structure in a pNext chain, or allocate + link it if absent.
+void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype);
+
+// Renormalize input features into a state consistent for a given API version.
+// If `api_ver` is specified as 0, *both* meta-structs and extension structs
+// will be emitted. Note: `out` should be initialized by the user. In
+// particular, if it already contains a valid features chain, then this
+// function will effectively act as a union.
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *in,
+ uint32_t api_ver, VkPhysicalDeviceFeatures2 *out);
+
+// Convenience macros to simplify a lot of common boilerplate
+#define PL_VK_ASSERT(res, str) \
+ do { \
+ if (res != VK_SUCCESS) { \
+ PL_ERR(vk, str ": %s (%s:%d)", \
+ vk_res_str(res), __FILE__, __LINE__); \
+ goto error; \
+ } \
+ } while (0)
+
+#define VK(cmd) \
+ do { \
+ PL_TRACE(vk, #cmd); \
+ VkResult _res = (cmd); \
+ PL_VK_ASSERT(_res, #cmd); \
+ } while (0)
+
+#define PL_VK_NAME(type, obj, name) \
+ do { \
+ if (vk->SetDebugUtilsObjectNameEXT) { \
+ vk->SetDebugUtilsObjectNameEXT(vk->dev, &(VkDebugUtilsObjectNameInfoEXT) { \
+ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, \
+ .objectType = VK_OBJECT_TYPE_##type, \
+ .objectHandle = (uint64_t) (obj), \
+ .pObjectName = (name), \
+ }); \
+ } \
+ } while (0)
+
+// Variant of PL_VK_NAME for dispatchable handles
+#define PL_VK_NAME_HANDLE(type, obj, name) \
+ PL_VK_NAME(type, (uintptr_t) (obj), name)
+
+// Helper functions to wrap and unwrap non-dispatchable handles into pointers.
+// Note that wrap/unwrap must always be used linearly.
+#if VK_USE_64_BIT_PTR_DEFINES == 1
+#define vk_wrap_handle(h) (h)
+#define vk_unwrap_handle(h) (h)
+#elif UINTPTR_MAX >= UINT64_MAX
+#define vk_wrap_handle(h) ((void *) (uintptr_t) (h))
+#define vk_unwrap_handle(h) ((uint64_t) (uintptr_t) (h))
+#else
+static inline void *vk_wrap_handle(uint64_t h)
+{
+ uint64_t *wrapper = malloc(sizeof(h));
+ assert(wrapper);
+ *wrapper = h;
+ return wrapper;
+}
+
+static inline uint64_t vk_unwrap_handle(void *h)
+{
+ uint64_t *wrapper = h;
+ uint64_t ret = *wrapper;
+ free(wrapper);
+ return ret;
+}
+#endif
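The VK() and PL_VK_NAME macros assume a `struct vk_ctx *vk` and an `error:` label in the enclosing scope, which every caller in this patch provides. A minimal hypothetical caller (`make_semaphore` is a made-up name) illustrating that pattern:

    // Sketch only: shows the expected VK()/PL_VK_NAME boilerplate around a call.
    static bool make_semaphore(struct vk_ctx *vk, VkSemaphore *out)
    {
        VkSemaphore sem = VK_NULL_HANDLE;
        const VkSemaphoreCreateInfo seminfo = {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
        };

        VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem));
        PL_VK_NAME(SEMAPHORE, sem, "example semaphore");

        *out = sem;
        return true;

    error: // VK() jumps here on any non-VK_SUCCESS result
        return false;
    }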
diff --git a/src/vulkan/utils_gen.c.j2 b/src/vulkan/utils_gen.c.j2
new file mode 100644
index 0000000..6db0454
--- /dev/null
+++ b/src/vulkan/utils_gen.c.j2
@@ -0,0 +1,137 @@
+#define VK_ENABLE_BETA_EXTENSIONS
+#include "vulkan/utils.h"
+
+const char *vk_res_str(VkResult res)
+{
+ switch (res) {
+{% for res in vkresults %}
+ case {{ res }}: return "{{ res }}";
+{% endfor %}
+
+ default: return "unknown error";
+ }
+}
+
+const char *vk_fmt_name(VkFormat fmt)
+{
+ switch (fmt) {
+{% for fmt in vkformats %}
+ case {{ fmt }}: return "{{ fmt }}";
+{% endfor %}
+
+ default: return "unknown format";
+ }
+}
+
+const char *vk_csp_name(VkColorSpaceKHR csp)
+{
+ switch (csp) {
+{% for csp in vkspaces %}
+ case {{ csp }}: return "{{ csp }}";
+{% endfor %}
+
+ default: return "unknown color space";
+ }
+}
+
+const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle)
+{
+ switch (handle) {
+{% for handle in vkhandles %}
+ case {{ handle }}: return "{{ handle }}";
+{% endfor %}
+
+ default: return "unknown handle type";
+ }
+}
+
+const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha)
+{
+ switch (alpha) {
+{% for mode in vkalphas %}
+ case {{ mode }}: return "{{ mode }}";
+{% endfor %}
+
+ default: return "unknown alpha mode";
+ }
+}
+
+const char *vk_surface_transform(VkSurfaceTransformFlagsKHR tf)
+{
+ switch (tf) {
+{% for tf in vktransforms %}
+ case {{ tf }}: return "{{ tf }}";
+{% endfor %}
+
+ default: return "unknown surface transform";
+ }
+}
+
+
+const char *vk_obj_type(VkObjectType obj)
+{
+ switch (obj) {
+{% for obj in vkobjects %}
+ case {{ obj.enum }}: return "{{ obj.name }}";
+{% endfor %}
+
+ default: return "unknown object";
+ }
+}
+
+size_t vk_struct_size(VkStructureType stype)
+{
+ switch (stype) {
+{% for struct in vkstructs %}
+ case {{ struct.stype }}: return sizeof({{ struct.name }});
+{% endfor %}
+
+ default: return 0;
+ }
+}
+
+uint32_t vk_ext_promoted_ver(const char *extension)
+{
+{% for ext in vkexts %}
+{% if ext.promoted_ver %}
+ if (!strcmp(extension, "{{ ext.name }}"))
+ return {{ ext.promoted_ver }};
+{% endif %}
+{% endfor %}
+ return 0;
+}
+
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *fin,
+ uint32_t api_ver, VkPhysicalDeviceFeatures2 *out)
+{
+ for (const VkBaseInStructure *in = (void *) fin; in; in = in->pNext) {
+ switch (in->sType) {
+ default: break;
+{% for fs in vkfeatures %}
+ case {{ fs.stype }}: {
+ const {{ fs.name }} *i = (const void *) in;
+{% for f in fs.features %}
+ if (i->{{ f.name }}) {
+{% for r in f.replacements %}
+{% if r.core_ver %}
+ if (!api_ver || api_ver >= {{ r.core_ver }})
+{% elif r.max_ver %}
+ if (!api_ver || api_ver < {{ r.max_ver }})
+{% endif %}
+{% if fs.is_base %}
+ out->{{ f.name }} = true;
+{% else %}
+ (({{ r.name }} *) vk_chain_alloc(alloc, out, {{ r.stype }}))->{{ f.name }} = true;
+{% endif %}
+{% endfor %}
+ }
+{% endfor %}
+ break;
+ }
+{% endfor %}
+ }
+ }
+}
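+
+// Illustration (hedged, not generated): for a feature present both in a core
+// struct and in its pre-promotion extension struct, the generated code
+// re-emits it into whichever struct matches `api_ver`. E.g. a set
+// `timelineSemaphore` bit lands in VkPhysicalDeviceVulkan12Features on
+// API >= 1.2 and in VkPhysicalDeviceTimelineSemaphoreFeatures below that,
+// with the target struct appended to `out`'s pNext chain via vk_chain_alloc().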
+
+const VkAccessFlags2 vk_access_read = {{ '0x%x' % vkaccess.read }}LLU;
+const VkAccessFlags2 vk_access_write = {{ '0x%x' % vkaccess.write }}LLU;
diff --git a/src/vulkan/utils_gen.py b/src/vulkan/utils_gen.py
new file mode 100644
index 0000000..a8652fd
--- /dev/null
+++ b/src/vulkan/utils_gen.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+#
+# This file is part of libplacebo.
+#
+# libplacebo is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# libplacebo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+
+import os.path
+import re
+import sys
+import xml.etree.ElementTree as ET
+
+try:
+ import jinja2
+except ModuleNotFoundError:
+ print('Module \'jinja2\' not found, please install \'python3-Jinja2\' or '
+ 'an equivalent package on your system! Alternatively, run '
+ '`git submodule update --init` followed by `meson --wipe`.',
+ file=sys.stderr)
+ sys.exit(1)
+
+TEMPLATE = jinja2.Environment(
+ loader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__)),
+ trim_blocks=True,
+).get_template('utils_gen.c.j2')
+
+class Obj(object):
+ def __init__(self, **kwargs):
+ self.__dict__.update(kwargs)
+
+class VkXML(ET.ElementTree):
+ def blacklist_block(self, req):
+ for t in req.iterfind('type'):
+ self.blacklist_types.add(t.attrib['name'])
+ for e in req.iterfind('enum'):
+ self.blacklist_enums.add(e.attrib['name'])
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.blacklist_types = set()
+ self.blacklist_enums = set()
+
+ for f in self.iterfind('feature'):
+ # Feature block for non-Vulkan API
+ if not 'vulkan' in f.attrib['api'].split(','):
+ for r in f.iterfind('require'):
+ self.blacklist_block(r)
+
+ for e in self.iterfind('extensions/extension'):
+ # Entire extension is not supported on vulkan, or is platform-specific
+ if not 'vulkan' in e.attrib['supported'].split(',') or 'platform' in e.attrib:
+ for r in e.iterfind('require'):
+ self.blacklist_block(r)
+ continue
+
+ # Only individual <require> blocks are API-specific
+ for r in e.iterfind('require[@api]'):
+ if not 'vulkan' in r.attrib['api'].split(','):
+ self.blacklist_block(r)
+
+ def findall_enum(self, name):
+ for e in self.iterfind('enums[@name="{0}"]/enum'.format(name)):
+ if not 'alias' in e.attrib:
+ if not e.attrib['name'] in self.blacklist_enums:
+ yield e
+ for e in self.iterfind('.//enum[@extends="{0}"]'.format(name)):
+ if not 'alias' in e.attrib:
+ if not e.attrib['name'] in self.blacklist_enums:
+ yield e
+
+ def findall_type(self, category):
+ for t in self.iterfind('types/type[@category="{0}"]'.format(category)):
+ name = t.attrib.get('name') or t.find('name').text
+ if name in self.blacklist_types:
+ continue
+ yield t
+
+
+def get_vkenum(registry, enum):
+ for e in registry.findall_enum(enum):
+ yield e.attrib['name']
+
+def get_vkobjects(registry):
+ for t in registry.findall_type('handle'):
+ if 'objtypeenum' in t.attrib:
+ yield Obj(enum = t.attrib['objtypeenum'],
+ name = t.find('name').text)
+
+def get_vkstructs(registry):
+ for t in registry.findall_type('struct'):
+ stype = None
+ for m in t.iterfind('member'):
+ if m.find('name').text == 'sType':
+ stype = m
+ break
+
+ if stype is not None and 'values' in stype.attrib:
+ yield Obj(stype = stype.attrib['values'],
+ name = t.attrib['name'])
+
+def get_vkaccess(registry):
+ access = Obj(read = 0, write = 0)
+ for e in registry.findall_enum('VkAccessFlagBits2'):
+ if '_READ_' in e.attrib['name']:
+ access.read |= 1 << int(e.attrib['bitpos'])
+ if '_WRITE_' in e.attrib['name']:
+ access.write |= 1 << int(e.attrib['bitpos'])
+ return access
+
+def get_vkexts(registry):
+ for e in registry.iterfind('extensions/extension'):
+ promoted_ver = None
+ if res := re.match(r'VK_VERSION_(\d)_(\d)', e.attrib.get('promotedto', '')):
+ promoted_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2])
+ yield Obj(name = e.attrib['name'],
+ promoted_ver = promoted_ver)
+
+def get_vkfeatures(registry):
+ structs = []
+ featuremap = {} # feature name -> [structs containing it]
+ for t in registry.findall_type('struct'):
+ sname = t.attrib['name']
+ is_base = sname == 'VkPhysicalDeviceFeatures'
+ extends = t.attrib.get('structextends', [])
+ if is_base:
+ sname = 'VkPhysicalDeviceFeatures2'
+ stype = 'VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2'
+ elif not 'VkPhysicalDeviceFeatures2' in extends:
+ continue
+
+ features = []
+ for f in t.iterfind('member'):
+ if f.find('type').text == 'VkStructureType':
+ stype = f.attrib['values']
+ elif f.find('type').text == 'VkBool32':
+ fname = f.find('name').text
+ if is_base:
+ fname = 'features.' + fname
+ features.append(Obj(name = fname))
+
+ core_ver = None
+ if res := re.match(r'VkPhysicalDeviceVulkan(\d)(\d)Features', sname):
+ core_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2])
+
+ struct = Obj(name = sname,
+ stype = stype,
+ core_ver = core_ver,
+ is_base = is_base,
+ features = features)
+
+ structs.append(struct)
+ for f in features:
+ featuremap.setdefault(f.name, []).append(struct)
+
+ for s in structs:
+ for f in s.features:
+ f.replacements = featuremap[f.name]
+ core_ver = next(( r.core_ver for r in f.replacements if r.core_ver ), None)
+ for r in f.replacements:
+ if not r.core_ver:
+ r.max_ver = core_ver
+
+ yield from structs
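+
+# Worked example (illustrative): 'timelineSemaphore' appears both in
+# VkPhysicalDeviceVulkan12Features (core_ver = VK_API_VERSION_1_2) and in
+# VkPhysicalDeviceTimelineSemaphoreFeatures (no core_ver, so its max_ver
+# becomes VK_API_VERSION_1_2). The template then emits the core struct for
+# sufficiently new api_ver and the standalone extension struct otherwise.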
+
+def find_registry_xml(datadir):
+ registry_paths = [
+ '{0}/vulkan/registry/vk.xml'.format(datadir),
+ '$MINGW_PREFIX/share/vulkan/registry/vk.xml',
+ '%VULKAN_SDK%/share/vulkan/registry/vk.xml',
+ '$VULKAN_SDK/share/vulkan/registry/vk.xml',
+ '/usr/share/vulkan/registry/vk.xml',
+ ]
+
+ for p in registry_paths:
+ path = os.path.expandvars(p)
+ if os.path.isfile(path):
+ print('Found vk.xml: {0}'.format(path))
+ return path
+
+ print('Could not find the vulkan registry (vk.xml), please specify its '
+ 'location manually using the -Dvulkan-registry=/path/to/vk.xml '
+ 'option!', file=sys.stderr)
+ sys.exit(1)
+
+if __name__ == '__main__':
+ assert len(sys.argv) == 4
+ datadir = sys.argv[1]
+ xmlfile = sys.argv[2]
+ outfile = sys.argv[3]
+
+ if not xmlfile:
+ xmlfile = find_registry_xml(datadir)
+
+ registry = VkXML(ET.parse(xmlfile))
+ with open(outfile, 'w') as f:
+ f.write(TEMPLATE.render(
+ vkresults = get_vkenum(registry, 'VkResult'),
+ vkformats = get_vkenum(registry, 'VkFormat'),
+ vkspaces = get_vkenum(registry, 'VkColorSpaceKHR'),
+ vkhandles = get_vkenum(registry, 'VkExternalMemoryHandleTypeFlagBits'),
+ vkalphas = get_vkenum(registry, 'VkCompositeAlphaFlagBitsKHR'),
+ vktransforms = get_vkenum(registry, 'VkSurfaceTransformFlagBitsKHR'),
+ vkobjects = get_vkobjects(registry),
+ vkstructs = get_vkstructs(registry),
+ vkaccess = get_vkaccess(registry),
+ vkexts = get_vkexts(registry),
+ vkfeatures = get_vkfeatures(registry),
+ ))
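+
+# Example invocation (illustrative; the build system normally drives this):
+#   utils_gen.py <datadir> <path to vk.xml, or ''> <output .c file>
+# e.g.
+#   python3 utils_gen.py /usr/share vk.xml utils_gen.c
+# Passing an empty xml path falls back to find_registry_xml(datadir).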